From 3a6820fa65410661c52cfe5a3133d7537c21e9da Mon Sep 17 00:00:00 2001
From: Scott Todd
Date: Tue, 15 Oct 2024 13:07:29 -0700
Subject: [PATCH] Refactor matmul test suite. (#22)

Progress on https://github.com/iree-org/iree-test-suites/issues/2. See also the long [Discord thread here](https://discord.com/channels/689900678990135345/1270451599231156266).

## Summaries of changes

### Further decoupled test suites from the core CMake project

* Forked `iree_native_test.cmake` to `iree_test_suites_native_test.cmake`
  * Dropped support (temporarily?) for testing on Android, RISC-V, and ARM with SME
* Forked `iree_e2e_generated_runner_test.cmake` to `iree_test_suites_runner_test.cmake`
  * Dropped support (temporarily?) for filtering, within the build system, which tests are defined and which .vmfb files are compiled
* Now we can set `-DIREE_BUILD_TESTS=OFF` and avoid pulling in IREE's other tests
* Added a new hand-authored `linalg_ops/matmul/CMakeLists.txt` that runs tests on each backend using default flags

### Simplified the test generator

* Dropped unused functions
* Folded GPU-specific shapes into the generic "small" and "large" shape test suites

### Ran the `generate_e2e_matmul_tests.py` script offline and checked in the generated files

* Currently 56 files totaling 1.90MB on disk (~27000 lines of code according to GitHub)
* Now we can inspect the test cases without needing to run the generator locally, and I fixed a few formatting issues along the way
* I think this makes test suite management easier, and having the generated files in this test suites repository doesn't cost the main repository much (just extra `git checkout` time), but I could see a case for more tightly coupling the generator with the test runner

## What is left to do?

* I want to iterate some more on the `linalg_ops/matmul/CMakeLists.txt` file or move to a different test runner somehow. I mainly want to support XFAIL in some way for both compiling and running.
* We should add back tests using CPU features like AVX512, GPU features like Vulkan float16 extensions, and other non-default flags somehow. Either infer what the compiler can support from the host / target, or add test suites explicitly.
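
For reference, a single test case in the new hand-authored file looks roughly like this. This is a minimal sketch; the target name and paths are illustrative rather than copied verbatim from `linalg_ops/matmul/CMakeLists.txt`:

```cmake
# Minimal sketch of one test case declaration (illustrative name and paths):
# compiles the checked-in test + calls MLIR files for llvm-cpu and runs them
# through the matmul test runner on the local-task driver.
iree_test_suites_runner_test(
  NAME
    matmul_llvm-cpu_local-task_f32_into_f32_small
  TESTS_SRC
    "generated/f32_into_f32/matmul_f32_into_f32_small.mlir"
  CALLS_SRC
    "generated/f32_into_f32/matmul_f32_into_f32_small_calls.mlir"
  TEST_RUNNER
    iree-test-suites_iree-e2e-matmul-test
  TARGET_BACKEND
    "llvm-cpu"
  DRIVER
    "local-task"
)
```

Each such call compiles the tests and calls `.mlir` files to `.vmfb` modules with `--iree-hal-target-backends=<backend>`, then registers a CTest case that invokes the runner with `--device=<driver>` and both `--module=` flags. The checked-in `.mlir` files can be refreshed by re-running `linalg_ops/matmul/generate_test_mlir_files.sh` (added in this PR).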
--- linalg_ops/CMakeLists.txt | 8 +- .../iree_e2e_generated_runner_test.cmake | 502 ---- linalg_ops/iree_test_suites_native_test.cmake | 114 + linalg_ops/iree_test_suites_runner_test.cmake | 100 + linalg_ops/matmul/CMakeLists.txt | 2229 ++--------------- .../matmul/generate_e2e_matmul_tests.py | 92 +- linalg_ops/matmul/generate_test_mlir_files.sh | 89 + .../matmul_bf16_into_bf16_large.mlir | 136 + .../matmul_bf16_into_bf16_large_calls.mlir | 882 +++++++ .../matmul_bf16_into_bf16_small.mlir | 99 + .../matmul_bf16_into_bf16_small_calls.mlir | 906 +++++++ ...tmul_transpose_b_bf16_into_bf16_large.mlir | 136 + ...ranspose_b_bf16_into_bf16_large_calls.mlir | 882 +++++++ ...tmul_transpose_b_bf16_into_bf16_small.mlir | 99 + ...ranspose_b_bf16_into_bf16_small_calls.mlir | 906 +++++++ .../matmul_bf16_into_f32_large.mlir | 136 + .../matmul_bf16_into_f32_large_calls.mlir | 882 +++++++ .../matmul_bf16_into_f32_small.mlir | 99 + .../matmul_bf16_into_f32_small_calls.mlir | 906 +++++++ ...atmul_transpose_b_bf16_into_f32_large.mlir | 136 + ...transpose_b_bf16_into_f32_large_calls.mlir | 882 +++++++ ...atmul_transpose_b_bf16_into_f32_small.mlir | 99 + ...transpose_b_bf16_into_f32_small_calls.mlir | 906 +++++++ .../matmul_f16_into_f16_large.mlir | 136 + .../matmul_f16_into_f16_large_calls.mlir | 882 +++++++ .../matmul_f16_into_f16_small.mlir | 99 + .../matmul_f16_into_f16_small_calls.mlir | 906 +++++++ ...matmul_transpose_b_f16_into_f16_large.mlir | 136 + ..._transpose_b_f16_into_f16_large_calls.mlir | 882 +++++++ ...matmul_transpose_b_f16_into_f16_small.mlir | 99 + ..._transpose_b_f16_into_f16_small_calls.mlir | 906 +++++++ .../matmul_f16_into_f32_large.mlir | 136 + .../matmul_f16_into_f32_large_calls.mlir | 882 +++++++ .../matmul_f16_into_f32_small.mlir | 99 + .../matmul_f16_into_f32_small_calls.mlir | 906 +++++++ ...matmul_transpose_b_f16_into_f32_large.mlir | 136 + ..._transpose_b_f16_into_f32_large_calls.mlir | 882 +++++++ ...matmul_transpose_b_f16_into_f32_small.mlir | 99 + ..._transpose_b_f16_into_f32_small_calls.mlir | 906 +++++++ .../matmul_f32_into_f32_large.mlir | 136 + .../matmul_f32_into_f32_large_calls.mlir | 882 +++++++ .../matmul_f32_into_f32_small.mlir | 99 + .../matmul_f32_into_f32_small_calls.mlir | 906 +++++++ ...matmul_transpose_b_f32_into_f32_large.mlir | 136 + ..._transpose_b_f32_into_f32_large_calls.mlir | 882 +++++++ ...matmul_transpose_b_f32_into_f32_small.mlir | 99 + ..._transpose_b_f32_into_f32_small_calls.mlir | 906 +++++++ .../matmul_f8E4M3FNUZ_into_f32_large.mlir | 172 ++ ...atmul_f8E4M3FNUZ_into_f32_large_calls.mlir | 882 +++++++ .../matmul_f8E4M3FNUZ_into_f32_small.mlir | 131 + ...atmul_f8E4M3FNUZ_into_f32_small_calls.mlir | 906 +++++++ ...transpose_b_f8E4M3FNUZ_into_f32_large.mlir | 172 ++ ...ose_b_f8E4M3FNUZ_into_f32_large_calls.mlir | 882 +++++++ ...transpose_b_f8E4M3FNUZ_into_f32_small.mlir | 131 + ...ose_b_f8E4M3FNUZ_into_f32_small_calls.mlir | 906 +++++++ .../i8_into_i32/matmul_i8_into_i32_large.mlir | 136 + .../matmul_i8_into_i32_large_calls.mlir | 882 +++++++ .../i8_into_i32/matmul_i8_into_i32_small.mlir | 99 + .../matmul_i8_into_i32_small_calls.mlir | 906 +++++++ .../matmul_transpose_b_i8_into_i32_large.mlir | 136 + ...l_transpose_b_i8_into_i32_large_calls.mlir | 882 +++++++ .../matmul_transpose_b_i8_into_i32_small.mlir | 99 + ...l_transpose_b_i8_into_i32_small_calls.mlir | 906 +++++++ 63 files changed, 28985 insertions(+), 2607 deletions(-) delete mode 100644 linalg_ops/iree_e2e_generated_runner_test.cmake create mode 100644 
linalg_ops/iree_test_suites_native_test.cmake create mode 100644 linalg_ops/iree_test_suites_runner_test.cmake create mode 100755 linalg_ops/matmul/generate_test_mlir_files.sh create mode 100644 linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_large.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_small.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_small_calls.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_large.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_small.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_small_calls.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_large.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_small.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_small_calls.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_large.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_small.mlir create mode 100644 linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_small_calls.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_large.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_small.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_small_calls.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_large.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_small.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_small_calls.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_large.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_small.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_small_calls.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_large.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_small.mlir create mode 100644 linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_small_calls.mlir create mode 100644 linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_large.mlir create mode 100644 
linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_small.mlir create mode 100644 linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_small_calls.mlir create mode 100644 linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_large.mlir create mode 100644 linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_small.mlir create mode 100644 linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_small_calls.mlir create mode 100644 linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_large.mlir create mode 100644 linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_small.mlir create mode 100644 linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_small_calls.mlir create mode 100644 linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_large.mlir create mode 100644 linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_small.mlir create mode 100644 linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_small_calls.mlir create mode 100644 linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_large.mlir create mode 100644 linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_small.mlir create mode 100644 linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_small_calls.mlir create mode 100644 linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_large.mlir create mode 100644 linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_large_calls.mlir create mode 100644 linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_small.mlir create mode 100644 linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_small_calls.mlir diff --git a/linalg_ops/CMakeLists.txt b/linalg_ops/CMakeLists.txt index 969e9e5..08cf318 100644 --- a/linalg_ops/CMakeLists.txt +++ b/linalg_ops/CMakeLists.txt @@ -29,10 +29,7 @@ set(IREE_PACKAGE_ROOT_DIR "${CMAKE_CURRENT_LIST_DIR}") set(IREE_PACKAGE_ROOT_PREFIX "iree-test-suites") set(IREE_BUILD_COMPILER OFF) set(IREE_BUILD_SAMPLES OFF) -# We should also be able to set -DIREE_BUILD_TESTS=OFF, but this currently -# depends on the core project's CMake functions like iree_native_test and -# iree_bytecode_module. 
-set(IREE_BUILD_TESTS ON) +set(IREE_BUILD_TESTS OFF) if(IREE_USE_LOCAL_REPO) message(STATUS "Using IREE repo at path '${IREE_LOCAL_REPO_PATH}'") @@ -122,6 +119,7 @@ iree_cc_binary( #------------------------------------------------------------------------------- list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}) -include(iree_e2e_generated_runner_test) +include(iree_test_suites_native_test) +include(iree_test_suites_runner_test) add_subdirectory(matmul) diff --git a/linalg_ops/iree_e2e_generated_runner_test.cmake b/linalg_ops/iree_e2e_generated_runner_test.cmake deleted file mode 100644 index a99409a..0000000 --- a/linalg_ops/iree_e2e_generated_runner_test.cmake +++ /dev/null @@ -1,502 +0,0 @@ -# Copyright 2021 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -include(CMakeParseArguments) - -# iree_e2e_runner_test() -# -# Creates a test using a specified test runner program for the specified -# test files. -# -# Parameters: -# NAME: Name of the target -# TEST_TYPE: Type of test (Currently, matmul and conv2d are supported). -# VARIANT_NAME: Variant name to suffix NAME with. -# Will reuse the same TEST_TYPE/calls vmfb files. -# TESTS_SRC: mlir source file with TEST_TYPE to be compiled to an IREE module. -# TESTS_VMFB: specifies the path to use for the generated IREE module. -# CALLS_SRC: mlir source file with calls to be compiled to an IREE module. -# CALLS_VMFB: specifies the path to use for the generated IREE module. -# TARGET_BACKEND: target backend to compile for. -# DRIVER: driver to run the module with. -# COMPILER_FLAGS: additional flags to pass to the compiler. Bytecode output -# format and backend flags are passed automatically. -# RUNNER_ARGS: additional args to pass to the trace-runner program. The driver -# and input file flags are passed automatically. -# LABELS: Additional labels to apply to the test. The package path and -# "driver=${DRIVER}" are added automatically. -# TEST_RUNNER: trace-runner program to run. -# TARGET_CPU_FEATURES: If specified, a string passed as argument to -# --iree-llvmcpu-target-cpu-features. -# TEST_DEFINED: Whether to define a test target. -# TEST_DISABLED: The test target will be skipped and its status will be -# 'Not Run'. -function(iree_e2e_runner_test) - if(NOT IREE_BUILD_TESTS) - return() - endif() - - # See comment in iree_check_test about this condition. 
- if(NOT IREE_BUILD_COMPILER AND NOT IREE_HOST_BIN_DIR) - return() - endif() - - cmake_parse_arguments( - _RULE - "" - "NAME;TEST_TYPE;VARIANT_NAME;TESTS_SRC;TESTS_VMFB;CALLS_SRC;CALLS_VMFB;TRACE;TARGET_BACKEND;DRIVER;TEST_RUNNER;TEST_DEFINED;TEST_DISABLED" - "COMPILER_FLAGS;RUNNER_ARGS;LABELS;TARGET_CPU_FEATURES" - ${ARGN} - ) - - iree_is_bytecode_module_test_excluded_by_labels(_EXCLUDED_BY_LABELS "${_RULE_LABELS}") - if(_EXCLUDED_BY_LABELS) - return() - endif() - - iree_package_name(_PACKAGE_NAME) - set(_NAME "${_PACKAGE_NAME}_${_RULE_NAME}") - - set(_BASE_COMPILER_FLAGS - "--iree-hal-target-backends=${_RULE_TARGET_BACKEND}" - ) - if (_RULE_TARGET_CPU_FEATURES) - list(APPEND _BASE_COMPILER_FLAGS "--iree-llvmcpu-target-cpu-features=${_RULE_TARGET_CPU_FEATURES}") - endif() - - if(NOT TARGET "${_NAME}_${_RULE_TEST_TYPE}_module") - iree_bytecode_module( - NAME - "${_RULE_NAME}_${_RULE_TEST_TYPE}_module" - MODULE_FILE_NAME - "${_RULE_TESTS_VMFB}" - SRC - "${_RULE_TESTS_SRC}" - FLAGS - "${_BASE_COMPILER_FLAGS}" - "${_RULE_COMPILER_FLAGS}" - ) - endif() - - if(NOT TARGET "${_NAME}_calls_module") - iree_bytecode_module( - NAME - "${_RULE_NAME}_calls_module" - MODULE_FILE_NAME - "${_RULE_CALLS_VMFB}" - SRC - "${_RULE_CALLS_SRC}" - FLAGS - "${_BASE_COMPILER_FLAGS}" - "${_RULE_COMPILER_FLAGS}" - ) - endif() - - # A target specifically for the test. We could combine this with the above, - # but we want that one to get pulled into iree_bytecode_module. - add_custom_target("${_NAME}${_RULE_VARIANT_NAME}" ALL) - add_dependencies( - "${_NAME}${_RULE_VARIANT_NAME}" - "${_NAME}_${_RULE_TEST_TYPE}_module" - "${_NAME}_calls_module" - "${_RULE_TEST_RUNNER}" - ) - - add_dependencies(iree-test-suites-linalg-ops-deps "${_NAME}${_RULE_VARIANT_NAME}") - - if(_RULE_TEST_DEFINED) - iree_native_test( - NAME - "${_RULE_NAME}${_RULE_VARIANT_NAME}" - DRIVER - "${_RULE_DRIVER}" - SRC - "${_RULE_TEST_RUNNER}" - DATA - ${_TESTS_VMFB} - ${_CALLS_VMFB} - ARGS - "--module={{${_TESTS_VMFB}}}" - "--module={{${_CALLS_VMFB}}}" - ${_RULE_RUNNER_ARGS} - LABELS - ${_RULE_LABELS} - DISABLED - ${_RULE_TEST_DISABLED} - ) - endif() -endfunction() - -# iree_single_backend_e2e_runner_test() -# -# Parameters: -# NAME: Name of the target -# TEST_TYPE: Type of test (Currently, matmul and conv are supported). -# GENERATOR: Program (at the moment, must be Python3) to run to generate the -# source file (and possibly a trace file and module path). It will be -# invoked with the following standard flags, in addition to GENERATOR_ARGS: -# --output_${TEST_TYPE}_mlir=${CMAKE_CURRENT_BINARY_DIR}/name_${TEST_TYPE}.mlir -# --output_calls_mlir=${CMAKE_CURRENT_BINARY_DIR}/name_calls.mlir -# and if TARGET_CPU_FEATURES is not empty: -# --requirements=${TARGET_CPU_FEATURES} -# GENERATOR_ARGS: additional args to pass to the generator program. -# TARGET_BACKEND: target backend to compile for. -# DRIVER: driver to run the module with. -# COMPILER_FLAGS: additional flags to pass to the compiler. Bytecode output -# format and backend flags are passed automatically. -# RUNNER_ARGS: additional args to pass to the trace-runner program. The driver -# and input file flags are passed automatically. -# LABELS: Additional labels to apply to the test. The package path and -# "driver=${DRIVER}" are added automatically. -# TEST_RUNNER: trace-runner program to run. -# TARGET_CPU_FEATURES: If specified, a string passed as argument to -# --iree-llvmcpu-target-cpu-features. 
-function(iree_single_backend_e2e_runner_test) - if(NOT IREE_BUILD_TESTS) - return() - endif() - - # Copied from iree_check_test. Refer to the comment there. - if(NOT IREE_BUILD_COMPILER AND NOT IREE_HOST_BIN_DIR) - return() - endif() - - cmake_parse_arguments( - _RULE - "" - "NAME;TEST_TYPE;GENERATOR;TARGET_BACKEND;DRIVER;TEST_RUNNER" - "GENERATOR_ARGS;COMPILER_FLAGS;RUNNER_ARGS;LABELS;TARGET_CPU_FEATURES" - ${ARGN} - ) - - # --------------------------------------------------------------------------- - # Bytecode module builds require - # 1. the compiler, either in the same build or provided in IREE_HOST_BIN_DIR - # 2. compiler support for _RULE_INPUT_TYPE - # 3. compiler support for _RULE_TARGET_BACKEND - set(_BYTECODE_MODULE_BUILD_ENABLED TRUE) - - # 1. Check for the compiler. - if(NOT IREE_BUILD_COMPILER AND NOT IREE_HOST_BIN_DIR) - set(_BYTECODE_MODULE_BUILD_ENABLED FALSE) - endif() - - # 2. Check target backend availability. - # Note: we can only reliably check for this when building the compiler host - # tools from source. If the tools are already built, we assume that all target - # backends are enabled. We could query the tools in the binary directory for - # support dynamically if optionality would be useful. - if(NOT IREE_HOST_BIN_DIR) - string(TOUPPER ${_RULE_TARGET_BACKEND} _UPPERCASE_TARGET_BACKEND) - string(REPLACE "-" "_" _NORMALIZED_TARGET_BACKEND ${_UPPERCASE_TARGET_BACKEND}) - # TODO(scotttodd): allow plugins to provide external backends here - if(NOT DEFINED IREE_TARGET_BACKEND_${_NORMALIZED_TARGET_BACKEND}) - message(SEND_ERROR "Unknown backend '${_RULE_TARGET_BACKEND}'. Check IREE_TARGET_BACKEND_* options.") - endif() - if(NOT IREE_TARGET_BACKEND_${_NORMALIZED_TARGET_BACKEND}) - set(_BYTECODE_MODULE_BUILD_ENABLED FALSE) - endif() - endif() - # --------------------------------------------------------------------------- - - # --------------------------------------------------------------------------- - # Tests are defined if _RULE_DRIVER is defined. - set(_TEST_DEFINED TRUE) - if(NOT DEFINED _RULE_DRIVER) - set(_TEST_DEFINED FALSE) - endif() - - # Test execution requires - # 1. the bytecode module build to be enabled - # 2. _RULE_DRIVER is defined and runtime support is enabled - # 3. no other label exclusions (e.g. 'optonly' test with 'debug' config) - set(_TEST_DISABLED FALSE) - - # 1. Check bytecode module build. - if(NOT _BYTECODE_MODULE_BUILD_ENABLED) - set(_TEST_DISABLED TRUE) - endif() - - # 2. Check driver availability. - if(DEFINED _RULE_DRIVER) - string(TOUPPER ${_RULE_DRIVER} _UPPERCASE_DRIVER) - string(REPLACE "-" "_" _NORMALIZED_DRIVER ${_UPPERCASE_DRIVER}) - if((NOT IREE_HAL_DRIVER_${_NORMALIZED_DRIVER}) AND - (NOT IREE_EXTERNAL_${_NORMALIZED_DRIVER}_HAL_DRIVER_FOUND)) - set(_TEST_DISABLED TRUE) - endif() - endif() - - # 3. Check label exclusions. 
- iree_is_bytecode_module_test_excluded_by_labels(_EXCLUDED_BY_LABELS "${_RULE_LABELS}") - if(_EXCLUDED_BY_LABELS) - set(_TEST_DISABLED TRUE) - endif() - - if((_TEST_DISABLED OR NOT _TEST_DEFINED) AND NOT IREE_BUILD_ALL_CHECK_TEST_MODULES) - set(_BYTECODE_MODULE_BUILD_ENABLED FALSE) - endif() - # --------------------------------------------------------------------------- - - iree_package_name(_PACKAGE_NAME) - set(_NAME "${_PACKAGE_NAME}_${_RULE_NAME}") - - set(_TESTS_SRC "${CMAKE_CURRENT_BINARY_DIR}/${_RULE_NAME}_${_RULE_TEST_TYPE}.mlir") - set(_CALLS_SRC "${CMAKE_CURRENT_BINARY_DIR}/${_RULE_NAME}_calls.mlir") - set(_TESTS_VMFB "${CMAKE_CURRENT_BINARY_DIR}/${_RULE_NAME}_${_RULE_TEST_TYPE}.vmfb") - set(_CALLS_VMFB "${CMAKE_CURRENT_BINARY_DIR}/${_RULE_NAME}_calls.vmfb") - - list(APPEND _GENERATOR_STANDARD_FLAGS "--output_${_RULE_TEST_TYPE}_mlir=${_TESTS_SRC}") - list(APPEND _GENERATOR_STANDARD_FLAGS "--output_calls_mlir=${_CALLS_SRC}") - if(_RULE_TARGET_CPU_FEATURES) - list(APPEND _GENERATOR_STANDARD_FLAGS "--requirements=${_RULE_TARGET_CPU_FEATURES}") - endif() - - if(NOT _BYTECODE_MODULE_BUILD_ENABLED) - return() - endif() - - add_custom_command( - COMMAND - "${Python3_EXECUTABLE}" - "${CMAKE_CURRENT_SOURCE_DIR}/${_RULE_GENERATOR}" - ${_GENERATOR_STANDARD_FLAGS} - ${_RULE_GENERATOR_ARGS} - OUTPUT - ${_TESTS_SRC} - ${_CALLS_SRC} - DEPENDS - ${_RULE_GENERATOR} - ) - - add_custom_target( - "${_NAME}_generated_files" - DEPENDS - ${_TESTS_SRC} - ${_CALLS_SRC} - ) - - # When using the llvm-cpu backend, the runtime build config may need to - # match the compiled executable config using (`--iree-llvmcpu-sanitize=`): - # - # | Runtime type | Compatible with these executable types | - # | -------------------- | -------------------------------------- | - # | Base (no sanitizers) | Base, ASan | - # | ASan | Base, ASan | - # | TSan | TSan (ABI break) | - - # Define the regular test suite, unless the config is llvm-cpu + TSan. - if(NOT _RULE_TARGET_BACKEND STREQUAL "llvm-cpu" OR NOT IREE_ENABLE_TSAN) - iree_e2e_runner_test( - NAME ${_RULE_NAME} - TEST_TYPE ${_RULE_TEST_TYPE} - VARIANT_NAME "" - TESTS_SRC ${_TESTS_SRC} - TESTS_VMFB ${_TESTS_VMFB} - CALLS_SRC ${_CALLS_SRC} - CALLS_VMFB ${_CALLS_VMFB} - TEST_RUNNER ${_RULE_TEST_RUNNER} - TARGET_BACKEND ${_RULE_TARGET_BACKEND} - DRIVER ${_RULE_DRIVER} - COMPILER_FLAGS ${_RULE_COMPILER_FLAGS} - RUNNER_ARGS ${_RULE_RUNNER_ARGS} - LABELS ${_RULE_LABELS} - TARGET_CPU_FEATURES ${_RULE_TARGET_CPU_FEATURES} - TEST_DEFINED ${_TEST_DEFINED} - TEST_DISABLED ${_TEST_DISABLED} - ) - # Note we are relying on the fact that the target created by - # iree_e2e_runner_test is _NAME, even though we passed _RULE_NAME to it, - # i.e. we are relying on the prefixing to be identical. - add_dependencies("${_NAME}" "${_NAME}_generated_files") - endif() - - # Define tests for AddressSanitizer (ASan) and ThreadSanitizer (TSan). - # Normally test suites should do this sort of branching at the leaves rather - # than modify the base CMake function directly, but sanitizers are applied - # at the build system uniformly, so until we decouple the test suites from - # source builds further this felt like a reasonable compromise. 
- if(_RULE_TARGET_BACKEND STREQUAL "llvm-cpu") - if(IREE_ENABLE_ASAN) - set(_ASAN_COMPILER_FLAGS ${_RULE_COMPILER_FLAGS}) - list(APPEND _ASAN_COMPILER_FLAGS "--iree-llvmcpu-link-embedded=false") - list(APPEND _ASAN_COMPILER_FLAGS "--iree-llvmcpu-sanitize=address") - iree_e2e_runner_test( - NAME ${_RULE_NAME} - TEST_TYPE ${_RULE_TEST_TYPE} - VARIANT_NAME "_asan" - TESTS_SRC ${_TESTS_SRC} - TESTS_VMFB ${_TESTS_VMFB} - CALLS_SRC ${_CALLS_SRC} - CALLS_VMFB ${_CALLS_VMFB} - TEST_RUNNER ${_RULE_TEST_RUNNER} - TARGET_BACKEND ${_RULE_TARGET_BACKEND} - DRIVER ${_RULE_DRIVER} - COMPILER_FLAGS ${_ASAN_COMPILER_FLAGS} - RUNNER_ARGS ${_RULE_RUNNER_ARGS} - LABELS ${_RULE_LABELS} - TARGET_CPU_FEATURES ${_RULE_TARGET_CPU_FEATURES} - TEST_DEFINED ${_TEST_DEFINED} - TEST_DISABLED ${_TEST_DISABLED} - ) - # Note we are relying on the fact that the target created by - # iree_e2e_runner_test is _NAME, even though we passed _RULE_NAME to it, - # i.e. we are relying on the prefixing to be identical. - add_dependencies("${_NAME}_asan" "${_NAME}_generated_files") - endif() - - if(IREE_ENABLE_TSAN) - set(_TSAN_COMPILER_FLAGS ${_RULE_COMPILER_FLAGS}) - list(APPEND _TSAN_COMPILER_FLAGS "--iree-llvmcpu-link-embedded=false") - list(APPEND _TSAN_COMPILER_FLAGS "--iree-llvmcpu-sanitize=thread") - iree_e2e_runner_test( - NAME ${_RULE_NAME} - VARIANT_NAME "_tsan" - TESTS_SRC ${_TESTS_SRC} - TESTS_VMFB ${_TESTS_VMFB} - CALLS_SRC ${_CALLS_SRC} - CALLS_VMFB ${_CALLS_VMFB} - TEST_RUNNER ${_RULE_TEST_RUNNER} - TARGET_BACKEND ${_RULE_TARGET_BACKEND} - DRIVER ${_RULE_DRIVER} - COMPILER_FLAGS ${_TSAN_COMPILER_FLAGS} - RUNNER_ARGS ${_RULE_RUNNER_ARGS} - LABELS ${_RULE_LABELS} - TARGET_CPU_FEATURES ${_RULE_TARGET_CPU_FEATURES} - TEST_DEFINED ${_TEST_DEFINED} - TEST_DISABLED ${_TEST_DISABLED} - ) - # Note we are relying on the fact that the target created by - # iree_e2e_runner_test is _NAME, even though we passed _RULE_NAME to it, - # i.e. we are relying on the prefixing to be identical. - add_dependencies("${_NAME}_tsan" "${_NAME}_generated_files") - endif() - endif() -endfunction() - - -# iree_generated_e2e_runner_test() -# -# Creates a set of iree_single_backend_e2e_runner_test's differing -# by target backend and driver. -# -# Mirrors the bzl rule of the same name. -# -# One test is generated per source and backend/driver pair. -# Parameters: -# NAME: Name of the target -# TEST_TYPE: Type of test (Currently, matmul and conv are supported). -# GENERATOR: Program (at the moment, must be Python3) to run to generate the -# source file (and possibly a trace file and module path). It will be -# invoked with the following standard flags, in addition to GENERATOR_ARGS: -# --output_${TEST_TYPE}_mlir=${CMAKE_CURRENT_BINARY_DIR}/name_${TEST_TYPE}.mlir -# --output_calls_mlir=${CMAKE_CURRENT_BINARY_DIR}/name_calls.mlir -# GENERATOR_ARGS: additional args to pass to the generator program. -# TARGET_BACKENDS: backends to compile the module for. These form pairs with -# the DRIVERS argument (due to cmake limitations they are separate list -# arguments). The lengths must exactly match. If no backends or drivers are -# specified, a test will be generated for every supported pair. -# DRIVERS: drivers to run the module with. These form pairs with the -# TARGET_BACKENDS argument (due to cmake limitations they are separate list -# arguments). The lengths must exactly match. If no backends or drivers are -# specified, a test will be generated for every supported pair. -# COMPILER_FLAGS: additional flags to pass to the compiler. 
Bytecode output -# format and backend flags are passed automatically. -# RUNNER_ARGS: additional args to pass to the trace-runner program. The driver -# and input file flags are passed automatically. -# LABELS: Additional labels to apply to the test. The package path and -# "driver=${DRIVER}" are added automatically. -# TEST_RUNNER: trace-runner program to run. -# TARGET_CPU_FEATURES_VARIANTS:list of target cpu features variants. Each -# entry is either "default" for the architecture defaults, or a colon- -# separated triple "arch:name:cpu_features" where "arch" filters -# for a target CPU architecture (in IREE_ARCH format), "name" is a -# short name for the CPU features set (used to generate target names) -# and cpu_features is a comma-separated list of LLVM target attributes -# to enable. Example: -# x86_64:avx2_fma:+avx,+avx2,+fma -function(iree_generated_e2e_runner_test) - if(NOT IREE_BUILD_TESTS) - return() - endif() - - cmake_parse_arguments( - _RULE - "" - "NAME;TEST_TYPE;GENERATOR;TEST_RUNNER" - "TARGET_BACKENDS;DRIVERS;GENERATOR_ARGS;COMPILER_FLAGS;RUNNER_ARGS;LABELS;TARGET_CPU_FEATURES_VARIANTS" - ${ARGN} - ) - - iree_is_bytecode_module_test_excluded_by_labels(_EXCLUDED_BY_LABELS "${_RULE_LABELS}") - if(_EXCLUDED_BY_LABELS) - return() - endif() - - if(_RULE_TARGET_CPU_FEATURES_VARIANTS) - set(_TARGET_CPU_FEATURES_VARIANTS "${_RULE_TARGET_CPU_FEATURES_VARIANTS}") - else() - set(_TARGET_CPU_FEATURES_VARIANTS "default") - endif() - - - if(NOT DEFINED _RULE_TARGET_BACKENDS AND NOT DEFINED _RULE_DRIVERS) - set(_RULE_TARGET_BACKENDS "vmvx" "vulkan-spirv" "llvm-cpu") - set(_RULE_DRIVERS "local-task" "vulkan" "local-task") - endif() - - list(LENGTH _RULE_TARGET_BACKENDS _TARGET_BACKEND_COUNT) - list(LENGTH _RULE_DRIVERS _DRIVER_COUNT) - - if(NOT _TARGET_BACKEND_COUNT EQUAL _DRIVER_COUNT) - message(SEND_ERROR - "TARGET_BACKENDS count ${_TARGET_BACKEND_COUNT} does not match DRIVERS count ${_DRIVER_COUNT}") - endif() - - math(EXPR _MAX_INDEX "${_TARGET_BACKEND_COUNT} - 1") - foreach(_INDEX RANGE "${_MAX_INDEX}") - list(GET _RULE_TARGET_BACKENDS ${_INDEX} _TARGET_BACKEND) - list(GET _RULE_DRIVERS ${_INDEX} _DRIVER) - foreach(_VARIANT_STRING IN LISTS _TARGET_CPU_FEATURES_VARIANTS) - parse_target_cpu_features_variant("${_VARIANT_STRING}" - _ENABLED _TARGET_CPU_FEATURES_NAME _TARGET_CPU_FEATURES) - if(NOT _ENABLED) - # The current entry is disabled on the target CPU architecture. - continue() - endif() - set(_TARGET_CPU_FEATURES_SUFFIX "") - set(_LABELS "${_RULE_LABELS}") - if (_TARGET_CPU_FEATURES_NAME) - set(_TARGET_CPU_FEATURES_SUFFIX "_${_TARGET_CPU_FEATURES_NAME}") - list(APPEND _LABELS "cpu_features=${_TARGET_CPU_FEATURES_NAME}") - endif() - iree_single_backend_e2e_runner_test( - NAME - "${_RULE_NAME}_${_TARGET_BACKEND}_${_DRIVER}${_TARGET_CPU_FEATURES_SUFFIX}" - TEST_TYPE - ${_RULE_TEST_TYPE} - GENERATOR - ${_RULE_GENERATOR} - GENERATOR_ARGS - ${_RULE_GENERATOR_ARGS} - TEST_RUNNER - ${_RULE_TEST_RUNNER} - TARGET_BACKEND - ${_TARGET_BACKEND} - DRIVER - ${_DRIVER} - COMPILER_FLAGS - ${_RULE_COMPILER_FLAGS} - RUNNER_ARGS - ${_RULE_RUNNER_ARGS} - LABELS - ${_LABELS} - TARGET_CPU_FEATURES - ${_TARGET_CPU_FEATURES} - ) - endforeach() - endforeach() -endfunction() diff --git a/linalg_ops/iree_test_suites_native_test.cmake b/linalg_ops/iree_test_suites_native_test.cmake new file mode 100644 index 0000000..a2c3a8f --- /dev/null +++ b/linalg_ops/iree_test_suites_native_test.cmake @@ -0,0 +1,114 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. 
+# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +include(CMakeParseArguments) + +# iree_test_suites_native_test() +# +# Creates a test that runs the specified binary with the specified arguments. +# +# Parameters: +# NAME: name of target +# DRIVER: If specified, will pass --device=DRIVER to the test binary and adds +# a driver label to the test. +# TODO(scotttodd): Remove automatic args/labels, push those up a level +# DATA: Additional input files needed by the test binary. +# ARGS: additional arguments passed to the test binary. +# --device=DRIVER is automatically added if specified. +# File-related arguments can be passed with `{{}}` locator, +# e.g., --input=@{{foo.npy}}. The locator is used to portably +# pass the file arguments to tests and add the file to DATA. +# SRC: binary target to run as the test. +# WILL_FAIL: The target will run, but its pass/fail status will be inverted. +# DISABLED: The target will be skipped and its status will be 'Not Run'. +# LABELS: Additional labels to apply to the test. The package path is added +# automatically. +# TIMEOUT: Test target timeout in seconds. +# +# Note: the DATA argument is not actually adding dependencies because CMake +# doesn't have a good way to specify a data dependency for a test. +# +# Usage: +# iree_cc_binary( +# NAME +# requires_args_to_run +# ... +# ) +# iree_test_suites_native_test( +# NAME +# requires_args_to_run_test +# ARGS +# --do-the-right-thing +# SRC +# ::requires_args_to_run +# ) + +function(iree_test_suites_native_test) + cmake_parse_arguments( + _RULE + "" + "NAME;SRC;DRIVER;WILL_FAIL;DISABLED" + "ARGS;LABELS;DATA;TIMEOUT" + ${ARGN} + ) + + # Prefix the test with the package name, so we get: iree_package_name + iree_package_name(_PACKAGE_NAME) + set(_NAME "${_PACKAGE_NAME}_${_RULE_NAME}") + iree_package_ns(_PACKAGE_NS) + iree_package_path(_PACKAGE_PATH) + set(_TEST_NAME "${_PACKAGE_PATH}/${_RULE_NAME}") + + # If driver was specified, add the corresponding test arg and label. + if(DEFINED _RULE_DRIVER) + list(APPEND _RULE_ARGS "--device=${_RULE_DRIVER}") + list(APPEND _RULE_LABELS "driver=${_RULE_DRIVER}") + endif() + + # Detect file location with `{{}}` and handle its portability for all entries + # in `_RULE_ARGS`. + foreach(_ARG ${_RULE_ARGS}) + string(REGEX MATCH ".*{{(.+)}}" _FILE_ARG "${_ARG}") + if(_FILE_ARG) + set(_FILE_PATH ${CMAKE_MATCH_1}) + list(APPEND _RULE_DATA "${_FILE_PATH}") + # remove the `{{}}` from `_ARG` and append it to `_TEST_ARGS`. + string(REGEX REPLACE "{{.+}}" "" _FILE_FLAG_PREFIX "${_ARG}") + list(APPEND _TEST_ARGS "${_FILE_FLAG_PREFIX}${_FILE_PATH}") + else() # naive append + list(APPEND _TEST_ARGS "${_ARG}") + endif(_FILE_ARG) + endforeach(_ARG) + + # Replace binary passed by relative ::name with iree::package::name + string(REGEX REPLACE "^::" "${_PACKAGE_NS}::" _SRC_TARGET ${_RULE_SRC}) + + add_test( + NAME + ${_TEST_NAME} + COMMAND + "$" + ${_TEST_ARGS} + ) + + # File extension cmake uses for the target platform. 
+ set_property(TEST ${TEST_NAME} APPEND PROPERTY ENVIRONMENT "IREE_DYLIB_EXT=${CMAKE_SHARED_LIBRARY_SUFFIX}") + + if (NOT DEFINED _RULE_TIMEOUT) + set(_RULE_TIMEOUT 60) + endif() + + list(APPEND _RULE_LABELS "${_PACKAGE_PATH}") + set_property(TEST ${_TEST_NAME} PROPERTY LABELS "${_RULE_LABELS}") + set_property(TEST "${_TEST_NAME}" PROPERTY REQUIRED_FILES "${_RULE_DATA}") + set_property(TEST ${_TEST_NAME} PROPERTY TIMEOUT ${_RULE_TIMEOUT}) + if(_RULE_WILL_FAIL) + set_property(TEST ${_TEST_NAME} PROPERTY WILL_FAIL ${_RULE_WILL_FAIL}) + endif() + if(_RULE_DISABLED) + set_property(TEST ${_TEST_NAME} PROPERTY DISABLED ${_RULE_DISABLED}) + endif() +endfunction() diff --git a/linalg_ops/iree_test_suites_runner_test.cmake b/linalg_ops/iree_test_suites_runner_test.cmake new file mode 100644 index 0000000..fc8ccfc --- /dev/null +++ b/linalg_ops/iree_test_suites_runner_test.cmake @@ -0,0 +1,100 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +include(CMakeParseArguments) + +# iree_test_suites_runner_test() +# +# Creates a test using a specified test runner program for the specified +# test files. +# +# Parameters: +# NAME: Name of the target +# TESTS_SRC: MLIR source file to be compiled to an IREE module. +# CALLS_SRC: MLIR source file with calls to be compiled to an IREE module. +# TEST_RUNNER: Test runner program. +# TARGET_BACKEND: Target backend to compile for. +# DRIVER: Driver to run the module with. +# COMPILER_ARGS: additional args to pass to the compiler. +# Target backend flags are passed automatically. +# RUNNER_ARGS: Additional args to pass to the runner program. +# The device and input file flags are passed automatically. +# LABELS: Additional labels to apply to the test. +# "driver=${DRIVER}" is added automatically. +function(iree_test_suites_runner_test) + cmake_parse_arguments( + _RULE + "" + "NAME;TESTS_SRC;CALLS_SRC;TEST_RUNNER;TARGET_BACKEND;DRIVER" + "COMPILER_ARGS;RUNNER_ARGS;LABELS" + ${ARGN} + ) + + iree_package_name(_PACKAGE_NAME) + set(_NAME "${_PACKAGE_NAME}_${_RULE_NAME}") + + set(_BASE_COMPILER_FLAGS + "--iree-hal-target-backends=${_RULE_TARGET_BACKEND}" + ) + + set(_TESTS_VMFB "${CMAKE_CURRENT_BINARY_DIR}/${_RULE_NAME}.vmfb") + set(_CALLS_VMFB "${CMAKE_CURRENT_BINARY_DIR}/${_RULE_NAME}_calls.vmfb") + + iree_bytecode_module( + NAME + "${_RULE_NAME}_module" + MODULE_FILE_NAME + "${_TESTS_VMFB}" + SRC + "${_RULE_TESTS_SRC}" + FLAGS + "${_BASE_COMPILER_FLAGS}" + "${_RULE_COMPILER_FLAGS}" + ) + iree_bytecode_module( + NAME + "${_RULE_NAME}_calls_module" + MODULE_FILE_NAME + "${_CALLS_VMFB}" + SRC + "${_RULE_CALLS_SRC}" + FLAGS + "${_BASE_COMPILER_FLAGS}" + "${_RULE_COMPILER_FLAGS}" + ) + + # A target specifically for the test. We could combine this with the above, + # but we want that one to get pulled into iree_bytecode_module. 
+ add_custom_target("${_NAME}" ALL) + add_dependencies( + "${_NAME}" + "${_NAME}_module" + "${_NAME}_calls_module" + "${_RULE_TEST_RUNNER}" + ) + + add_dependencies(iree-test-suites-linalg-ops-deps "${_NAME}") + + iree_test_suites_native_test( + NAME + "${_RULE_NAME}${_RULE_VARIANT_NAME}" + DRIVER + "${_RULE_DRIVER}" + SRC + "${_RULE_TEST_RUNNER}" + DATA + ${_TESTS_VMFB} + ${_CALLS_VMFB} + ARGS + "--module={{${_TESTS_VMFB}}}" + "--module={{${_CALLS_VMFB}}}" + ${_RULE_RUNNER_ARGS} + LABELS + ${_RULE_LABELS} + DISABLED + ${_RULE_TEST_DISABLED} + ) +endfunction() diff --git a/linalg_ops/matmul/CMakeLists.txt b/linalg_ops/matmul/CMakeLists.txt index 346e251..b3d6a70 100644 --- a/linalg_ops/matmul/CMakeLists.txt +++ b/linalg_ops/matmul/CMakeLists.txt @@ -1,2036 +1,193 @@ -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# tests/e2e/matmul/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. # -################################################################################ - -iree_add_all_subdirs() - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_arm_sme_nondt_f32_small_transpose_lhs_peel - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - "--iree-preprocessing-pass-pipeline=builtin.module\(util.func\(iree-preprocessing-transpose-matmul-pass{input=lhs}\)\)" - "--iree-llvmcpu-vector-pproc-strategy=peel" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_arm_sme_nondt_f32_small_transpose_lhs - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - "--iree-preprocessing-pass-pipeline=builtin.module\(util.func\(iree-preprocessing-transpose-matmul-pass{input=lhs}\)\)" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_arm_sme_nondt_f32_small_peel - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - "--iree-llvmcpu-vector-pproc-strategy=peel" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_arm_sme_nondt_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - 
"--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_arm_sme_nondt_f32_large_transpose_lhs_peel - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - "--iree-preprocessing-pass-pipeline=builtin.module\(util.func\(iree-preprocessing-transpose-matmul-pass{input=lhs}\)\)" - "--iree-llvmcpu-vector-pproc-strategy=peel" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_arm_sme_nondt_f32_large_transpose_lhs - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - "--iree-preprocessing-pass-pipeline=builtin.module\(util.func\(iree-preprocessing-transpose-matmul-pass{input=lhs}\)\)" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_arm_sme_nondt_f32_large_peel - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - "--iree-llvmcpu-vector-pproc-strategy=peel" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_arm_sme_nondt_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling=false" - "--iree-llvmcpu-enable-scalable-vectorization" - "--iree-llvmcpu-target-triple=aarch64-unknown-unknown" - LABELS - "requires-arm-sme" - TARGET_CPU_FEATURES_VARIANTS - "arm_64:sme:+sve,+sme" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_i8_i32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - 
"default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_i8_i32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_f32_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_f32_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_f16_f16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_f16_f16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_f16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=small" - 
TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_f16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_bf16_bf16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_bf16_bf16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_bf16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_bf16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - 
"--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_uk_i8_i32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_uk_i8_i32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_uk_f32_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_uk_f32_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_uk_f16_f16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - 
"x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_uk_f16_f16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_uk_f16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_uk_f16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_uk_bf16_bf16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_uk_bf16_bf16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - 
"x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_uk_bf16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_dt_uk_bf16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_i8_i32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_i8_i32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_f32_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - - 
TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_f32_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_f16_f16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_f16_f16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_f16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_f16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" 
- TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_bf16_bf16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_bf16_bf16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_bf16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_bf16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=none" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - 
matmul_cpu_experimental_dt_uk_i8_i32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_uk_i8_i32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "arm_64:dotprod:+dotprod" - "arm_64:i8mm:+i8mm" - "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_uk_f32_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_uk_f32_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_uk_f16_f16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_uk_f16_f16_large - TEST_TYPE - matmul - GENERATOR - 
"generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f16" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fullfp16:+fullfp16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_uk_f16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_uk_f16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "arm_64:fp16fml:+fp16fml" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_uk_bf16_bf16_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_uk_bf16_bf16_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=bf16" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - 
"x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_uk_bf16_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cpu_experimental_dt_uk_bf16_f32_large - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=bf16" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "llvm-cpu" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - "--iree-llvmcpu-enable-ukernels=all" - LABELS - "noasan" - "notsan" - "noriscv" - "nowasm" - TARGET_CPU_FEATURES_VARIANTS - "default" - "x86_64:avx2:+avx,+avx2,+fma,+f16c" - "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq" - "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16" - "arm_64:bf16:+bf16" -) - -iree_generated_e2e_runner_test( - NAME - matmul_vmvx_experimental_dt_f32_f32 - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "vmvx" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - LABELS - -) - -iree_generated_e2e_runner_test( - NAME - matmul_vmvx_experimental_dt_uk_f32_f32 - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "vmvx" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - LABELS - -) - -iree_generated_e2e_runner_test( - NAME - matmul_cuda_experimental_dt_f32_f32 - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "cuda" - DRIVERS - "cuda" - COMPILER_FLAGS - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - LABELS - -) - -iree_generated_e2e_runner_test( - NAME - matmul_spirv_experimental_dt_f32_f32 - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "vulkan-spirv" - DRIVERS - "vulkan" - COMPILER_FLAGS 
- "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - LABELS - -) - -iree_generated_e2e_runner_test( - NAME - matmul_vmvx_dt_uk_i8_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=i8" - "--acc_type=i32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "vmvx" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-vmvx-enable-microkernels" - "--iree-opt-data-tiling" -) - -iree_generated_e2e_runner_test( - NAME - matmul_vmvx_dt_uk_f32_small - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "vmvx" - DRIVERS - "local-task" - COMPILER_FLAGS - "--iree-vmvx-enable-microkernels" - "--iree-opt-data-tiling" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cuda_f32_large_unaligned - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=gpu_large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "cuda" - DRIVERS - "cuda" - COMPILER_FLAGS - "--iree-cuda-target=sm_80" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-sm80" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cuda_f16_large_unaligned - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f16" - "--acc_type=f32" - "--shapes=gpu_large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "cuda" - DRIVERS - "cuda" - COMPILER_FLAGS - "--iree-cuda-target=sm_80" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-sm80" -) - -iree_generated_e2e_runner_test( - NAME - matmul_cuda_f32_large_splitk - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=large" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "cuda" - DRIVERS - "cuda" - COMPILER_FLAGS - "--iree-dispatch-creation-split-matmul-reduction=4" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-nvidia" - "noriscv" -) - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### - -# To distinguish between CDNA(gfx9) and RDNA3(gfx11) -if(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx9") - -unset(IREE_HIP_TEST_COMPILER_FLAGS) -list(APPEND IREE_HIP_TEST_COMPILER_FLAGS - "--iree-rocm-target-chip=${IREE_HIP_TEST_TARGET_CHIP}" -) - -if(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx94") - -# I8 Intrinsics has different layout on CDNA3/gfx94x, -# and only CDNA3/gfx94x has F8 intrinsics. 
- -iree_generated_e2e_runner_test( - NAME - matmul_cdna_experimental_dt_f32_f32 - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "rocm" - DRIVERS - "hip" - COMPILER_FLAGS - ${IREE_HIP_TEST_COMPILER_FLAGS} - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-cdna3" -) -endif() - -elseif(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx11") - -unset(IREE_HIP_TEST_COMPILER_FLAGS) -list(APPEND IREE_HIP_TEST_COMPILER_FLAGS - "--iree-rocm-target-chip=${IREE_HIP_TEST_TARGET_CHIP}" -) - -iree_generated_e2e_runner_test( - NAME - matmul_rdna3_experimental_dt_f32_f32 - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=small" - TEST_RUNNER - iree-test-suites_iree-e2e-matmul-test - TARGET_BACKENDS - "rocm" - DRIVERS - "hip" - COMPILER_FLAGS - ${IREE_HIP_TEST_COMPILER_FLAGS} - "--iree-opt-data-tiling" - "--iree-global-opt-enable-early-materialization=false" - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-rdna3" -) - -endif() +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# TODO(scotttodd): add filtering here, in the helper functions, or in ctest to +# choose which tests to compile and run + +set(_SIZES) +list(APPEND _SIZES "large") +list(APPEND _SIZES "small") + +############################################################################### +# +# CPU - llvm-cpu on local-task, default flags. +# +############################################################################### + +set(_DTYPES) +list(APPEND _DTYPES "i8_into_i32") +list(APPEND _DTYPES "f32_into_f32") +list(APPEND _DTYPES "f16_into_f16") +list(APPEND _DTYPES "f16_into_f32") +list(APPEND _DTYPES "bf16_into_bf16") +list(APPEND _DTYPES "bf16_into_f32") +# list(APPEND _DTYPES "f8E4M3FNUZ_into_f32") # Unsupported data type. +foreach(_DTYPE IN LISTS _DTYPES) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME + matmul_cpu_${_DTYPE}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-matmul-test + TARGET_BACKEND + "llvm-cpu" + DRIVER + "local-task" + COMPILER_ARGS + RUNNER_ARGS + LABELS + ) + endforeach() +endforeach() + +############################################################################### +# +# CPU - vmvx on local-task, default flags. +# +############################################################################### + +set(_DTYPES) +list(APPEND _DTYPES "i8_into_i32") +list(APPEND _DTYPES "f32_into_f32") +# list(APPEND _DTYPES "f16_into_f16") # Unsupported data type. +# list(APPEND _DTYPES "f16_into_f32") # Unsupported data type. +# list(APPEND _DTYPES "bf16_into_bf16") # Unsupported data type. +# list(APPEND _DTYPES "bf16_into_f32") # Unsupported data type. +# list(APPEND _DTYPES "f8E4M3FNUZ_into_f32") # Unsupported data type. +foreach(_DTYPE IN LISTS _DTYPES) + # Note: not running large tests on vmvx, too slow. 
+ set(_SIZE "small") + iree_test_suites_runner_test( + NAME + matmul_vmvx_${_DTYPE}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-matmul-test + TARGET_BACKEND + "vmvx" + DRIVER + "local-task" + COMPILER_ARGS + RUNNER_ARGS + LABELS + ) +endforeach() + +############################################################################### +# +# GPU - Vulkan, default flags. +# +############################################################################### + +set(_DTYPES) +# list(APPEND _DTYPES "i8_into_i32") # Currently failing. +list(APPEND _DTYPES "f32_into_f32") +# list(APPEND _DTYPES "f16_into_f16") # Failing to compile. +# list(APPEND _DTYPES "f16_into_f32") # Failing to compile. +# list(APPEND _DTYPES "bf16_into_bf16") # Failing to compile. +# list(APPEND _DTYPES "bf16_into_f32") # Failing to compile. +# list(APPEND _DTYPES "f8E4M3FNUZ_into_f32") # Unsupported data type. +foreach(_DTYPE IN LISTS _DTYPES) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME + matmul_vulkan_${_DTYPE}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-matmul-test + TARGET_BACKEND + "vulkan-spirv" + DRIVER + "vulkan" + COMPILER_ARGS + RUNNER_ARGS + LABELS + ) + endforeach() +endforeach() + +############################################################################### +# +# GPU - CUDA, default flags. +# +############################################################################### + +set(_DTYPES) +list(APPEND _DTYPES "i8_into_i32") +list(APPEND _DTYPES "f32_into_f32") +# list(APPEND _DTYPES "f16_into_f16") # Timeout running. +list(APPEND _DTYPES "f16_into_f32") +# list(APPEND _DTYPES "bf16_into_bf16") # Timeout running. +list(APPEND _DTYPES "bf16_into_f32") +# list(APPEND _DTYPES "f8E4M3FNUZ_into_f32") # Unsupported data type. +foreach(_DTYPE IN LISTS _DTYPES) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME + matmul_cuda_${_DTYPE}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-matmul-test + TARGET_BACKEND + "cuda" + DRIVER + "cuda" + COMPILER_ARGS + RUNNER_ARGS + LABELS + ) + endforeach() +endforeach() + +############################################################################### +# +# GPU - ROCm/HIP, default flags. +# +############################################################################### + +set(_DTYPES) +list(APPEND _DTYPES "i8_into_i32") +list(APPEND _DTYPES "f32_into_f32") +list(APPEND _DTYPES "f16_into_f16") +list(APPEND _DTYPES "f16_into_f32") +list(APPEND _DTYPES "bf16_into_bf16") +list(APPEND _DTYPES "bf16_into_f32") +# list(APPEND _DTYPES "f8E4M3FNUZ_into_f32") # Failing to compile. 
+foreach(_DTYPE IN LISTS _DTYPES) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME + matmul_hip_${_DTYPE}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-matmul-test + TARGET_BACKEND + "rocm" + DRIVER + "hip" + COMPILER_ARGS + "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}" + RUNNER_ARGS + LABELS + ) + endforeach() +endforeach() diff --git a/linalg_ops/matmul/generate_e2e_matmul_tests.py b/linalg_ops/matmul/generate_e2e_matmul_tests.py index 1565c0b..0f6f8e0 100644 --- a/linalg_ops/matmul/generate_e2e_matmul_tests.py +++ b/linalg_ops/matmul/generate_e2e_matmul_tests.py @@ -33,8 +33,6 @@ class MatrixElemTypeId(enum.Enum): class ShapesId(enum.Enum): SMALL = "small" LARGE = "large" - GPU_LARGE = "gpu_large" - GPU_LARGE_ALIGNED = "gpu_large_aligned" # Enumerates ways to construct MLIR tensor types. @@ -45,13 +43,6 @@ class Dynamicity(enum.Enum): MIXED = "mixed" # Randomly mix '?' and values. Example: tensor. -# Enumerates ways to initialize matrix buffer contents. -@enum.unique -class MatrixGenerator(enum.Enum): - ZERO = "zero" # Fill with zeros - RANDOM = "random" # Fill with (deterministic) pseudorandom values. - - # Describes the shape of a matrix multiplication in the usual convention: # the LHS is {m}x{k}, the RHS is {k}x{n}, the accumulator/result is {m}x{n}. # The extra `accumulate` boolean tells whether the matmul is accumulating into @@ -73,9 +64,8 @@ def get_test_shapes(shapes_id: ShapesId): # build and execution latency of tests. The build latency is nearly the # same for all shapes, while execution latency grows cubicly i.e. # linearly with m*k*n. - # 2. Some shapes are commented out: they used to be tested but have been - # disabled to improve the trade-off between test coverage and build - # latency. + # 2. Some shapes may be commented out to improve the trade-off between test + # coverage and build latency. if shapes_id == ShapesId.SMALL: return [ # square matrices. Start by the simplest case of 1x1x1. @@ -107,57 +97,35 @@ def get_test_shapes(shapes_id: ShapesId): ] if shapes_id == ShapesId.LARGE: return [ - # some random large sizes - TestShape(m=123, k=456, n=789, accumulate=True), - TestShape(m=654, k=321, n=234, accumulate=False), - # shapes involving vectors (i.e. most rectangular cases) - TestShape(m=1, k=1000, n=1000, accumulate=True), # large vector*matrix - TestShape(m=1000, k=1000, n=1, accumulate=True), # large matrix*vector - TestShape(m=1000, k=1000, n=1, accumulate=False), # large matrix*vector - # Be conservative in adding larger shapes. They can result in - # high latency tests. If you have to, consider splitting them - # out in a way that constrains the latency impact, e.g. by - # running on fewer backends/drivers or with fewer generators - # (see get_test_generators). - ] - if shapes_id == ShapesId.GPU_LARGE_ALIGNED: - return [ + # Large aligned sizes. TestShape(m=512, k=128, n=512, accumulate=True), TestShape(m=512, k=128, n=512, accumulate=False), - ] - if shapes_id == ShapesId.GPU_LARGE: - return [ - # unaligned cases. 
- TestShape(m=457, k=330, n=512, accumulate=False), - TestShape(m=457, k=330, n=514, accumulate=False), - TestShape(m=438, k=330, n=514, accumulate=False), - TestShape(m=540, k=332, n=516, accumulate=False), TestShape(m=1000, k=4, n=512, accumulate=False), TestShape(m=4, k=1000, n=512, accumulate=False), TestShape(m=512, k=1000, n=4, accumulate=False), TestShape(m=512, k=128, n=500, accumulate=False), + # Large unaligned sizes. + # TestShape(m=123, k=456, n=789, accumulate=True), # Failing on Vulkan + TestShape(m=457, k=330, n=512, accumulate=False), + TestShape(m=457, k=330, n=514, accumulate=False), + TestShape(m=438, k=330, n=514, accumulate=False), + TestShape(m=540, k=332, n=516, accumulate=False), + TestShape(m=654, k=321, n=234, accumulate=False), TestShape(m=457, k=160, n=512, accumulate=False), TestShape(m=512, k=330, n=512, accumulate=False), + # Shapes involving vectors (i.e. most rectangular cases). + TestShape(m=1, k=1000, n=1000, accumulate=True), # large vector*matrix + TestShape(m=1000, k=1000, n=1, accumulate=True), # large matrix*vector + TestShape(m=1000, k=1000, n=1, accumulate=False), # large matrix*vector + # Be conservative in adding larger shapes. They can result in + # high latency tests. If you have to, consider splitting them + # out in a way that constrains the latency impact, e.g. by + # running on fewer backends/drivers. ] raise ValueError(shapes_id) -# Returns the list of Dynamicity's to use for the collection of shapes -# identified by shapes_id. -def get_dynamicities(shapes_id: ShapesId): - if shapes_id == ShapesId.GPU_LARGE or shapes_id == ShapesId.GPU_LARGE_ALIGNED: - return [ - Dynamicity.STATIC, - ] - else: - return [ - Dynamicity.DYNAMIC, - Dynamicity.STATIC, - ] - raise ValueError(shapes_id) - - # A shape dimension value, i.e. a size value that could appear in a MLIR type # such as 'tensor'. None means a dynamic size, similar to '?' in MLIR. @dataclasses.dataclass @@ -312,14 +280,14 @@ def generate_function( compute = ( f" %lhs_casted = {castback_op} %lhs: {lhs_tensor_type} to {compute_lhs_tensor_type}\n" f" %rhs_casted = {castback_op} %rhs: {rhs_tensor_type} to {compute_rhs_tensor_type}\n" - f" %result = {op_name} ins(%lhs_casted, %rhs_casted: {compute_lhs_tensor_type}, {compute_rhs_tensor_type}) outs(%acc: {acc_tensor_type}) -> {acc_tensor_type}" + f" %result = {op_name} ins(%lhs_casted, %rhs_casted: {compute_lhs_tensor_type}, {compute_rhs_tensor_type}) outs(%acc: {acc_tensor_type}) -> {acc_tensor_type}\n" ) if shape.accumulate: signature = f"({lhs_tensor_type}, {rhs_tensor_type}, {acc_tensor_type}) -> {acc_tensor_type}" import_declaration = f"func.func private @module.{func_name}(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view" func_definition = func_definition + ( f"func.func @{func_name}(%lhs: {lhs_tensor_type}, %rhs: {rhs_tensor_type}, %acc: {acc_tensor_type}) -> {acc_tensor_type} {{\n" - f"{compute}\n" + f"{compute}" f" return %result: {acc_tensor_type}\n" f"}}\n" ) @@ -375,17 +343,6 @@ class TestCall: pseudorandom_generator_seed = 1 -def contents_generator_tag(generator: MatrixGenerator): - if generator == MatrixGenerator.ZERO: - return "" - elif generator == MatrixGenerator.RANDOM: - global pseudorandom_generator_seed - pseudorandom_generator_seed = pseudorandom_generator_seed + 1 - return f"!tag:iree:fully_specified_pseudorandom {pseudorandom_generator_seed}" - else: - raise ValueError(generator) - - # Generate a matrix function argument of the given size as `%name`. 
def generate_random_matrix( name: str, @@ -483,7 +440,7 @@ def generate( calls = [] for shape in get_test_shapes(shapes_id): - for dynamicity in get_dynamicities(shapes_id): + for dynamicity in [Dynamicity.DYNAMIC, Dynamicity.STATIC]: function = generate_function( lhs_rhs_type, acc_type, @@ -558,13 +515,20 @@ def parse_arguments(): def write_code_file(functions, filename): + # TODO(scotttodd): write "GENERATED BY" comment to the top of the file + with open(filename, "w") as file: for function in functions.values(): file.write(function.definition + "\n") def write_calls_file(functions, calls, filename, requirements): + # TODO(scotttodd): write "GENERATED BY" comment to the top of the file + # Module-level reflection information used to control the test tool. + # TODO(scotttodd): drop this and whatever logic in the test tool used it + # multiple backends should be able to use the same input IR, so the + # input IR shouldn't need things like CPU features in it reflection = "" if requirements: reflection = ( diff --git a/linalg_ops/matmul/generate_test_mlir_files.sh b/linalg_ops/matmul/generate_test_mlir_files.sh new file mode 100755 index 0000000..0f797da --- /dev/null +++ b/linalg_ops/matmul/generate_test_mlir_files.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# This script runs generate_e2e_matmul_tests for all argument combinations that +# we are interested in testing. +# +# The output is a 'generated' folder with contents like this: +# linalg_ops/ +# matmul/ +# generated/ +# f16_into_f16/ +# matmul_f16_into_f16_large_calls.mlir +# matmul_f16_into_f16_large.mlir +# matmul_f16_into_f16_small_calls.mlir +# matmul_f16_into_f16_small.mlir +# ... +# matmul_transpose_b_f16_into_f16_large_calls.mlir +# matmul_transpose_b_f16_into_f16_large.mlir +# matmul_transpose_b_f16_into_f16_small_calls.mlir +# matmul_transpose_b_f16_into_f16_small.mlir +# f16_into_f32/ +# ... +# f32_into_f32 +# ... +# ... +# +# Usage: +# generate_test_mlir_files.sh + +set -euo pipefail + +this_dir="$(cd $(dirname $0) && pwd)" +generated_dir_root="${this_dir}/generated" + +# Reset generated directory. 
+rm -rf ${generated_dir_root?} +mkdir -p ${generated_dir_root?} + +shapes=( + "small" + "large" +) + +# lhs_rhs_type;accumulator_type +type_combinations=( + "i8;i32" + "f32;f32" + "f16;f16" + "f16;f32" + "bf16;bf16" + "bf16;f32" + "f8E4M3FNUZ;f32" +) + +for type_combination in ${type_combinations[@]}; do + IFS=";" read -r -a types <<< "${type_combination}" + lhs_rhs_type="${types[0]}" + acc_type="${types[1]}" + type_name="${lhs_rhs_type}_into_${acc_type}" + + type_combination_dir="${generated_dir_root}/${type_name}" + mkdir -p ${type_combination_dir} + + for shape in ${shapes[@]}; do + echo "Generating matmul test files for ${type_name}_${shape}" + + name="matmul_${type_name}_${shape}" + python ${this_dir}/generate_e2e_matmul_tests.py \ + --output_matmul_mlir=${type_combination_dir}/${name}.mlir \ + --output_calls_mlir=${type_combination_dir}/${name}_calls.mlir \ + --lhs_rhs_type=${lhs_rhs_type} \ + --acc_type=${acc_type} \ + --shapes=${shape} + + name="matmul_transpose_b_${type_name}_${shape}" + python ${this_dir}/generate_e2e_matmul_tests.py \ + --output_matmul_mlir=${type_combination_dir}/${name}.mlir \ + --output_calls_mlir=${type_combination_dir}/${name}_calls.mlir \ + --lhs_rhs_type=${lhs_rhs_type} \ + --acc_type=${acc_type} \ + --shapes=${shape} \ + --transpose_rhs + done +done diff --git a/linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_large.mlir b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_large.mlir new file mode 100644 index 0000000..52e78d6 --- /dev/null +++ b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_large.mlir @@ -0,0 +1,136 @@ +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: tensor, %rhs: tensor, %acc: tensor) -> tensor { + %result = linalg.matmul ins(%lhs, %rhs: tensor, tensor) outs(%acc: tensor) -> tensor + return %result: tensor +} + +func.func @matmul_accumulate_512x128xbf16_times_128x512xbf16_into_512x512xbf16(%lhs: tensor<512x128xbf16>, %rhs: tensor<128x512xbf16>, %acc: tensor<512x512xbf16>) -> tensor<512x512xbf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xbf16>, tensor<128x512xbf16>) outs(%acc: tensor<512x512xbf16>) -> tensor<512x512xbf16> + return %result: tensor<512x512xbf16> +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: tensor, %rhs: tensor) -> tensor { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %acc_dim0 = tensor.dim %lhs, %c0 : tensor + %acc_dim1 = tensor.dim %rhs, %c1 : tensor + %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor) -> tensor + %result = linalg.matmul ins(%lhs, %rhs: tensor, tensor) outs(%acc: tensor) -> tensor + return %result: tensor +} + +func.func @matmul_512x128xbf16_times_128x512xbf16_into_512x512xbf16(%lhs: tensor<512x128xbf16>, %rhs: tensor<128x512xbf16>) -> tensor<512x512xbf16> { + %init_acc = tensor.empty() : tensor<512x512xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<512x512xbf16>) -> tensor<512x512xbf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xbf16>, tensor<128x512xbf16>) outs(%acc: tensor<512x512xbf16>) -> tensor<512x512xbf16> + return %result: tensor<512x512xbf16> +} + +func.func @matmul_1000x4xbf16_times_4x512xbf16_into_1000x512xbf16(%lhs: tensor<1000x4xbf16>, %rhs: tensor<4x512xbf16>) -> tensor<1000x512xbf16> { + %init_acc = tensor.empty() : 
tensor<1000x512xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<1000x512xbf16>) -> tensor<1000x512xbf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x4xbf16>, tensor<4x512xbf16>) outs(%acc: tensor<1000x512xbf16>) -> tensor<1000x512xbf16> + return %result: tensor<1000x512xbf16> +} + +func.func @matmul_4x1000xbf16_times_1000x512xbf16_into_4x512xbf16(%lhs: tensor<4x1000xbf16>, %rhs: tensor<1000x512xbf16>) -> tensor<4x512xbf16> { + %init_acc = tensor.empty() : tensor<4x512xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<4x512xbf16>) -> tensor<4x512xbf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<4x1000xbf16>, tensor<1000x512xbf16>) outs(%acc: tensor<4x512xbf16>) -> tensor<4x512xbf16> + return %result: tensor<4x512xbf16> +} + +func.func @matmul_512x1000xbf16_times_1000x4xbf16_into_512x4xbf16(%lhs: tensor<512x1000xbf16>, %rhs: tensor<1000x4xbf16>) -> tensor<512x4xbf16> { + %init_acc = tensor.empty() : tensor<512x4xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<512x4xbf16>) -> tensor<512x4xbf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x1000xbf16>, tensor<1000x4xbf16>) outs(%acc: tensor<512x4xbf16>) -> tensor<512x4xbf16> + return %result: tensor<512x4xbf16> +} + +func.func @matmul_512x128xbf16_times_128x500xbf16_into_512x500xbf16(%lhs: tensor<512x128xbf16>, %rhs: tensor<128x500xbf16>) -> tensor<512x500xbf16> { + %init_acc = tensor.empty() : tensor<512x500xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<512x500xbf16>) -> tensor<512x500xbf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xbf16>, tensor<128x500xbf16>) outs(%acc: tensor<512x500xbf16>) -> tensor<512x500xbf16> + return %result: tensor<512x500xbf16> +} + +func.func @matmul_457x330xbf16_times_330x512xbf16_into_457x512xbf16(%lhs: tensor<457x330xbf16>, %rhs: tensor<330x512xbf16>) -> tensor<457x512xbf16> { + %init_acc = tensor.empty() : tensor<457x512xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<457x512xbf16>) -> tensor<457x512xbf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x330xbf16>, tensor<330x512xbf16>) outs(%acc: tensor<457x512xbf16>) -> tensor<457x512xbf16> + return %result: tensor<457x512xbf16> +} + +func.func @matmul_457x330xbf16_times_330x514xbf16_into_457x514xbf16(%lhs: tensor<457x330xbf16>, %rhs: tensor<330x514xbf16>) -> tensor<457x514xbf16> { + %init_acc = tensor.empty() : tensor<457x514xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<457x514xbf16>) -> tensor<457x514xbf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x330xbf16>, tensor<330x514xbf16>) outs(%acc: tensor<457x514xbf16>) -> tensor<457x514xbf16> + return %result: tensor<457x514xbf16> +} + +func.func @matmul_438x330xbf16_times_330x514xbf16_into_438x514xbf16(%lhs: tensor<438x330xbf16>, %rhs: tensor<330x514xbf16>) -> tensor<438x514xbf16> { + %init_acc = tensor.empty() : tensor<438x514xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<438x514xbf16>) -> tensor<438x514xbf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<438x330xbf16>, tensor<330x514xbf16>) outs(%acc: tensor<438x514xbf16>) -> tensor<438x514xbf16> + return %result: 
tensor<438x514xbf16> +} + +func.func @matmul_540x332xbf16_times_332x516xbf16_into_540x516xbf16(%lhs: tensor<540x332xbf16>, %rhs: tensor<332x516xbf16>) -> tensor<540x516xbf16> { + %init_acc = tensor.empty() : tensor<540x516xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<540x516xbf16>) -> tensor<540x516xbf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<540x332xbf16>, tensor<332x516xbf16>) outs(%acc: tensor<540x516xbf16>) -> tensor<540x516xbf16> + return %result: tensor<540x516xbf16> +} + +func.func @matmul_654x321xbf16_times_321x234xbf16_into_654x234xbf16(%lhs: tensor<654x321xbf16>, %rhs: tensor<321x234xbf16>) -> tensor<654x234xbf16> { + %init_acc = tensor.empty() : tensor<654x234xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<654x234xbf16>) -> tensor<654x234xbf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<654x321xbf16>, tensor<321x234xbf16>) outs(%acc: tensor<654x234xbf16>) -> tensor<654x234xbf16> + return %result: tensor<654x234xbf16> +} + +func.func @matmul_457x160xbf16_times_160x512xbf16_into_457x512xbf16(%lhs: tensor<457x160xbf16>, %rhs: tensor<160x512xbf16>) -> tensor<457x512xbf16> { + %init_acc = tensor.empty() : tensor<457x512xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<457x512xbf16>) -> tensor<457x512xbf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x160xbf16>, tensor<160x512xbf16>) outs(%acc: tensor<457x512xbf16>) -> tensor<457x512xbf16> + return %result: tensor<457x512xbf16> +} + +func.func @matmul_512x330xbf16_times_330x512xbf16_into_512x512xbf16(%lhs: tensor<512x330xbf16>, %rhs: tensor<330x512xbf16>) -> tensor<512x512xbf16> { + %init_acc = tensor.empty() : tensor<512x512xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<512x512xbf16>) -> tensor<512x512xbf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x330xbf16>, tensor<330x512xbf16>) outs(%acc: tensor<512x512xbf16>) -> tensor<512x512xbf16> + return %result: tensor<512x512xbf16> +} + +func.func @matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xbf16(%lhs: tensor<1x1000xbf16>, %rhs: tensor<1000x1000xbf16>, %acc: tensor<1x1000xbf16>) -> tensor<1x1000xbf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1000xbf16>, tensor<1000x1000xbf16>) outs(%acc: tensor<1x1000xbf16>) -> tensor<1x1000xbf16> + return %result: tensor<1x1000xbf16> +} + +func.func @matmul_accumulate_1000x1000xbf16_times_1000x1xbf16_into_1000x1xbf16(%lhs: tensor<1000x1000xbf16>, %rhs: tensor<1000x1xbf16>, %acc: tensor<1000x1xbf16>) -> tensor<1000x1xbf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x1000xbf16>, tensor<1000x1xbf16>) outs(%acc: tensor<1000x1xbf16>) -> tensor<1000x1xbf16> + return %result: tensor<1000x1xbf16> +} + +func.func @matmul_1000x1000xbf16_times_1000x1xbf16_into_1000x1xbf16(%lhs: tensor<1000x1000xbf16>, %rhs: tensor<1000x1xbf16>) -> tensor<1000x1xbf16> { + %init_acc = tensor.empty() : tensor<1000x1xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<1000x1xbf16>) -> tensor<1000x1xbf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x1000xbf16>, tensor<1000x1xbf16>) outs(%acc: tensor<1000x1xbf16>) -> tensor<1000x1xbf16> + return %result: tensor<1000x1xbf16> +} + diff --git 
a/linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_large_calls.mlir b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_large_calls.mlir new file mode 100644 index 0000000..c2e72e9 --- /dev/null +++ b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_large_calls.mlir @@ -0,0 +1,882 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_512x128xbf16_times_128x512xbf16_into_512x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xbf16_times_128x512xbf16_into_512x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x4xbf16_times_4x512xbf16_into_1000x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_4x1000xbf16_times_1000x512xbf16_into_4x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x1000xbf16_times_1000x4xbf16_into_512x4xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xbf16_times_128x500xbf16_into_512x500xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xbf16_times_330x512xbf16_into_457x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xbf16_times_330x514xbf16_into_457x514xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_438x330xbf16_times_330x514xbf16_into_438x514xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_540x332xbf16_times_332x516xbf16_into_540x516xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_654x321xbf16_times_321x234xbf16_into_654x234xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x160xbf16_times_160x512xbf16_into_457x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x330xbf16_times_330x512xbf16_into_512x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1000x1000xbf16_times_1000x1xbf16_into_1000x1xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private 
@module.matmul_1000x1000xbf16_times_1000x1xbf16_into_1000x1xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_512_128_512_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xbf16_times_128x512xbf16_into_512x512xbf16_512_128_512_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) 
: (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xbf16_times_128x512xbf16_into_512x512xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xbf16_times_128x512xbf16_into_512x512xbf16_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xbf16_times_128x512xbf16_into_512x512xbf16(%lhs, %rhs) : 
(!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xbf16_times_4x512xbf16_into_1000x512xbf16_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xbf16_times_4x512xbf16_into_1000x512xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 
4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xbf16_times_1000x512xbf16_into_4x512xbf16_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xbf16_times_1000x512xbf16_into_4x512xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed 
= arith.constant 21 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x1000xbf16_times_1000x4xbf16_into_512x4xbf16_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x1000xbf16_times_1000x4xbf16_into_512x4xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, 
%k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xbf16_times_128x500xbf16_into_512x500xbf16_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xbf16_times_128x500xbf16_into_512x500xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xbf16_times_330x512xbf16_into_457x512xbf16_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = 
hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xbf16_times_330x512xbf16_into_457x512xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xbf16_times_330x514xbf16_into_457x514xbf16_457_330_514_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = 
util.null : !hal.buffer_view + %result = call @module.matmul_457x330xbf16_times_330x514xbf16_into_457x514xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xbf16_times_330x514xbf16_into_438x514xbf16_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xbf16_times_330x514xbf16_into_438x514xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xbf16_times_332x516xbf16_into_540x516xbf16_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xbf16_times_332x516xbf16_into_540x516xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, 
i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xbf16_times_321x234xbf16_into_654x234xbf16_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xbf16_times_321x234xbf16_into_654x234xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 
457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xbf16_times_160x512xbf16_into_457x512xbf16_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 50 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xbf16_times_160x512xbf16_into_457x512xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x330xbf16_times_330x512xbf16_into_512x512xbf16_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index 
+ %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x330xbf16_times_330x512xbf16_into_512x512xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xbf16_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 
: i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1000x1000xbf16_times_1000x1xbf16_into_1000x1xbf16_1000_1000_1_acc_29() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xbf16_times_1000x1xbf16_into_1000x1xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1000_1000_1_30() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1000 : i64
+  %rhs_dim1 = arith.constant 1 : i64
+  %rhs_element_type = hal.element_type<bf16> : i32
+  %rhs_seed = arith.constant 69 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 1000 : i64
+  %k = arith.constant 1000 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 0 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @matmul_1000x1000xbf16_times_1000x1xbf16_into_1000x1xbf16_1000_1000_1_31() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 1000 : i64
+  %lhs_dim1 = arith.constant 1000 : i64
+  %lhs_element_type = hal.element_type<bf16> : i32
+  %lhs_seed = arith.constant 70 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1000 : i64
+  %rhs_dim1 = arith.constant 1 : i64
+  %rhs_element_type = hal.element_type<bf16> : i32
+  %rhs_seed = arith.constant 71 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_1000x1000xbf16_times_1000x1xbf16_into_1000x1xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 1000 : i64
+  %k = arith.constant 1000 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 0 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
diff --git a/linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_small.mlir b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_small.mlir
new file mode 100644
index 0000000..4537ce7
--- /dev/null
+++ b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_small.mlir
@@ -0,0 +1,99 @@
+func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>, %acc: tensor<?x?xbf16>) -> tensor<?x?xbf16> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xbf16>) -> tensor<?x?xbf16>
+  return %result: tensor<?x?xbf16>
+}
+
+func.func @matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xbf16(%lhs: tensor<1x1xbf16>, %rhs: tensor<1x1xbf16>, %acc: tensor<1x1xbf16>) -> tensor<1x1xbf16> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1xbf16>, tensor<1x1xbf16>) outs(%acc: tensor<1x1xbf16>) -> tensor<1x1xbf16>
+  return %result: tensor<1x1xbf16>
+}
+
+func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>) -> tensor<?x?xbf16> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xbf16>
+  %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xbf16>
+  %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xbf16>
+  %c0_acc_type = arith.constant 0.0: bf16
+  %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<?x?xbf16>) -> tensor<?x?xbf16>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xbf16>) -> tensor<?x?xbf16>
+  return %result: tensor<?x?xbf16>
+}
+
+func.func @matmul_1x1xbf16_times_1x1xbf16_into_1x1xbf16(%lhs: tensor<1x1xbf16>, %rhs: tensor<1x1xbf16>) -> tensor<1x1xbf16> {
+  %init_acc = tensor.empty() : tensor<1x1xbf16>
+  %c0_acc_type = arith.constant 0.0: bf16
+  %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<1x1xbf16>) -> tensor<1x1xbf16>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1xbf16>, tensor<1x1xbf16>) outs(%acc: tensor<1x1xbf16>) -> tensor<1x1xbf16>
+  return %result: tensor<1x1xbf16>
+}
+
+func.func @matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xbf16(%lhs: tensor<2x2xbf16>, %rhs: tensor<2x2xbf16>, %acc: tensor<2x2xbf16>) -> tensor<2x2xbf16> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<2x2xbf16>, tensor<2x2xbf16>) outs(%acc: tensor<2x2xbf16>) -> tensor<2x2xbf16>
+  return %result: tensor<2x2xbf16>
+}
+
+func.func @matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xbf16(%lhs: tensor<4x4xbf16>, %rhs: tensor<4x4xbf16>, %acc: tensor<4x4xbf16>) -> tensor<4x4xbf16> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<4x4xbf16>, tensor<4x4xbf16>) outs(%acc: tensor<4x4xbf16>) -> tensor<4x4xbf16>
+  return %result: tensor<4x4xbf16>
+}
+
+func.func @matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xbf16(%lhs: tensor<8x8xbf16>, %rhs: tensor<8x8xbf16>, %acc: tensor<8x8xbf16>) -> tensor<8x8xbf16> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<8x8xbf16>, tensor<8x8xbf16>) outs(%acc: tensor<8x8xbf16>) -> tensor<8x8xbf16>
+  return %result: tensor<8x8xbf16>
+}
+
+func.func @matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xbf16(%lhs: tensor<9x9xbf16>, %rhs: tensor<9x9xbf16>, %acc: tensor<9x9xbf16>) -> tensor<9x9xbf16> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<9x9xbf16>, tensor<9x9xbf16>) outs(%acc: tensor<9x9xbf16>) -> tensor<9x9xbf16>
+  return %result: tensor<9x9xbf16>
+}
+
+func.func @matmul_accumulate_6x13xbf16_times_13x3xbf16_into_6x3xbf16(%lhs: tensor<6x13xbf16>, %rhs: tensor<13x3xbf16>, %acc: tensor<6x3xbf16>) -> tensor<6x3xbf16> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<6x13xbf16>, tensor<13x3xbf16>) outs(%acc: tensor<6x3xbf16>) -> tensor<6x3xbf16>
+  return %result: tensor<6x3xbf16>
+}
+
+func.func @matmul_15x37xbf16_times_37x7xbf16_into_15x7xbf16(%lhs: tensor<15x37xbf16>, %rhs: tensor<37x7xbf16>) -> tensor<15x7xbf16> {
+  %init_acc = tensor.empty() : tensor<15x7xbf16>
+  %c0_acc_type = arith.constant 0.0: bf16
+  %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<15x7xbf16>) -> tensor<15x7xbf16>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<15x37xbf16>, tensor<37x7xbf16>) outs(%acc: tensor<15x7xbf16>) -> tensor<15x7xbf16>
+  return %result: tensor<15x7xbf16>
+}
+
+func.func @matmul_accumulate_81x19xbf16_times_19x41xbf16_into_81x41xbf16(%lhs: tensor<81x19xbf16>, %rhs: tensor<19x41xbf16>, %acc: tensor<81x41xbf16>) -> tensor<81x41xbf16> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<81x19xbf16>, tensor<19x41xbf16>) outs(%acc: tensor<81x41xbf16>) -> tensor<81x41xbf16>
+  return %result: tensor<81x41xbf16>
+}
+
+func.func @matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xbf16(%lhs: tensor<1x10xbf16>, %rhs: tensor<10x10xbf16>, %acc: tensor<1x10xbf16>) -> tensor<1x10xbf16> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<1x10xbf16>, tensor<10x10xbf16>) outs(%acc: tensor<1x10xbf16>) -> tensor<1x10xbf16>
+  return %result: tensor<1x10xbf16>
+}
+
+func.func @matmul_1x10xbf16_times_10x10xbf16_into_1x10xbf16(%lhs: tensor<1x10xbf16>, %rhs: tensor<10x10xbf16>) -> tensor<1x10xbf16> {
+  %init_acc = tensor.empty() : tensor<1x10xbf16>
+  %c0_acc_type = arith.constant 0.0: bf16
+  %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<1x10xbf16>) -> tensor<1x10xbf16>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<1x10xbf16>, tensor<10x10xbf16>) outs(%acc: tensor<1x10xbf16>) -> tensor<1x10xbf16>
+  return %result: tensor<1x10xbf16>
+}
+
+func.func @matmul_accumulate_10x1xbf16_times_1x10xbf16_into_10x10xbf16(%lhs: tensor<10x1xbf16>, %rhs: tensor<1x10xbf16>, %acc: tensor<10x10xbf16>) -> tensor<10x10xbf16> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<10x1xbf16>, tensor<1x10xbf16>) outs(%acc: tensor<10x10xbf16>) -> tensor<10x10xbf16>
+  return %result: tensor<10x10xbf16>
+}
+
+func.func @matmul_accumulate_10x10xbf16_times_10x1xbf16_into_10x1xbf16(%lhs: tensor<10x10xbf16>, %rhs: tensor<10x1xbf16>, %acc: tensor<10x1xbf16>) -> tensor<10x1xbf16> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<10x10xbf16>, tensor<10x1xbf16>) outs(%acc: tensor<10x1xbf16>) -> tensor<10x1xbf16>
+  return %result: tensor<10x1xbf16>
+}
+
+func.func @matmul_10x10xbf16_times_10x1xbf16_into_10x1xbf16(%lhs: tensor<10x10xbf16>, %rhs: tensor<10x1xbf16>) -> tensor<10x1xbf16> {
+  %init_acc = tensor.empty() : tensor<10x1xbf16>
+  %c0_acc_type = arith.constant 0.0: bf16
+  %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<10x1xbf16>) -> tensor<10x1xbf16>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<10x10xbf16>, tensor<10x1xbf16>) outs(%acc: tensor<10x1xbf16>) -> tensor<10x1xbf16>
+  return %result: tensor<10x1xbf16>
+}
+
diff --git a/linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_small_calls.mlir b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_small_calls.mlir
new file mode 100644
index 0000000..e577a7c
--- /dev/null
+++ b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_bf16_into_bf16_small_calls.mlir
@@ -0,0 +1,906 @@
+builtin.module @calls attributes {
+
+} {
+
+func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+
+func.func private @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_1x1xbf16_times_1x1xbf16_into_1x1xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_6x13xbf16_times_13x3xbf16_into_6x3xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_15x37xbf16_times_37x7xbf16_into_15x7xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_81x19xbf16_times_19x41xbf16_into_81x41xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_1x10xbf16_times_10x10xbf16_into_1x10xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_10x1xbf16_times_1x10xbf16_into_10x10xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_10x10xbf16_times_10x1xbf16_into_10x1xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_10x10xbf16_times_10x1xbf16_into_10x1xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1_1_1_acc_0() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 1 : i64
+  %lhs_dim1 = arith.constant 1 : i64
+  %lhs_element_type = hal.element_type<bf16> : i32
+  %lhs_seed = arith.constant 2 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 1 : i64
+  %rhs_element_type = hal.element_type<bf16> : i32
+  %rhs_seed = arith.constant 3 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<bf16> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<bf16> : i32
+  %acc_copy_seed = arith.constant 4 : i32
+  %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
@module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xbf16_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1_1_1_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : 
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xbf16_times_1x1xbf16_into_1x1xbf16_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x1xbf16_times_1x1xbf16_into_1x1xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, 
%acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xbf16_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : 
i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xbf16_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_8_8_8_acc_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xbf16_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = 
arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xbf16_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, 
%lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs 
= arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xbf16_times_13x3xbf16_into_6x3xbf16_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_6x13xbf16_times_13x3xbf16_into_6x3xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 
: i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xbf16_times_37x7xbf16_into_15x7xbf16_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_15x37xbf16_times_37x7xbf16_into_15x7xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 48 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : 
(!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xbf16_times_19x41xbf16_into_81x41xbf16_81_19_41_acc_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xbf16_times_19x41xbf16_into_81x41xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1_10_10_acc_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xbf16_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1_10_10_20() attributes { + 
iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xbf16_times_10x10xbf16_into_1x10xbf16_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xbf16_times_10x10xbf16_into_1x10xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = 
hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x1xbf16_times_1x10xbf16_into_10x10xbf16_10_1_10_acc_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x1xbf16_times_1x10xbf16_into_10x10xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, 
i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_10_10_1_acc_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 70 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 70 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x10xbf16_times_10x1xbf16_into_10x1xbf16_10_10_1_acc_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 71 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 72 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 73 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 73 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x10xbf16_times_10x1xbf16_into_10x1xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_10_10_1_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 74 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 75 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_10x10xbf16_times_10x1xbf16_into_10x1xbf16_10_10_1_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 76 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 77 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_10x10xbf16_times_10x1xbf16_into_10x1xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + 
%n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_large.mlir b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_large.mlir new file mode 100644 index 0000000..47d9f91 --- /dev/null +++ b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_large.mlir @@ -0,0 +1,136 @@ +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>, %acc: tensor<?x?xbf16>) -> tensor<?x?xbf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xbf16>) -> tensor<?x?xbf16> + return %result: tensor<?x?xbf16> +} + +func.func @matmul_accumulate_512x128xbf16_times_512x128xbf16_into_512x512xbf16(%lhs: tensor<512x128xbf16>, %rhs: tensor<512x128xbf16>, %acc: tensor<512x512xbf16>) -> tensor<512x512xbf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xbf16>, tensor<512x128xbf16>) outs(%acc: tensor<512x512xbf16>) -> tensor<512x512xbf16> + return %result: tensor<512x512xbf16> +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>) -> tensor<?x?xbf16> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xbf16> + %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xbf16> + %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<?x?xbf16>) -> tensor<?x?xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xbf16>) -> tensor<?x?xbf16> + return %result: tensor<?x?xbf16> +} + +func.func @matmul_512x128xbf16_times_512x128xbf16_into_512x512xbf16(%lhs: tensor<512x128xbf16>, %rhs: tensor<512x128xbf16>) -> tensor<512x512xbf16> { + %init_acc = tensor.empty() : tensor<512x512xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<512x512xbf16>) -> tensor<512x512xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xbf16>, tensor<512x128xbf16>) outs(%acc: tensor<512x512xbf16>) -> tensor<512x512xbf16> + return %result: tensor<512x512xbf16> +} + +func.func @matmul_1000x4xbf16_times_512x4xbf16_into_1000x512xbf16(%lhs: tensor<1000x4xbf16>, %rhs: tensor<512x4xbf16>) -> tensor<1000x512xbf16> { + %init_acc = tensor.empty() : tensor<1000x512xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<1000x512xbf16>) -> tensor<1000x512xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x4xbf16>, tensor<512x4xbf16>) outs(%acc: tensor<1000x512xbf16>) -> tensor<1000x512xbf16> + return %result: tensor<1000x512xbf16> +} + +func.func @matmul_4x1000xbf16_times_512x1000xbf16_into_4x512xbf16(%lhs: tensor<4x1000xbf16>, %rhs: tensor<512x1000xbf16>) -> tensor<4x512xbf16> { + %init_acc = tensor.empty() : tensor<4x512xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<4x512xbf16>) -> tensor<4x512xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<4x1000xbf16>, tensor<512x1000xbf16>) outs(%acc: tensor<4x512xbf16>) -> tensor<4x512xbf16> + return %result: tensor<4x512xbf16> +} + +func.func 
@matmul_512x1000xbf16_times_4x1000xbf16_into_512x4xbf16(%lhs: tensor<512x1000xbf16>, %rhs: tensor<4x1000xbf16>) -> tensor<512x4xbf16> { + %init_acc = tensor.empty() : tensor<512x4xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<512x4xbf16>) -> tensor<512x4xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x1000xbf16>, tensor<4x1000xbf16>) outs(%acc: tensor<512x4xbf16>) -> tensor<512x4xbf16> + return %result: tensor<512x4xbf16> +} + +func.func @matmul_512x128xbf16_times_500x128xbf16_into_512x500xbf16(%lhs: tensor<512x128xbf16>, %rhs: tensor<500x128xbf16>) -> tensor<512x500xbf16> { + %init_acc = tensor.empty() : tensor<512x500xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<512x500xbf16>) -> tensor<512x500xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xbf16>, tensor<500x128xbf16>) outs(%acc: tensor<512x500xbf16>) -> tensor<512x500xbf16> + return %result: tensor<512x500xbf16> +} + +func.func @matmul_457x330xbf16_times_512x330xbf16_into_457x512xbf16(%lhs: tensor<457x330xbf16>, %rhs: tensor<512x330xbf16>) -> tensor<457x512xbf16> { + %init_acc = tensor.empty() : tensor<457x512xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<457x512xbf16>) -> tensor<457x512xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x330xbf16>, tensor<512x330xbf16>) outs(%acc: tensor<457x512xbf16>) -> tensor<457x512xbf16> + return %result: tensor<457x512xbf16> +} + +func.func @matmul_457x330xbf16_times_514x330xbf16_into_457x514xbf16(%lhs: tensor<457x330xbf16>, %rhs: tensor<514x330xbf16>) -> tensor<457x514xbf16> { + %init_acc = tensor.empty() : tensor<457x514xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<457x514xbf16>) -> tensor<457x514xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x330xbf16>, tensor<514x330xbf16>) outs(%acc: tensor<457x514xbf16>) -> tensor<457x514xbf16> + return %result: tensor<457x514xbf16> +} + +func.func @matmul_438x330xbf16_times_514x330xbf16_into_438x514xbf16(%lhs: tensor<438x330xbf16>, %rhs: tensor<514x330xbf16>) -> tensor<438x514xbf16> { + %init_acc = tensor.empty() : tensor<438x514xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<438x514xbf16>) -> tensor<438x514xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<438x330xbf16>, tensor<514x330xbf16>) outs(%acc: tensor<438x514xbf16>) -> tensor<438x514xbf16> + return %result: tensor<438x514xbf16> +} + +func.func @matmul_540x332xbf16_times_516x332xbf16_into_540x516xbf16(%lhs: tensor<540x332xbf16>, %rhs: tensor<516x332xbf16>) -> tensor<540x516xbf16> { + %init_acc = tensor.empty() : tensor<540x516xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<540x516xbf16>) -> tensor<540x516xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<540x332xbf16>, tensor<516x332xbf16>) outs(%acc: tensor<540x516xbf16>) -> tensor<540x516xbf16> + return %result: tensor<540x516xbf16> +} + +func.func @matmul_654x321xbf16_times_234x321xbf16_into_654x234xbf16(%lhs: tensor<654x321xbf16>, %rhs: tensor<234x321xbf16>) -> tensor<654x234xbf16> { + %init_acc = tensor.empty() : tensor<654x234xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = 
linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<654x234xbf16>) -> tensor<654x234xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<654x321xbf16>, tensor<234x321xbf16>) outs(%acc: tensor<654x234xbf16>) -> tensor<654x234xbf16> + return %result: tensor<654x234xbf16> +} + +func.func @matmul_457x160xbf16_times_512x160xbf16_into_457x512xbf16(%lhs: tensor<457x160xbf16>, %rhs: tensor<512x160xbf16>) -> tensor<457x512xbf16> { + %init_acc = tensor.empty() : tensor<457x512xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<457x512xbf16>) -> tensor<457x512xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x160xbf16>, tensor<512x160xbf16>) outs(%acc: tensor<457x512xbf16>) -> tensor<457x512xbf16> + return %result: tensor<457x512xbf16> +} + +func.func @matmul_512x330xbf16_times_512x330xbf16_into_512x512xbf16(%lhs: tensor<512x330xbf16>, %rhs: tensor<512x330xbf16>) -> tensor<512x512xbf16> { + %init_acc = tensor.empty() : tensor<512x512xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<512x512xbf16>) -> tensor<512x512xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x330xbf16>, tensor<512x330xbf16>) outs(%acc: tensor<512x512xbf16>) -> tensor<512x512xbf16> + return %result: tensor<512x512xbf16> +} + +func.func @matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xbf16(%lhs: tensor<1x1000xbf16>, %rhs: tensor<1000x1000xbf16>, %acc: tensor<1x1000xbf16>) -> tensor<1x1000xbf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1000xbf16>, tensor<1000x1000xbf16>) outs(%acc: tensor<1x1000xbf16>) -> tensor<1x1000xbf16> + return %result: tensor<1x1000xbf16> +} + +func.func @matmul_accumulate_1000x1000xbf16_times_1x1000xbf16_into_1000x1xbf16(%lhs: tensor<1000x1000xbf16>, %rhs: tensor<1x1000xbf16>, %acc: tensor<1000x1xbf16>) -> tensor<1000x1xbf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x1000xbf16>, tensor<1x1000xbf16>) outs(%acc: tensor<1000x1xbf16>) -> tensor<1000x1xbf16> + return %result: tensor<1000x1xbf16> +} + +func.func @matmul_1000x1000xbf16_times_1x1000xbf16_into_1000x1xbf16(%lhs: tensor<1000x1000xbf16>, %rhs: tensor<1x1000xbf16>) -> tensor<1000x1xbf16> { + %init_acc = tensor.empty() : tensor<1000x1xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<1000x1xbf16>) -> tensor<1000x1xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x1000xbf16>, tensor<1x1000xbf16>) outs(%acc: tensor<1000x1xbf16>) -> tensor<1000x1xbf16> + return %result: tensor<1000x1xbf16> +} + diff --git a/linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_large_calls.mlir b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_large_calls.mlir new file mode 100644 index 0000000..28b0f2a --- /dev/null +++ b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_large_calls.mlir @@ -0,0 +1,882 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private 
@module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_512x128xbf16_times_512x128xbf16_into_512x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xbf16_times_512x128xbf16_into_512x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x4xbf16_times_512x4xbf16_into_1000x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_4x1000xbf16_times_512x1000xbf16_into_4x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x1000xbf16_times_4x1000xbf16_into_512x4xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xbf16_times_500x128xbf16_into_512x500xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xbf16_times_512x330xbf16_into_457x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xbf16_times_514x330xbf16_into_457x514xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_438x330xbf16_times_514x330xbf16_into_438x514xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_540x332xbf16_times_516x332xbf16_into_540x516xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_654x321xbf16_times_234x321xbf16_into_654x234xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x160xbf16_times_512x160xbf16_into_457x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x330xbf16_times_512x330xbf16_into_512x512xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1000x1000xbf16_times_1x1000xbf16_into_1000x1xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x1000xbf16_times_1x1000xbf16_into_1000x1xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_512_128_512_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + 
%rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xbf16_times_512x128xbf16_into_512x512xbf16_512_128_512_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xbf16_times_512x128xbf16_into_512x512xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, 
%transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xbf16_times_512x128xbf16_into_512x512xbf16_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xbf16_times_512x128xbf16_into_512x512xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + 
%lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xbf16_times_512x4xbf16_into_1000x512xbf16_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xbf16_times_512x4xbf16_into_1000x512xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call 
@module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xbf16_times_512x1000xbf16_into_4x512xbf16_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xbf16_times_512x1000xbf16_into_4x512xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 21 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_512x1000xbf16_times_4x1000xbf16_into_512x4xbf16_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x1000xbf16_times_4x1000xbf16_into_512x4xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xbf16_times_500x128xbf16_into_512x500xbf16_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xbf16_times_500x128xbf16_into_512x500xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xbf16_times_512x330xbf16_into_457x512xbf16_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xbf16_times_512x330xbf16_into_457x512xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 
+ %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xbf16_times_514x330xbf16_into_457x514xbf16_457_330_514_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xbf16_times_514x330xbf16_into_457x514xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device 
= hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xbf16_times_514x330xbf16_into_438x514xbf16_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xbf16_times_514x330xbf16_into_438x514xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = call 
@matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xbf16_times_516x332xbf16_into_540x516xbf16_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xbf16_times_516x332xbf16_into_540x516xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, 
%rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xbf16_times_234x321xbf16_into_654x234xbf16_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xbf16_times_234x321xbf16_into_654x234xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xbf16_times_512x160xbf16_into_457x512xbf16_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = 
arith.constant 50 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xbf16_times_512x160xbf16_into_457x512xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x330xbf16_times_512x330xbf16_into_512x512xbf16_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = 
call @module.matmul_512x330xbf16_times_512x330xbf16_into_512x512xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xbf16_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call 
@matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1000x1000xbf16_times_1x1000xbf16_into_1000x1xbf16_1000_1000_1_acc_29() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xbf16_times_1x1000xbf16_into_1000x1xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1000_1000_1_30() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call 
@matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @matmul_1000x1000xbf16_times_1x1000xbf16_into_1000x1xbf16_1000_1000_1_31() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 1000 : i64
+  %lhs_dim1 = arith.constant 1000 : i64
+  %lhs_element_type = hal.element_type<bf16> : i32
+  %lhs_seed = arith.constant 70 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 1000 : i64
+  %rhs_element_type = hal.element_type<bf16> : i32
+  %rhs_seed = arith.constant 71 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_1000x1000xbf16_times_1x1000xbf16_into_1000x1xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 1000 : i64
+  %k = arith.constant 1000 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
diff --git a/linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_small.mlir b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_small.mlir
new file mode 100644
index 0000000..439259c
--- /dev/null
+++ b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_small.mlir
@@ -0,0 +1,99 @@
+func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>, %acc: tensor<?x?xbf16>) -> tensor<?x?xbf16> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xbf16>) -> tensor<?x?xbf16>
+  return %result: tensor<?x?xbf16>
+}
+
+func.func @matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xbf16(%lhs: tensor<1x1xbf16>, %rhs: tensor<1x1xbf16>, %acc: tensor<1x1xbf16>) -> tensor<1x1xbf16> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1xbf16>, tensor<1x1xbf16>) outs(%acc: tensor<1x1xbf16>) -> tensor<1x1xbf16>
+  return %result: tensor<1x1xbf16>
+}
+
+func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>) -> tensor<?x?xbf16> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xbf16>
+  %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xbf16>
+  %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xbf16>
+  %c0_acc_type = arith.constant 0.0: bf16
+  %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<?x?xbf16>) -> tensor<?x?xbf16>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xbf16>) -> tensor<?x?xbf16>
+  return %result: tensor<?x?xbf16>
+}
+
+func.func @matmul_1x1xbf16_times_1x1xbf16_into_1x1xbf16(%lhs: tensor<1x1xbf16>, %rhs: tensor<1x1xbf16>) -> tensor<1x1xbf16> {
+  %init_acc = tensor.empty() : tensor<1x1xbf16>
+  %c0_acc_type = arith.constant 0.0: bf16
+  %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc :
tensor<1x1xbf16>) -> tensor<1x1xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1xbf16>, tensor<1x1xbf16>) outs(%acc: tensor<1x1xbf16>) -> tensor<1x1xbf16> + return %result: tensor<1x1xbf16> +} + +func.func @matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xbf16(%lhs: tensor<2x2xbf16>, %rhs: tensor<2x2xbf16>, %acc: tensor<2x2xbf16>) -> tensor<2x2xbf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<2x2xbf16>, tensor<2x2xbf16>) outs(%acc: tensor<2x2xbf16>) -> tensor<2x2xbf16> + return %result: tensor<2x2xbf16> +} + +func.func @matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xbf16(%lhs: tensor<4x4xbf16>, %rhs: tensor<4x4xbf16>, %acc: tensor<4x4xbf16>) -> tensor<4x4xbf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<4x4xbf16>, tensor<4x4xbf16>) outs(%acc: tensor<4x4xbf16>) -> tensor<4x4xbf16> + return %result: tensor<4x4xbf16> +} + +func.func @matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xbf16(%lhs: tensor<8x8xbf16>, %rhs: tensor<8x8xbf16>, %acc: tensor<8x8xbf16>) -> tensor<8x8xbf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<8x8xbf16>, tensor<8x8xbf16>) outs(%acc: tensor<8x8xbf16>) -> tensor<8x8xbf16> + return %result: tensor<8x8xbf16> +} + +func.func @matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xbf16(%lhs: tensor<9x9xbf16>, %rhs: tensor<9x9xbf16>, %acc: tensor<9x9xbf16>) -> tensor<9x9xbf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<9x9xbf16>, tensor<9x9xbf16>) outs(%acc: tensor<9x9xbf16>) -> tensor<9x9xbf16> + return %result: tensor<9x9xbf16> +} + +func.func @matmul_accumulate_6x13xbf16_times_3x13xbf16_into_6x3xbf16(%lhs: tensor<6x13xbf16>, %rhs: tensor<3x13xbf16>, %acc: tensor<6x3xbf16>) -> tensor<6x3xbf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<6x13xbf16>, tensor<3x13xbf16>) outs(%acc: tensor<6x3xbf16>) -> tensor<6x3xbf16> + return %result: tensor<6x3xbf16> +} + +func.func @matmul_15x37xbf16_times_7x37xbf16_into_15x7xbf16(%lhs: tensor<15x37xbf16>, %rhs: tensor<7x37xbf16>) -> tensor<15x7xbf16> { + %init_acc = tensor.empty() : tensor<15x7xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<15x7xbf16>) -> tensor<15x7xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<15x37xbf16>, tensor<7x37xbf16>) outs(%acc: tensor<15x7xbf16>) -> tensor<15x7xbf16> + return %result: tensor<15x7xbf16> +} + +func.func @matmul_accumulate_81x19xbf16_times_41x19xbf16_into_81x41xbf16(%lhs: tensor<81x19xbf16>, %rhs: tensor<41x19xbf16>, %acc: tensor<81x41xbf16>) -> tensor<81x41xbf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<81x19xbf16>, tensor<41x19xbf16>) outs(%acc: tensor<81x41xbf16>) -> tensor<81x41xbf16> + return %result: tensor<81x41xbf16> +} + +func.func @matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xbf16(%lhs: tensor<1x10xbf16>, %rhs: tensor<10x10xbf16>, %acc: tensor<1x10xbf16>) -> tensor<1x10xbf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x10xbf16>, tensor<10x10xbf16>) outs(%acc: tensor<1x10xbf16>) -> tensor<1x10xbf16> + return %result: tensor<1x10xbf16> +} + +func.func @matmul_1x10xbf16_times_10x10xbf16_into_1x10xbf16(%lhs: tensor<1x10xbf16>, %rhs: tensor<10x10xbf16>) -> tensor<1x10xbf16> { + %init_acc = tensor.empty() : tensor<1x10xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<1x10xbf16>) -> tensor<1x10xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: 
tensor<1x10xbf16>, tensor<10x10xbf16>) outs(%acc: tensor<1x10xbf16>) -> tensor<1x10xbf16> + return %result: tensor<1x10xbf16> +} + +func.func @matmul_accumulate_10x1xbf16_times_10x1xbf16_into_10x10xbf16(%lhs: tensor<10x1xbf16>, %rhs: tensor<10x1xbf16>, %acc: tensor<10x10xbf16>) -> tensor<10x10xbf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x1xbf16>, tensor<10x1xbf16>) outs(%acc: tensor<10x10xbf16>) -> tensor<10x10xbf16> + return %result: tensor<10x10xbf16> +} + +func.func @matmul_accumulate_10x10xbf16_times_1x10xbf16_into_10x1xbf16(%lhs: tensor<10x10xbf16>, %rhs: tensor<1x10xbf16>, %acc: tensor<10x1xbf16>) -> tensor<10x1xbf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x10xbf16>, tensor<1x10xbf16>) outs(%acc: tensor<10x1xbf16>) -> tensor<10x1xbf16> + return %result: tensor<10x1xbf16> +} + +func.func @matmul_10x10xbf16_times_1x10xbf16_into_10x1xbf16(%lhs: tensor<10x10xbf16>, %rhs: tensor<1x10xbf16>) -> tensor<10x1xbf16> { + %init_acc = tensor.empty() : tensor<10x1xbf16> + %c0_acc_type = arith.constant 0.0: bf16 + %acc = linalg.fill ins(%c0_acc_type : bf16) outs(%init_acc : tensor<10x1xbf16>) -> tensor<10x1xbf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x10xbf16>, tensor<1x10xbf16>) outs(%acc: tensor<10x1xbf16>) -> tensor<10x1xbf16> + return %result: tensor<10x1xbf16> +} + diff --git a/linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_small_calls.mlir b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_small_calls.mlir new file mode 100644 index 0000000..b06ab1f --- /dev/null +++ b/linalg_ops/matmul/generated/bf16_into_bf16/matmul_transpose_b_bf16_into_bf16_small_calls.mlir @@ -0,0 +1,906 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x1xbf16_times_1x1xbf16_into_1x1xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private 
@module.matmul_accumulate_6x13xbf16_times_3x13xbf16_into_6x3xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_15x37xbf16_times_7x37xbf16_into_15x7xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_81x19xbf16_times_41x19xbf16_into_81x41xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x10xbf16_times_10x10xbf16_into_1x10xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x1xbf16_times_10x1xbf16_into_10x10xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x10xbf16_times_1x10xbf16_into_10x1xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_10x10xbf16_times_1x10xbf16_into_10x1xbf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1_1_1_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xbf16_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1_1_1_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xbf16_times_1x1xbf16_into_1x1xbf16_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x1xbf16_times_1x1xbf16_into_1x1xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : 
(!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xbf16_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = 
arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xbf16_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_8_8_8_acc_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, 
%lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xbf16_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 
: i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xbf16_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call 
@matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xbf16_times_3x13xbf16_into_6x3xbf16_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : 
i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_6x13xbf16_times_3x13xbf16_into_6x3xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xbf16_times_7x37xbf16_into_15x7xbf16_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = 
hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_15x37xbf16_times_7x37xbf16_into_15x7xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 48 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xbf16_times_41x19xbf16_into_81x41xbf16_81_19_41_acc_17() attributes { + 
iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xbf16_times_41x19xbf16_into_81x41xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1_10_10_acc_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call 
@matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xbf16_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_1_10_10_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = 
arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xbf16_times_10x10xbf16_into_1x10xbf16_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xbf16_times_10x10xbf16_into_1x10xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x1xbf16_times_10x1xbf16_into_10x10xbf16_10_1_10_acc_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x1xbf16_times_10x1xbf16_into_10x10xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_10_10_1_acc_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = 
call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 70 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 70 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x10xbf16_times_1x10xbf16_into_10x1xbf16_10_10_1_acc_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 71 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 72 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 73 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 73 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x10xbf16_times_1x10xbf16_into_10x1xbf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = 
arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16_10_10_1_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 74 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 75 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_10x10xbf16_times_1x10xbf16_into_10x1xbf16_10_10_1_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 76 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 77 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_10x10xbf16_times_1x10xbf16_into_10x1xbf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_large.mlir b/linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_large.mlir new file mode 100644 index 0000000..4753c6a --- /dev/null +++ 
b/linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_large.mlir @@ -0,0 +1,136 @@ +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_accumulate_512x128xbf16_times_128x512xbf16_into_512x512xf32(%lhs: tensor<512x128xbf16>, %rhs: tensor<128x512xbf16>, %acc: tensor<512x512xf32>) -> tensor<512x512xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xbf16>, tensor<128x512xbf16>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>) -> tensor<?x?xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xbf16> + %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xbf16> + %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_512x128xbf16_times_128x512xbf16_into_512x512xf32(%lhs: tensor<512x128xbf16>, %rhs: tensor<128x512xbf16>) -> tensor<512x512xf32> { + %init_acc = tensor.empty() : tensor<512x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xbf16>, tensor<128x512xbf16>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_1000x4xbf16_times_4x512xbf16_into_1000x512xf32(%lhs: tensor<1000x4xbf16>, %rhs: tensor<4x512xbf16>) -> tensor<1000x512xf32> { + %init_acc = tensor.empty() : tensor<1000x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x512xf32>) -> tensor<1000x512xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x4xbf16>, tensor<4x512xbf16>) outs(%acc: tensor<1000x512xf32>) -> tensor<1000x512xf32> + return %result: tensor<1000x512xf32> +} + +func.func @matmul_4x1000xbf16_times_1000x512xbf16_into_4x512xf32(%lhs: tensor<4x1000xbf16>, %rhs: tensor<1000x512xbf16>) -> tensor<4x512xf32> { + %init_acc = tensor.empty() : tensor<4x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<4x512xf32>) -> tensor<4x512xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<4x1000xbf16>, tensor<1000x512xbf16>) outs(%acc: tensor<4x512xf32>) -> tensor<4x512xf32> + return %result: tensor<4x512xf32> +} + +func.func @matmul_512x1000xbf16_times_1000x4xbf16_into_512x4xf32(%lhs: tensor<512x1000xbf16>, %rhs: tensor<1000x4xbf16>) -> tensor<512x4xf32> { + %init_acc = tensor.empty() : tensor<512x4xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x4xf32>) -> tensor<512x4xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x1000xbf16>, tensor<1000x4xbf16>) outs(%acc: tensor<512x4xf32>) -> tensor<512x4xf32> + return %result: tensor<512x4xf32> +} + +func.func @matmul_512x128xbf16_times_128x500xbf16_into_512x500xf32(%lhs: tensor<512x128xbf16>, %rhs: tensor<128x500xbf16>) -> tensor<512x500xf32> { + %init_acc = tensor.empty() : 
tensor<512x500xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x500xf32>) -> tensor<512x500xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xbf16>, tensor<128x500xbf16>) outs(%acc: tensor<512x500xf32>) -> tensor<512x500xf32> + return %result: tensor<512x500xf32> +} + +func.func @matmul_457x330xbf16_times_330x512xbf16_into_457x512xf32(%lhs: tensor<457x330xbf16>, %rhs: tensor<330x512xbf16>) -> tensor<457x512xf32> { + %init_acc = tensor.empty() : tensor<457x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x330xbf16>, tensor<330x512xbf16>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32> + return %result: tensor<457x512xf32> +} + +func.func @matmul_457x330xbf16_times_330x514xbf16_into_457x514xf32(%lhs: tensor<457x330xbf16>, %rhs: tensor<330x514xbf16>) -> tensor<457x514xf32> { + %init_acc = tensor.empty() : tensor<457x514xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x514xf32>) -> tensor<457x514xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x330xbf16>, tensor<330x514xbf16>) outs(%acc: tensor<457x514xf32>) -> tensor<457x514xf32> + return %result: tensor<457x514xf32> +} + +func.func @matmul_438x330xbf16_times_330x514xbf16_into_438x514xf32(%lhs: tensor<438x330xbf16>, %rhs: tensor<330x514xbf16>) -> tensor<438x514xf32> { + %init_acc = tensor.empty() : tensor<438x514xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<438x514xf32>) -> tensor<438x514xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<438x330xbf16>, tensor<330x514xbf16>) outs(%acc: tensor<438x514xf32>) -> tensor<438x514xf32> + return %result: tensor<438x514xf32> +} + +func.func @matmul_540x332xbf16_times_332x516xbf16_into_540x516xf32(%lhs: tensor<540x332xbf16>, %rhs: tensor<332x516xbf16>) -> tensor<540x516xf32> { + %init_acc = tensor.empty() : tensor<540x516xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<540x516xf32>) -> tensor<540x516xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<540x332xbf16>, tensor<332x516xbf16>) outs(%acc: tensor<540x516xf32>) -> tensor<540x516xf32> + return %result: tensor<540x516xf32> +} + +func.func @matmul_654x321xbf16_times_321x234xbf16_into_654x234xf32(%lhs: tensor<654x321xbf16>, %rhs: tensor<321x234xbf16>) -> tensor<654x234xf32> { + %init_acc = tensor.empty() : tensor<654x234xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<654x234xf32>) -> tensor<654x234xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<654x321xbf16>, tensor<321x234xbf16>) outs(%acc: tensor<654x234xf32>) -> tensor<654x234xf32> + return %result: tensor<654x234xf32> +} + +func.func @matmul_457x160xbf16_times_160x512xbf16_into_457x512xf32(%lhs: tensor<457x160xbf16>, %rhs: tensor<160x512xbf16>) -> tensor<457x512xf32> { + %init_acc = tensor.empty() : tensor<457x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x160xbf16>, tensor<160x512xbf16>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32> + return %result: tensor<457x512xf32> +} + +func.func 
@matmul_512x330xbf16_times_330x512xbf16_into_512x512xf32(%lhs: tensor<512x330xbf16>, %rhs: tensor<330x512xbf16>) -> tensor<512x512xf32> { + %init_acc = tensor.empty() : tensor<512x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x330xbf16>, tensor<330x512xbf16>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xf32(%lhs: tensor<1x1000xbf16>, %rhs: tensor<1000x1000xbf16>, %acc: tensor<1x1000xf32>) -> tensor<1x1000xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1000xbf16>, tensor<1000x1000xbf16>) outs(%acc: tensor<1x1000xf32>) -> tensor<1x1000xf32> + return %result: tensor<1x1000xf32> +} + +func.func @matmul_accumulate_1000x1000xbf16_times_1000x1xbf16_into_1000x1xf32(%lhs: tensor<1000x1000xbf16>, %rhs: tensor<1000x1xbf16>, %acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x1000xbf16>, tensor<1000x1xbf16>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> + return %result: tensor<1000x1xf32> +} + +func.func @matmul_1000x1000xbf16_times_1000x1xbf16_into_1000x1xf32(%lhs: tensor<1000x1000xbf16>, %rhs: tensor<1000x1xbf16>) -> tensor<1000x1xf32> { + %init_acc = tensor.empty() : tensor<1000x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x1xf32>) -> tensor<1000x1xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x1000xbf16>, tensor<1000x1xbf16>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> + return %result: tensor<1000x1xf32> +} + diff --git a/linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_large_calls.mlir b/linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_large_calls.mlir new file mode 100644 index 0000000..e21103c --- /dev/null +++ b/linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_large_calls.mlir @@ -0,0 +1,882 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_512x128xbf16_times_128x512xbf16_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xbf16_times_128x512xbf16_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x4xbf16_times_4x512xbf16_into_1000x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_4x1000xbf16_times_1000x512xbf16_into_4x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private 
@module.matmul_512x1000xbf16_times_1000x4xbf16_into_512x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xbf16_times_128x500xbf16_into_512x500xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xbf16_times_330x512xbf16_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xbf16_times_330x514xbf16_into_457x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_438x330xbf16_times_330x514xbf16_into_438x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_540x332xbf16_times_332x516xbf16_into_540x516xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_654x321xbf16_times_321x234xbf16_into_654x234xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x160xbf16_times_160x512xbf16_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x330xbf16_times_330x512xbf16_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1000x1000xbf16_times_1000x1xbf16_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x1000xbf16_times_1000x1xbf16_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_512_128_512_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call 
@module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xbf16_times_128x512xbf16_into_512x512xf32_512_128_512_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xbf16_times_128x512xbf16_into_512x512xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call 
@matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xbf16_times_128x512xbf16_into_512x512xf32_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xbf16_times_128x512xbf16_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, 
%result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xbf16_times_4x512xbf16_into_1000x512xf32_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xbf16_times_4x512xbf16_into_1000x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xbf16_times_1000x512xbf16_into_4x512xf32_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xbf16_times_1000x512xbf16_into_4x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 21 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x1000xbf16_times_1000x4xbf16_into_512x4xf32_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call 
@module.matmul_512x1000xbf16_times_1000x4xbf16_into_512x4xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xbf16_times_128x500xbf16_into_512x500xf32_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xbf16_times_128x500xbf16_into_512x500xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xbf16_times_330x512xbf16_into_457x512xf32_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xbf16_times_330x512xbf16_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) 
-> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xbf16_times_330x514xbf16_into_457x514xf32_457_330_514_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xbf16_times_330x514xbf16_into_457x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + 
%k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xbf16_times_330x514xbf16_into_438x514xf32_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xbf16_times_330x514xbf16_into_438x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xbf16_times_332x516xbf16_into_540x516xf32_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = 
hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xbf16_times_332x516xbf16_into_540x516xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xbf16_times_321x234xbf16_into_654x234xf32_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call 
@matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xbf16_times_321x234xbf16_into_654x234xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xbf16_times_160x512xbf16_into_457x512xf32_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 50 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xbf16_times_160x512xbf16_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, 
%acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x330xbf16_times_330x512xbf16_into_512x512xf32_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x330xbf16_times_330x512xbf16_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = 
arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xf32_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xf32(%lhs, %rhs, %acc_copy) : 
(!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1000x1000xbf16_times_1000x1xbf16_into_1000x1xf32_1000_1000_1_acc_29() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : 
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xbf16_times_1000x1xbf16_into_1000x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1000_1000_1_30() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x1000xbf16_times_1000x1xbf16_into_1000x1xf32_1000_1000_1_31() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 70 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = 
arith.constant 71 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x1000xbf16_times_1000x1xbf16_into_1000x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_small.mlir b/linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_small.mlir new file mode 100644 index 0000000..44378a6 --- /dev/null +++ b/linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_small.mlir @@ -0,0 +1,99 @@ +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xf32(%lhs: tensor<1x1xbf16>, %rhs: tensor<1x1xbf16>, %acc: tensor<1x1xf32>) -> tensor<1x1xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1xbf16>, tensor<1x1xbf16>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32> + return %result: tensor<1x1xf32> +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>) -> tensor<?x?xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xbf16> + %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xbf16> + %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_1x1xbf16_times_1x1xbf16_into_1x1xf32(%lhs: tensor<1x1xbf16>, %rhs: tensor<1x1xbf16>) -> tensor<1x1xf32> { + %init_acc = tensor.empty() : tensor<1x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x1xf32>) -> tensor<1x1xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1xbf16>, tensor<1x1xbf16>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32> + return %result: tensor<1x1xf32> +} + +func.func @matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xf32(%lhs: tensor<2x2xbf16>, %rhs: tensor<2x2xbf16>, %acc: tensor<2x2xf32>) -> tensor<2x2xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<2x2xbf16>, tensor<2x2xbf16>) outs(%acc: tensor<2x2xf32>) -> tensor<2x2xf32> + return %result: tensor<2x2xf32> +} + +func.func @matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xf32(%lhs: tensor<4x4xbf16>, %rhs: tensor<4x4xbf16>, %acc: tensor<4x4xf32>) -> tensor<4x4xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<4x4xbf16>, tensor<4x4xbf16>) outs(%acc: tensor<4x4xf32>) -> tensor<4x4xf32> + return %result: tensor<4x4xf32> +} + +func.func @matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xf32(%lhs: tensor<8x8xbf16>, %rhs: tensor<8x8xbf16>, %acc: tensor<8x8xf32>) -> tensor<8x8xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<8x8xbf16>,
tensor<8x8xbf16>) outs(%acc: tensor<8x8xf32>) -> tensor<8x8xf32> + return %result: tensor<8x8xf32> +} + +func.func @matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xf32(%lhs: tensor<9x9xbf16>, %rhs: tensor<9x9xbf16>, %acc: tensor<9x9xf32>) -> tensor<9x9xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<9x9xbf16>, tensor<9x9xbf16>) outs(%acc: tensor<9x9xf32>) -> tensor<9x9xf32> + return %result: tensor<9x9xf32> +} + +func.func @matmul_accumulate_6x13xbf16_times_13x3xbf16_into_6x3xf32(%lhs: tensor<6x13xbf16>, %rhs: tensor<13x3xbf16>, %acc: tensor<6x3xf32>) -> tensor<6x3xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<6x13xbf16>, tensor<13x3xbf16>) outs(%acc: tensor<6x3xf32>) -> tensor<6x3xf32> + return %result: tensor<6x3xf32> +} + +func.func @matmul_15x37xbf16_times_37x7xbf16_into_15x7xf32(%lhs: tensor<15x37xbf16>, %rhs: tensor<37x7xbf16>) -> tensor<15x7xf32> { + %init_acc = tensor.empty() : tensor<15x7xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<15x7xf32>) -> tensor<15x7xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<15x37xbf16>, tensor<37x7xbf16>) outs(%acc: tensor<15x7xf32>) -> tensor<15x7xf32> + return %result: tensor<15x7xf32> +} + +func.func @matmul_accumulate_81x19xbf16_times_19x41xbf16_into_81x41xf32(%lhs: tensor<81x19xbf16>, %rhs: tensor<19x41xbf16>, %acc: tensor<81x41xf32>) -> tensor<81x41xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<81x19xbf16>, tensor<19x41xbf16>) outs(%acc: tensor<81x41xf32>) -> tensor<81x41xf32> + return %result: tensor<81x41xf32> +} + +func.func @matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xf32(%lhs: tensor<1x10xbf16>, %rhs: tensor<10x10xbf16>, %acc: tensor<1x10xf32>) -> tensor<1x10xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x10xbf16>, tensor<10x10xbf16>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32> + return %result: tensor<1x10xf32> +} + +func.func @matmul_1x10xbf16_times_10x10xbf16_into_1x10xf32(%lhs: tensor<1x10xbf16>, %rhs: tensor<10x10xbf16>) -> tensor<1x10xf32> { + %init_acc = tensor.empty() : tensor<1x10xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x10xf32>) -> tensor<1x10xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x10xbf16>, tensor<10x10xbf16>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32> + return %result: tensor<1x10xf32> +} + +func.func @matmul_accumulate_10x1xbf16_times_1x10xbf16_into_10x10xf32(%lhs: tensor<10x1xbf16>, %rhs: tensor<1x10xbf16>, %acc: tensor<10x10xf32>) -> tensor<10x10xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x1xbf16>, tensor<1x10xbf16>) outs(%acc: tensor<10x10xf32>) -> tensor<10x10xf32> + return %result: tensor<10x10xf32> +} + +func.func @matmul_accumulate_10x10xbf16_times_10x1xbf16_into_10x1xf32(%lhs: tensor<10x10xbf16>, %rhs: tensor<10x1xbf16>, %acc: tensor<10x1xf32>) -> tensor<10x1xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x10xbf16>, tensor<10x1xbf16>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32> + return %result: tensor<10x1xf32> +} + +func.func @matmul_10x10xbf16_times_10x1xbf16_into_10x1xf32(%lhs: tensor<10x10xbf16>, %rhs: tensor<10x1xbf16>) -> tensor<10x1xf32> { + %init_acc = tensor.empty() : tensor<10x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<10x1xf32>) -> tensor<10x1xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x10xbf16>, tensor<10x1xbf16>) outs(%acc: tensor<10x1xf32>) -> 
tensor<10x1xf32> + return %result: tensor<10x1xf32> +} + diff --git a/linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_small_calls.mlir b/linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_small_calls.mlir new file mode 100644 index 0000000..2e6303c --- /dev/null +++ b/linalg_ops/matmul/generated/bf16_into_f32/matmul_bf16_into_f32_small_calls.mlir @@ -0,0 +1,906 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x1xbf16_times_1x1xbf16_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_6x13xbf16_times_13x3xbf16_into_6x3xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_15x37xbf16_times_37x7xbf16_into_15x7xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_81x19xbf16_times_19x41xbf16_into_81x41xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x10xbf16_times_10x10xbf16_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x1xbf16_times_1x10xbf16_into_10x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x10xbf16_times_10x1xbf16_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_10x10xbf16_times_10x1xbf16_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func 
@matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1_1_1_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xf32_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = 
arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1_1_1_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xbf16_times_1x1xbf16_into_1x1xf32_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x1xbf16_times_1x1xbf16_into_1x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, 
i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xf32_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : 
i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xf32_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, 
%lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_8_8_8_acc_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + 
%transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xf32_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call 
@matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xf32_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + 
%lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xbf16_times_13x3xbf16_into_6x3xf32_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call 
@module.matmul_accumulate_6x13xbf16_times_13x3xbf16_into_6x3xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xbf16_times_37x7xbf16_into_15x7xf32_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_15x37xbf16_times_37x7xbf16_into_15x7xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 48 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xbf16_times_19x41xbf16_into_81x41xf32_81_19_41_acc_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = 
hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xbf16_times_19x41xbf16_into_81x41xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1_10_10_acc_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xf32_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : 
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1_10_10_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xbf16_times_10x10xbf16_into_1x10xf32_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xbf16_times_10x10xbf16_into_1x10xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x1xbf16_times_1x10xbf16_into_10x10xf32_10_1_10_acc_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 
1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x1xbf16_times_1x10xbf16_into_10x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_10_10_1_acc_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 70 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 70 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call 
@module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x10xbf16_times_10x1xbf16_into_10x1xf32_10_10_1_acc_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 71 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 72 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 73 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 73 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x10xbf16_times_10x1xbf16_into_10x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_10_10_1_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 74 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 75 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, 
%rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 10 : i64
+  %k = arith.constant 10 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 0 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @matmul_10x10xbf16_times_10x1xbf16_into_10x1xf32_10_10_1_27() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 10 : i64
+  %lhs_dim1 = arith.constant 10 : i64
+  %lhs_element_type = hal.element_type : i32
+  %lhs_seed = arith.constant 76 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 10 : i64
+  %rhs_dim1 = arith.constant 1 : i64
+  %rhs_element_type = hal.element_type : i32
+  %rhs_seed = arith.constant 77 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_10x10xbf16_times_10x1xbf16_into_10x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 10 : i64
+  %k = arith.constant 10 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 0 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
diff --git a/linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_large.mlir b/linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_large.mlir
new file mode 100644
index 0000000..980ae69
--- /dev/null
+++ b/linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_large.mlir
@@ -0,0 +1,136 @@
+func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_accumulate_512x128xbf16_times_512x128xbf16_into_512x512xf32(%lhs: tensor<512x128xbf16>, %rhs: tensor<512x128xbf16>, %acc: tensor<512x512xf32>) -> tensor<512x512xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xbf16>, tensor<512x128xbf16>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32>
+  return %result: tensor<512x512xf32>
+}
+
+func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xbf16>
+  %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xbf16>
+  %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_512x128xbf16_times_512x128xbf16_into_512x512xf32(%lhs: tensor<512x128xbf16>, %rhs: tensor<512x128xbf16>) -> tensor<512x512xf32> {
+  %init_acc = tensor.empty() : tensor<512x512xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xbf16>, tensor<512x128xbf16>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32>
+  return %result: tensor<512x512xf32>
+}
+
+func.func @matmul_1000x4xbf16_times_512x4xbf16_into_1000x512xf32(%lhs: tensor<1000x4xbf16>, %rhs: tensor<512x4xbf16>) -> tensor<1000x512xf32> {
+  %init_acc = tensor.empty() : tensor<1000x512xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x512xf32>) -> tensor<1000x512xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x4xbf16>, tensor<512x4xbf16>) outs(%acc: tensor<1000x512xf32>) -> tensor<1000x512xf32>
+  return %result: tensor<1000x512xf32>
+}
+
+func.func @matmul_4x1000xbf16_times_512x1000xbf16_into_4x512xf32(%lhs: tensor<4x1000xbf16>, %rhs: tensor<512x1000xbf16>) -> tensor<4x512xf32> {
+  %init_acc = tensor.empty() : tensor<4x512xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<4x512xf32>) -> tensor<4x512xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<4x1000xbf16>, tensor<512x1000xbf16>) outs(%acc: tensor<4x512xf32>) -> tensor<4x512xf32>
+  return %result: tensor<4x512xf32>
+}
+
+func.func @matmul_512x1000xbf16_times_4x1000xbf16_into_512x4xf32(%lhs: tensor<512x1000xbf16>, %rhs: tensor<4x1000xbf16>) -> tensor<512x4xf32> {
+  %init_acc = tensor.empty() : tensor<512x4xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x4xf32>) -> tensor<512x4xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x1000xbf16>, tensor<4x1000xbf16>) outs(%acc: tensor<512x4xf32>) -> tensor<512x4xf32>
+  return %result: tensor<512x4xf32>
+}
+
+func.func @matmul_512x128xbf16_times_500x128xbf16_into_512x500xf32(%lhs: tensor<512x128xbf16>, %rhs: tensor<500x128xbf16>) -> tensor<512x500xf32> {
+  %init_acc = tensor.empty() : tensor<512x500xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x500xf32>) -> tensor<512x500xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xbf16>, tensor<500x128xbf16>) outs(%acc: tensor<512x500xf32>) -> tensor<512x500xf32>
+  return %result: tensor<512x500xf32>
+}
+
+func.func @matmul_457x330xbf16_times_512x330xbf16_into_457x512xf32(%lhs: tensor<457x330xbf16>, %rhs: tensor<512x330xbf16>) -> tensor<457x512xf32> {
+  %init_acc = tensor.empty() : tensor<457x512xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x330xbf16>, tensor<512x330xbf16>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32>
+  return %result: tensor<457x512xf32>
+}
+
+func.func @matmul_457x330xbf16_times_514x330xbf16_into_457x514xf32(%lhs: tensor<457x330xbf16>, %rhs: tensor<514x330xbf16>) -> tensor<457x514xf32> {
+  %init_acc = tensor.empty() : tensor<457x514xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x514xf32>) -> tensor<457x514xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x330xbf16>, tensor<514x330xbf16>) outs(%acc: tensor<457x514xf32>) -> tensor<457x514xf32>
+  return %result: tensor<457x514xf32>
+}
+
+func.func @matmul_438x330xbf16_times_514x330xbf16_into_438x514xf32(%lhs: tensor<438x330xbf16>, %rhs: tensor<514x330xbf16>) -> tensor<438x514xf32> {
+  %init_acc = tensor.empty() : tensor<438x514xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<438x514xf32>) -> tensor<438x514xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<438x330xbf16>, tensor<514x330xbf16>) outs(%acc: tensor<438x514xf32>) -> tensor<438x514xf32>
+  return %result: tensor<438x514xf32>
+}
+
+func.func @matmul_540x332xbf16_times_516x332xbf16_into_540x516xf32(%lhs: tensor<540x332xbf16>, %rhs: tensor<516x332xbf16>) -> tensor<540x516xf32> {
+  %init_acc = tensor.empty() : tensor<540x516xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<540x516xf32>) -> tensor<540x516xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<540x332xbf16>, tensor<516x332xbf16>) outs(%acc: tensor<540x516xf32>) -> tensor<540x516xf32>
+  return %result: tensor<540x516xf32>
+}
+
+func.func @matmul_654x321xbf16_times_234x321xbf16_into_654x234xf32(%lhs: tensor<654x321xbf16>, %rhs: tensor<234x321xbf16>) -> tensor<654x234xf32> {
+  %init_acc = tensor.empty() : tensor<654x234xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<654x234xf32>) -> tensor<654x234xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<654x321xbf16>, tensor<234x321xbf16>) outs(%acc: tensor<654x234xf32>) -> tensor<654x234xf32>
+  return %result: tensor<654x234xf32>
+}
+
+func.func @matmul_457x160xbf16_times_512x160xbf16_into_457x512xf32(%lhs: tensor<457x160xbf16>, %rhs: tensor<512x160xbf16>) -> tensor<457x512xf32> {
+  %init_acc = tensor.empty() : tensor<457x512xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x160xbf16>, tensor<512x160xbf16>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32>
+  return %result: tensor<457x512xf32>
+}
+
+func.func @matmul_512x330xbf16_times_512x330xbf16_into_512x512xf32(%lhs: tensor<512x330xbf16>, %rhs: tensor<512x330xbf16>) -> tensor<512x512xf32> {
+  %init_acc = tensor.empty() : tensor<512x512xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x330xbf16>, tensor<512x330xbf16>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32>
+  return %result: tensor<512x512xf32>
+}
+
+func.func @matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xf32(%lhs: tensor<1x1000xbf16>, %rhs: tensor<1000x1000xbf16>, %acc: tensor<1x1000xf32>) -> tensor<1x1000xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1000xbf16>, tensor<1000x1000xbf16>) outs(%acc: tensor<1x1000xf32>) -> tensor<1x1000xf32>
+  return %result: tensor<1x1000xf32>
+}
+
+func.func @matmul_accumulate_1000x1000xbf16_times_1x1000xbf16_into_1000x1xf32(%lhs: tensor<1000x1000xbf16>, %rhs: tensor<1x1000xbf16>, %acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x1000xbf16>, tensor<1x1000xbf16>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32>
+  return %result: tensor<1000x1xf32>
+}
+
+func.func @matmul_1000x1000xbf16_times_1x1000xbf16_into_1000x1xf32(%lhs: tensor<1000x1000xbf16>, %rhs: tensor<1x1000xbf16>) -> tensor<1000x1xf32> {
+  %init_acc = tensor.empty() : tensor<1000x1xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x1xf32>) -> tensor<1000x1xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x1000xbf16>, tensor<1x1000xbf16>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32>
+  return %result: tensor<1000x1xf32>
+}
+
diff --git a/linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_large_calls.mlir b/linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_large_calls.mlir
new file mode 100644
index 0000000..0aed8ea
--- /dev/null
+++ b/linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_large_calls.mlir
@@ -0,0 +1,882 @@
+builtin.module @calls attributes {
+
+} {
+
+func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+
+func.func private @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_512x128xbf16_times_512x128xbf16_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_512x128xbf16_times_512x128xbf16_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_1000x4xbf16_times_512x4xbf16_into_1000x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_4x1000xbf16_times_512x1000xbf16_into_4x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_512x1000xbf16_times_4x1000xbf16_into_512x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_512x128xbf16_times_500x128xbf16_into_512x500xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_457x330xbf16_times_512x330xbf16_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_457x330xbf16_times_514x330xbf16_into_457x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_438x330xbf16_times_514x330xbf16_into_438x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_540x332xbf16_times_516x332xbf16_into_540x516xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private
@module.matmul_654x321xbf16_times_234x321xbf16_into_654x234xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x160xbf16_times_512x160xbf16_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x330xbf16_times_512x330xbf16_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1000x1000xbf16_times_1x1000xbf16_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x1000xbf16_times_1x1000xbf16_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_512_128_512_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xbf16_times_512x128xbf16_into_512x512xf32_512_128_512_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = 
hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xbf16_times_512x128xbf16_into_512x512xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xbf16_times_512x128xbf16_into_512x512xf32_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get 
%device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xbf16_times_512x128xbf16_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xbf16_times_512x4xbf16_into_1000x512xf32_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, 
%rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xbf16_times_512x4xbf16_into_1000x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xbf16_times_512x1000xbf16_into_4x512xf32_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xbf16_times_512x1000xbf16_into_4x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 21 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x1000xbf16_times_4x1000xbf16_into_512x4xf32_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x1000xbf16_times_4x1000xbf16_into_512x4xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, 
%lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xbf16_times_500x128xbf16_into_512x500xf32_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xbf16_times_500x128xbf16_into_512x500xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : 
(!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xbf16_times_512x330xbf16_into_457x512xf32_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xbf16_times_512x330xbf16_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xbf16_times_514x330xbf16_into_457x514xf32_457_330_514_15() attributes { + iree.reflection = {description = "Matmul 
shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xbf16_times_514x330xbf16_into_457x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xbf16_times_514x330xbf16_into_438x514xf32_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = 
hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xbf16_times_514x330xbf16_into_438x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xbf16_times_516x332xbf16_into_540x516xf32_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xbf16_times_516x332xbf16_into_540x516xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call 
@matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xbf16_times_234x321xbf16_into_654x234xf32_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xbf16_times_234x321xbf16_into_654x234xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 
: i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xbf16_times_512x160xbf16_into_457x512xf32_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 50 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xbf16_times_512x160xbf16_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x330xbf16_times_512x330xbf16_into_512x512xf32_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x330xbf16_times_512x330xbf16_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call 
@matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xf32_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xbf16_times_1000x1000xbf16_into_1x1000xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1000x1000xbf16_times_1x1000xbf16_into_1000x1xf32_1000_1000_1_acc_29() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xbf16_times_1x1000xbf16_into_1000x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + 
%transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1000_1000_1_30() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x1000xbf16_times_1x1000xbf16_into_1000x1xf32_1000_1000_1_31() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 70 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 71 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x1000xbf16_times_1x1000xbf16_into_1000x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_small.mlir b/linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_small.mlir new file mode 100644 index 0000000..f867453 --- /dev/null +++ 
b/linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_small.mlir
@@ -0,0 +1,99 @@
+func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xf32(%lhs: tensor<1x1xbf16>, %rhs: tensor<1x1xbf16>, %acc: tensor<1x1xf32>) -> tensor<1x1xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1xbf16>, tensor<1x1xbf16>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32>
+  return %result: tensor<1x1xf32>
+}
+
+func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: tensor<?x?xbf16>, %rhs: tensor<?x?xbf16>) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xbf16>
+  %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xbf16>
+  %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xbf16>, tensor<?x?xbf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_1x1xbf16_times_1x1xbf16_into_1x1xf32(%lhs: tensor<1x1xbf16>, %rhs: tensor<1x1xbf16>) -> tensor<1x1xf32> {
+  %init_acc = tensor.empty() : tensor<1x1xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x1xf32>) -> tensor<1x1xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1xbf16>, tensor<1x1xbf16>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32>
+  return %result: tensor<1x1xf32>
+}
+
+func.func @matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xf32(%lhs: tensor<2x2xbf16>, %rhs: tensor<2x2xbf16>, %acc: tensor<2x2xf32>) -> tensor<2x2xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<2x2xbf16>, tensor<2x2xbf16>) outs(%acc: tensor<2x2xf32>) -> tensor<2x2xf32>
+  return %result: tensor<2x2xf32>
+}
+
+func.func @matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xf32(%lhs: tensor<4x4xbf16>, %rhs: tensor<4x4xbf16>, %acc: tensor<4x4xf32>) -> tensor<4x4xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<4x4xbf16>, tensor<4x4xbf16>) outs(%acc: tensor<4x4xf32>) -> tensor<4x4xf32>
+  return %result: tensor<4x4xf32>
+}
+
+func.func @matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xf32(%lhs: tensor<8x8xbf16>, %rhs: tensor<8x8xbf16>, %acc: tensor<8x8xf32>) -> tensor<8x8xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<8x8xbf16>, tensor<8x8xbf16>) outs(%acc: tensor<8x8xf32>) -> tensor<8x8xf32>
+  return %result: tensor<8x8xf32>
+}
+
+func.func @matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xf32(%lhs: tensor<9x9xbf16>, %rhs: tensor<9x9xbf16>, %acc: tensor<9x9xf32>) -> tensor<9x9xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<9x9xbf16>, tensor<9x9xbf16>) outs(%acc: tensor<9x9xf32>) -> tensor<9x9xf32>
+  return %result: tensor<9x9xf32>
+}
+
+func.func @matmul_accumulate_6x13xbf16_times_3x13xbf16_into_6x3xf32(%lhs: tensor<6x13xbf16>, %rhs: tensor<3x13xbf16>, %acc: tensor<6x3xf32>) -> tensor<6x3xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<6x13xbf16>, tensor<3x13xbf16>) outs(%acc: tensor<6x3xf32>) -> tensor<6x3xf32>
+  return %result: tensor<6x3xf32>
+}
+
+func.func @matmul_15x37xbf16_times_7x37xbf16_into_15x7xf32(%lhs: tensor<15x37xbf16>, %rhs: tensor<7x37xbf16>) -> tensor<15x7xf32> {
+  %init_acc = tensor.empty() : tensor<15x7xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<15x7xf32>) -> tensor<15x7xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<15x37xbf16>, tensor<7x37xbf16>) outs(%acc: tensor<15x7xf32>) -> tensor<15x7xf32>
+  return %result: tensor<15x7xf32>
+}
+
+func.func @matmul_accumulate_81x19xbf16_times_41x19xbf16_into_81x41xf32(%lhs: tensor<81x19xbf16>, %rhs: tensor<41x19xbf16>, %acc: tensor<81x41xf32>) -> tensor<81x41xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<81x19xbf16>, tensor<41x19xbf16>) outs(%acc: tensor<81x41xf32>) -> tensor<81x41xf32>
+  return %result: tensor<81x41xf32>
+}
+
+func.func @matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xf32(%lhs: tensor<1x10xbf16>, %rhs: tensor<10x10xbf16>, %acc: tensor<1x10xf32>) -> tensor<1x10xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x10xbf16>, tensor<10x10xbf16>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32>
+  return %result: tensor<1x10xf32>
+}
+
+func.func @matmul_1x10xbf16_times_10x10xbf16_into_1x10xf32(%lhs: tensor<1x10xbf16>, %rhs: tensor<10x10xbf16>) -> tensor<1x10xf32> {
+  %init_acc = tensor.empty() : tensor<1x10xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x10xf32>) -> tensor<1x10xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x10xbf16>, tensor<10x10xbf16>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32>
+  return %result: tensor<1x10xf32>
+}
+
+func.func @matmul_accumulate_10x1xbf16_times_10x1xbf16_into_10x10xf32(%lhs: tensor<10x1xbf16>, %rhs: tensor<10x1xbf16>, %acc: tensor<10x10xf32>) -> tensor<10x10xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x1xbf16>, tensor<10x1xbf16>) outs(%acc: tensor<10x10xf32>) -> tensor<10x10xf32>
+  return %result: tensor<10x10xf32>
+}
+
+func.func @matmul_accumulate_10x10xbf16_times_1x10xbf16_into_10x1xf32(%lhs: tensor<10x10xbf16>, %rhs: tensor<1x10xbf16>, %acc: tensor<10x1xf32>) -> tensor<10x1xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x10xbf16>, tensor<1x10xbf16>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32>
+  return %result: tensor<10x1xf32>
+}
+
+func.func @matmul_10x10xbf16_times_1x10xbf16_into_10x1xf32(%lhs: tensor<10x10xbf16>, %rhs: tensor<1x10xbf16>) -> tensor<10x1xf32> {
+  %init_acc = tensor.empty() : tensor<10x1xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<10x1xf32>) -> tensor<10x1xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x10xbf16>, tensor<1x10xbf16>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32>
+  return %result: tensor<10x1xf32>
+}
+
diff --git a/linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_small_calls.mlir b/linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_small_calls.mlir
new file mode 100644
index 0000000..24b3cae
--- /dev/null
+++ b/linalg_ops/matmul/generated/bf16_into_f32/matmul_transpose_b_bf16_into_f32_small_calls.mlir
@@ -0,0 +1,906 @@
+builtin.module @calls attributes {
+
+} {
+
+func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+
+func.func private @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_1x1xbf16_times_1x1xbf16_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_6x13xbf16_times_3x13xbf16_into_6x3xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_15x37xbf16_times_7x37xbf16_into_15x7xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_81x19xbf16_times_41x19xbf16_into_81x41xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_1x10xbf16_times_10x10xbf16_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_10x1xbf16_times_10x1xbf16_into_10x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_10x10xbf16_times_1x10xbf16_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_10x10xbf16_times_1x10xbf16_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1_1_1_acc_0() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 1 : i64
+  %lhs_dim1 = arith.constant 1 : i64
+  %lhs_element_type = hal.element_type<bf16> : i32
+  %lhs_seed = arith.constant 2 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 1 : i64
+  %rhs_element_type = hal.element_type<bf16> : i32
+  %rhs_seed = arith.constant 3 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0,
%rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xf32_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xbf16_times_1x1xbf16_into_1x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1_1_1_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xbf16_times_1x1xbf16_into_1x1xf32_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x1xbf16_times_1x1xbf16_into_1x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = 
arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xf32_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xbf16_times_2x2xbf16_into_2x2xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : 
(!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xf32_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = 
arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_4x4xbf16_times_4x4xbf16_into_4x4xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_8_8_8_acc_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xf32_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, 
%lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xbf16_times_8x8xbf16_into_8x8xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = 
arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xf32_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xbf16_times_9x9xbf16_into_9x9xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call 
@matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xbf16_times_3x13xbf16_into_6x3xf32_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_6x13xbf16_times_3x13xbf16_into_6x3xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + 
%lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xbf16_times_7x37xbf16_into_15x7xf32_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_15x37xbf16_times_7x37xbf16_into_15x7xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 48 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xbf16_times_41x19xbf16_into_81x41xf32_81_19_41_acc_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xbf16_times_41x19xbf16_into_81x41xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1_10_10_acc_18() 
attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xf32_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call 
@matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x10xbf16_times_10x10xbf16_into_1x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_1_10_10_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xbf16_times_10x10xbf16_into_1x10xf32_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xbf16_times_10x10xbf16_into_1x10xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, 
i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x1xbf16_times_10x1xbf16_into_10x10xf32_10_1_10_acc_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 
= arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x1xbf16_times_10x1xbf16_into_10x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_10_10_1_acc_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 70 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 70 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x10xbf16_times_1x10xbf16_into_10x1xf32_10_10_1_acc_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 71 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 72 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 73 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 73 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x10xbf16_times_1x10xbf16_into_10x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_10_10_1_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 74 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 75 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_10x10xbf16_times_1x10xbf16_into_10x1xf32_10_10_1_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = 
hal.element_type<bf16> : i32
+  %lhs_seed = arith.constant 76 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 10 : i64
+  %rhs_element_type = hal.element_type<bf16> : i32
+  %rhs_seed = arith.constant 77 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_10x10xbf16_times_1x10xbf16_into_10x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 10 : i64
+  %k = arith.constant 10 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
diff --git a/linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_large.mlir b/linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_large.mlir
new file mode 100644
index 0000000..649e4fb
--- /dev/null
+++ b/linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_large.mlir
@@ -0,0 +1,136 @@
+func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %acc: tensor<?x?xf16>) -> tensor<?x?xf16> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf16>) -> tensor<?x?xf16>
+  return %result: tensor<?x?xf16>
+}
+
+func.func @matmul_accumulate_512x128xf16_times_128x512xf16_into_512x512xf16(%lhs: tensor<512x128xf16>, %rhs: tensor<128x512xf16>, %acc: tensor<512x512xf16>) -> tensor<512x512xf16> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xf16>, tensor<128x512xf16>) outs(%acc: tensor<512x512xf16>) -> tensor<512x512xf16>
+  return %result: tensor<512x512xf16>
+}
+
+func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>) -> tensor<?x?xf16> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf16>
+  %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf16>
+  %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf16>
+  %c0_acc_type = arith.constant 0.0: f16
+  %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<?x?xf16>) -> tensor<?x?xf16>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf16>) -> tensor<?x?xf16>
+  return %result: tensor<?x?xf16>
+}
+
+func.func @matmul_512x128xf16_times_128x512xf16_into_512x512xf16(%lhs: tensor<512x128xf16>, %rhs: tensor<128x512xf16>) -> tensor<512x512xf16> {
+  %init_acc = tensor.empty() : tensor<512x512xf16>
+  %c0_acc_type = arith.constant 0.0: f16
+  %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<512x512xf16>) -> tensor<512x512xf16>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xf16>, tensor<128x512xf16>) outs(%acc: tensor<512x512xf16>) -> tensor<512x512xf16>
+  return %result: tensor<512x512xf16>
+}
+
+func.func @matmul_1000x4xf16_times_4x512xf16_into_1000x512xf16(%lhs: tensor<1000x4xf16>, %rhs: tensor<4x512xf16>) -> tensor<1000x512xf16> {
+  %init_acc = tensor.empty() : tensor<1000x512xf16>
+  %c0_acc_type = arith.constant 0.0: f16
+  %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<1000x512xf16>) -> tensor<1000x512xf16>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x4xf16>, tensor<4x512xf16>) outs(%acc: 
tensor<1000x512xf16>) -> tensor<1000x512xf16> + return %result: tensor<1000x512xf16> +} + +func.func @matmul_4x1000xf16_times_1000x512xf16_into_4x512xf16(%lhs: tensor<4x1000xf16>, %rhs: tensor<1000x512xf16>) -> tensor<4x512xf16> { + %init_acc = tensor.empty() : tensor<4x512xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<4x512xf16>) -> tensor<4x512xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<4x1000xf16>, tensor<1000x512xf16>) outs(%acc: tensor<4x512xf16>) -> tensor<4x512xf16> + return %result: tensor<4x512xf16> +} + +func.func @matmul_512x1000xf16_times_1000x4xf16_into_512x4xf16(%lhs: tensor<512x1000xf16>, %rhs: tensor<1000x4xf16>) -> tensor<512x4xf16> { + %init_acc = tensor.empty() : tensor<512x4xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<512x4xf16>) -> tensor<512x4xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x1000xf16>, tensor<1000x4xf16>) outs(%acc: tensor<512x4xf16>) -> tensor<512x4xf16> + return %result: tensor<512x4xf16> +} + +func.func @matmul_512x128xf16_times_128x500xf16_into_512x500xf16(%lhs: tensor<512x128xf16>, %rhs: tensor<128x500xf16>) -> tensor<512x500xf16> { + %init_acc = tensor.empty() : tensor<512x500xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<512x500xf16>) -> tensor<512x500xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xf16>, tensor<128x500xf16>) outs(%acc: tensor<512x500xf16>) -> tensor<512x500xf16> + return %result: tensor<512x500xf16> +} + +func.func @matmul_457x330xf16_times_330x512xf16_into_457x512xf16(%lhs: tensor<457x330xf16>, %rhs: tensor<330x512xf16>) -> tensor<457x512xf16> { + %init_acc = tensor.empty() : tensor<457x512xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<457x512xf16>) -> tensor<457x512xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x330xf16>, tensor<330x512xf16>) outs(%acc: tensor<457x512xf16>) -> tensor<457x512xf16> + return %result: tensor<457x512xf16> +} + +func.func @matmul_457x330xf16_times_330x514xf16_into_457x514xf16(%lhs: tensor<457x330xf16>, %rhs: tensor<330x514xf16>) -> tensor<457x514xf16> { + %init_acc = tensor.empty() : tensor<457x514xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<457x514xf16>) -> tensor<457x514xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x330xf16>, tensor<330x514xf16>) outs(%acc: tensor<457x514xf16>) -> tensor<457x514xf16> + return %result: tensor<457x514xf16> +} + +func.func @matmul_438x330xf16_times_330x514xf16_into_438x514xf16(%lhs: tensor<438x330xf16>, %rhs: tensor<330x514xf16>) -> tensor<438x514xf16> { + %init_acc = tensor.empty() : tensor<438x514xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<438x514xf16>) -> tensor<438x514xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<438x330xf16>, tensor<330x514xf16>) outs(%acc: tensor<438x514xf16>) -> tensor<438x514xf16> + return %result: tensor<438x514xf16> +} + +func.func @matmul_540x332xf16_times_332x516xf16_into_540x516xf16(%lhs: tensor<540x332xf16>, %rhs: tensor<332x516xf16>) -> tensor<540x516xf16> { + %init_acc = tensor.empty() : tensor<540x516xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<540x516xf16>) -> tensor<540x516xf16> + 
%result = linalg.matmul ins(%lhs, %rhs: tensor<540x332xf16>, tensor<332x516xf16>) outs(%acc: tensor<540x516xf16>) -> tensor<540x516xf16> + return %result: tensor<540x516xf16> +} + +func.func @matmul_654x321xf16_times_321x234xf16_into_654x234xf16(%lhs: tensor<654x321xf16>, %rhs: tensor<321x234xf16>) -> tensor<654x234xf16> { + %init_acc = tensor.empty() : tensor<654x234xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<654x234xf16>) -> tensor<654x234xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<654x321xf16>, tensor<321x234xf16>) outs(%acc: tensor<654x234xf16>) -> tensor<654x234xf16> + return %result: tensor<654x234xf16> +} + +func.func @matmul_457x160xf16_times_160x512xf16_into_457x512xf16(%lhs: tensor<457x160xf16>, %rhs: tensor<160x512xf16>) -> tensor<457x512xf16> { + %init_acc = tensor.empty() : tensor<457x512xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<457x512xf16>) -> tensor<457x512xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x160xf16>, tensor<160x512xf16>) outs(%acc: tensor<457x512xf16>) -> tensor<457x512xf16> + return %result: tensor<457x512xf16> +} + +func.func @matmul_512x330xf16_times_330x512xf16_into_512x512xf16(%lhs: tensor<512x330xf16>, %rhs: tensor<330x512xf16>) -> tensor<512x512xf16> { + %init_acc = tensor.empty() : tensor<512x512xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<512x512xf16>) -> tensor<512x512xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x330xf16>, tensor<330x512xf16>) outs(%acc: tensor<512x512xf16>) -> tensor<512x512xf16> + return %result: tensor<512x512xf16> +} + +func.func @matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf16(%lhs: tensor<1x1000xf16>, %rhs: tensor<1000x1000xf16>, %acc: tensor<1x1000xf16>) -> tensor<1x1000xf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1000xf16>, tensor<1000x1000xf16>) outs(%acc: tensor<1x1000xf16>) -> tensor<1x1000xf16> + return %result: tensor<1x1000xf16> +} + +func.func @matmul_accumulate_1000x1000xf16_times_1000x1xf16_into_1000x1xf16(%lhs: tensor<1000x1000xf16>, %rhs: tensor<1000x1xf16>, %acc: tensor<1000x1xf16>) -> tensor<1000x1xf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x1000xf16>, tensor<1000x1xf16>) outs(%acc: tensor<1000x1xf16>) -> tensor<1000x1xf16> + return %result: tensor<1000x1xf16> +} + +func.func @matmul_1000x1000xf16_times_1000x1xf16_into_1000x1xf16(%lhs: tensor<1000x1000xf16>, %rhs: tensor<1000x1xf16>) -> tensor<1000x1xf16> { + %init_acc = tensor.empty() : tensor<1000x1xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<1000x1xf16>) -> tensor<1000x1xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x1000xf16>, tensor<1000x1xf16>) outs(%acc: tensor<1000x1xf16>) -> tensor<1000x1xf16> + return %result: tensor<1000x1xf16> +} + diff --git a/linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_large_calls.mlir b/linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_large_calls.mlir new file mode 100644 index 0000000..998c031 --- /dev/null +++ b/linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_large_calls.mlir @@ -0,0 +1,882 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private 
@matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_512x128xf16_times_128x512xf16_into_512x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf16_times_128x512xf16_into_512x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x4xf16_times_4x512xf16_into_1000x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_4x1000xf16_times_1000x512xf16_into_4x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x1000xf16_times_1000x4xf16_into_512x4xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf16_times_128x500xf16_into_512x500xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf16_times_330x512xf16_into_457x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf16_times_330x514xf16_into_457x514xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_438x330xf16_times_330x514xf16_into_438x514xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_540x332xf16_times_332x516xf16_into_540x516xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_654x321xf16_times_321x234xf16_into_654x234xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x160xf16_times_160x512xf16_into_457x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x330xf16_times_330x512xf16_into_512x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1000x1000xf16_times_1000x1xf16_into_1000x1xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x1000xf16_times_1000x1xf16_into_1000x1xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_512_128_512_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, 
%lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xf16_times_128x512xf16_into_512x512xf16_512_128_512_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xf16_times_128x512xf16_into_512x512xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = 
arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xf16_times_128x512xf16_into_512x512xf16_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf16_times_128x512xf16_into_512x512xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get 
%device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xf16_times_4x512xf16_into_1000x512xf16_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xf16_times_4x512xf16_into_1000x512xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, 
%rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xf16_times_1000x512xf16_into_4x512xf16_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xf16_times_1000x512xf16_into_4x512xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 21 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x1000xf16_times_1000x4xf16_into_512x4xf16_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x1000xf16_times_1000x4xf16_into_512x4xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xf16_times_128x500xf16_into_512x500xf16_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, 
%lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf16_times_128x500xf16_into_512x500xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf16_times_330x512xf16_into_457x512xf16_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf16_times_330x512xf16_into_457x512xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> 
!hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf16_times_330x514xf16_into_457x514xf16_457_330_514_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf16_times_330x514xf16_into_457x514xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = 
arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xf16_times_330x514xf16_into_438x514xf16_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xf16_times_330x514xf16_into_438x514xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = 
call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xf16_times_332x516xf16_into_540x516xf16_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xf16_times_332x516xf16_into_540x516xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, 
%result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xf16_times_321x234xf16_into_654x234xf16_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xf16_times_321x234xf16_into_654x234xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xf16_times_160x512xf16_into_457x512xf16_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 50 : i32 + %lhs = 
call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xf16_times_160x512xf16_into_457x512xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x330xf16_times_330x512xf16_into_512x512xf16_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call 
@module.matmul_512x330xf16_times_330x512xf16_into_512x512xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf16_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, 
%rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + 
return +} + +func.func @matmul_accumulate_1000x1000xf16_times_1000x1xf16_into_1000x1xf16_1000_1000_1_acc_29() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xf16_times_1000x1xf16_into_1000x1xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1000_1000_1_30() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, 
%rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x1000xf16_times_1000x1xf16_into_1000x1xf16_1000_1000_1_31() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type<f16> : i32 + %lhs_seed = arith.constant 70 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type<f16> : i32 + %rhs_seed = arith.constant 71 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x1000xf16_times_1000x1xf16_into_1000x1xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_small.mlir b/linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_small.mlir new file mode 100644 index 0000000..3cfe7cd --- /dev/null +++ b/linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_small.mlir @@ -0,0 +1,99 @@ +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %acc: tensor<?x?xf16>) -> tensor<?x?xf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf16>) -> tensor<?x?xf16> + return %result: tensor<?x?xf16> +} + +func.func @matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf16(%lhs: tensor<1x1xf16>, %rhs: tensor<1x1xf16>, %acc: tensor<1x1xf16>) -> tensor<1x1xf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1xf16>, tensor<1x1xf16>) outs(%acc: tensor<1x1xf16>) -> tensor<1x1xf16> + return %result: tensor<1x1xf16> +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>) -> tensor<?x?xf16> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf16> + %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf16> + %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<?x?xf16>) -> tensor<?x?xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf16>) -> tensor<?x?xf16> + return %result: tensor<?x?xf16> +} + +func.func @matmul_1x1xf16_times_1x1xf16_into_1x1xf16(%lhs: tensor<1x1xf16>, %rhs: tensor<1x1xf16>) -> tensor<1x1xf16> { + %init_acc = tensor.empty() : tensor<1x1xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<1x1xf16>) -> tensor<1x1xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1xf16>, tensor<1x1xf16>) outs(%acc: tensor<1x1xf16>) -> tensor<1x1xf16> + return %result: tensor<1x1xf16> +} + +func.func 
@matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf16(%lhs: tensor<2x2xf16>, %rhs: tensor<2x2xf16>, %acc: tensor<2x2xf16>) -> tensor<2x2xf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<2x2xf16>, tensor<2x2xf16>) outs(%acc: tensor<2x2xf16>) -> tensor<2x2xf16> + return %result: tensor<2x2xf16> +} + +func.func @matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf16(%lhs: tensor<4x4xf16>, %rhs: tensor<4x4xf16>, %acc: tensor<4x4xf16>) -> tensor<4x4xf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<4x4xf16>, tensor<4x4xf16>) outs(%acc: tensor<4x4xf16>) -> tensor<4x4xf16> + return %result: tensor<4x4xf16> +} + +func.func @matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf16(%lhs: tensor<8x8xf16>, %rhs: tensor<8x8xf16>, %acc: tensor<8x8xf16>) -> tensor<8x8xf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<8x8xf16>, tensor<8x8xf16>) outs(%acc: tensor<8x8xf16>) -> tensor<8x8xf16> + return %result: tensor<8x8xf16> +} + +func.func @matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf16(%lhs: tensor<9x9xf16>, %rhs: tensor<9x9xf16>, %acc: tensor<9x9xf16>) -> tensor<9x9xf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<9x9xf16>, tensor<9x9xf16>) outs(%acc: tensor<9x9xf16>) -> tensor<9x9xf16> + return %result: tensor<9x9xf16> +} + +func.func @matmul_accumulate_6x13xf16_times_13x3xf16_into_6x3xf16(%lhs: tensor<6x13xf16>, %rhs: tensor<13x3xf16>, %acc: tensor<6x3xf16>) -> tensor<6x3xf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<6x13xf16>, tensor<13x3xf16>) outs(%acc: tensor<6x3xf16>) -> tensor<6x3xf16> + return %result: tensor<6x3xf16> +} + +func.func @matmul_15x37xf16_times_37x7xf16_into_15x7xf16(%lhs: tensor<15x37xf16>, %rhs: tensor<37x7xf16>) -> tensor<15x7xf16> { + %init_acc = tensor.empty() : tensor<15x7xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<15x7xf16>) -> tensor<15x7xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<15x37xf16>, tensor<37x7xf16>) outs(%acc: tensor<15x7xf16>) -> tensor<15x7xf16> + return %result: tensor<15x7xf16> +} + +func.func @matmul_accumulate_81x19xf16_times_19x41xf16_into_81x41xf16(%lhs: tensor<81x19xf16>, %rhs: tensor<19x41xf16>, %acc: tensor<81x41xf16>) -> tensor<81x41xf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<81x19xf16>, tensor<19x41xf16>) outs(%acc: tensor<81x41xf16>) -> tensor<81x41xf16> + return %result: tensor<81x41xf16> +} + +func.func @matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf16(%lhs: tensor<1x10xf16>, %rhs: tensor<10x10xf16>, %acc: tensor<1x10xf16>) -> tensor<1x10xf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x10xf16>, tensor<10x10xf16>) outs(%acc: tensor<1x10xf16>) -> tensor<1x10xf16> + return %result: tensor<1x10xf16> +} + +func.func @matmul_1x10xf16_times_10x10xf16_into_1x10xf16(%lhs: tensor<1x10xf16>, %rhs: tensor<10x10xf16>) -> tensor<1x10xf16> { + %init_acc = tensor.empty() : tensor<1x10xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<1x10xf16>) -> tensor<1x10xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x10xf16>, tensor<10x10xf16>) outs(%acc: tensor<1x10xf16>) -> tensor<1x10xf16> + return %result: tensor<1x10xf16> +} + +func.func @matmul_accumulate_10x1xf16_times_1x10xf16_into_10x10xf16(%lhs: tensor<10x1xf16>, %rhs: tensor<1x10xf16>, %acc: tensor<10x10xf16>) -> tensor<10x10xf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x1xf16>, tensor<1x10xf16>) outs(%acc: tensor<10x10xf16>) -> tensor<10x10xf16> + return %result: tensor<10x10xf16> 
+} + +func.func @matmul_accumulate_10x10xf16_times_10x1xf16_into_10x1xf16(%lhs: tensor<10x10xf16>, %rhs: tensor<10x1xf16>, %acc: tensor<10x1xf16>) -> tensor<10x1xf16> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x10xf16>, tensor<10x1xf16>) outs(%acc: tensor<10x1xf16>) -> tensor<10x1xf16> + return %result: tensor<10x1xf16> +} + +func.func @matmul_10x10xf16_times_10x1xf16_into_10x1xf16(%lhs: tensor<10x10xf16>, %rhs: tensor<10x1xf16>) -> tensor<10x1xf16> { + %init_acc = tensor.empty() : tensor<10x1xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<10x1xf16>) -> tensor<10x1xf16> + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x10xf16>, tensor<10x1xf16>) outs(%acc: tensor<10x1xf16>) -> tensor<10x1xf16> + return %result: tensor<10x1xf16> +} + diff --git a/linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_small_calls.mlir b/linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_small_calls.mlir new file mode 100644 index 0000000..fd681dc --- /dev/null +++ b/linalg_ops/matmul/generated/f16_into_f16/matmul_f16_into_f16_small_calls.mlir @@ -0,0 +1,906 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x1xf16_times_1x1xf16_into_1x1xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_6x13xf16_times_13x3xf16_into_6x3xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_15x37xf16_times_37x7xf16_into_15x7xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_81x19xf16_times_19x41xf16_into_81x41xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: 
!hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x10xf16_times_10x10xf16_into_1x10xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x1xf16_times_1x10xf16_into_10x10xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x10xf16_times_10x1xf16_into_10x1xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_10x10xf16_times_10x1xf16_into_10x1xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1_1_1_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf16_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + 
%rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1_1_1_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xf16_times_1x1xf16_into_1x1xf16_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + 
%rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x1xf16_times_1x1xf16_into_1x1xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf16_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, 
%n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf16_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_8_8_8_acc_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf16_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf16_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = 
arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xf16_times_13x3xf16_into_6x3xf16_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = 
arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_6x13xf16_times_13x3xf16_into_6x3xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xf16_times_37x7xf16_into_15x7xf16_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call 
@module.matmul_15x37xf16_times_37x7xf16_into_15x7xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 48 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xf16_times_19x41xf16_into_81x41xf16_81_19_41_acc_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : 
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xf16_times_19x41xf16_into_81x41xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1_10_10_acc_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf16_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1_10_10_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xf16_times_10x10xf16_into_1x10xf16_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xf16_times_10x10xf16_into_1x10xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, 
%transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x1xf16_times_1x10xf16_into_10x10xf16_10_1_10_acc_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x1xf16_times_1x10xf16_into_10x10xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_10_10_1_acc_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 70 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, 
i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 70 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x10xf16_times_10x1xf16_into_10x1xf16_10_10_1_acc_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 71 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 72 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 73 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 73 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x10xf16_times_10x1xf16_into_10x1xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_10_10_1_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 74 : 
i32
+ %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+ %rhs_dim0 = arith.constant 10 : i64
+ %rhs_dim1 = arith.constant 1 : i64
+ %rhs_element_type = hal.element_type<f16> : i32
+ %rhs_seed = arith.constant 75 : i32
+ %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+ %acc = util.null : !hal.buffer_view
+ %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+ %m = arith.constant 10 : i64
+ %k = arith.constant 10 : i64
+ %n = arith.constant 1 : i64
+ %transpose_rhs = arith.constant 0 : i32
+ call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+ return
+}
+
+func.func @matmul_10x10xf16_times_10x1xf16_into_10x1xf16_10_10_1_27() attributes {
+ iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"}
+} {
+ %device_index = arith.constant 0 : index
+ %device = hal.devices.get %device_index : !hal.device
+ %lhs_dim0 = arith.constant 10 : i64
+ %lhs_dim1 = arith.constant 10 : i64
+ %lhs_element_type = hal.element_type<f16> : i32
+ %lhs_seed = arith.constant 76 : i32
+ %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+ %rhs_dim0 = arith.constant 10 : i64
+ %rhs_dim1 = arith.constant 1 : i64
+ %rhs_element_type = hal.element_type<f16> : i32
+ %rhs_seed = arith.constant 77 : i32
+ %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+ %acc = util.null : !hal.buffer_view
+ %result = call @module.matmul_10x10xf16_times_10x1xf16_into_10x1xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+ %m = arith.constant 10 : i64
+ %k = arith.constant 10 : i64
+ %n = arith.constant 1 : i64
+ %transpose_rhs = arith.constant 0 : i32
+ call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+ return
+}
+
+
+}
diff --git a/linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_large.mlir b/linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_large.mlir
new file mode 100644
index 0000000..6663571
--- /dev/null
+++ b/linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_large.mlir
@@ -0,0 +1,136 @@
+func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %acc: tensor<?x?xf16>) -> tensor<?x?xf16> {
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf16>) -> tensor<?x?xf16>
+ return %result: tensor<?x?xf16>
+}
+
+func.func @matmul_accumulate_512x128xf16_times_512x128xf16_into_512x512xf16(%lhs: tensor<512x128xf16>, %rhs: tensor<512x128xf16>, %acc: tensor<512x512xf16>) -> tensor<512x512xf16> {
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xf16>, tensor<512x128xf16>) outs(%acc: tensor<512x512xf16>) -> tensor<512x512xf16>
+ return %result: tensor<512x512xf16>
+}
+
+func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>) -> tensor<?x?xf16> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf16>
+ %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf16>
+ %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf16>
+ %c0_acc_type = arith.constant 0.0: f16
+ %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<?x?xf16>) -> tensor<?x?xf16>
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf16>) -> tensor<?x?xf16>
+ return %result: tensor<?x?xf16>
+}
+
+func.func @matmul_512x128xf16_times_512x128xf16_into_512x512xf16(%lhs: tensor<512x128xf16>, %rhs: tensor<512x128xf16>) -> tensor<512x512xf16> {
+ %init_acc = tensor.empty() : tensor<512x512xf16>
+ %c0_acc_type = arith.constant 0.0: f16
+ %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<512x512xf16>) -> tensor<512x512xf16>
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xf16>, tensor<512x128xf16>) outs(%acc: tensor<512x512xf16>) -> tensor<512x512xf16>
+ return %result: tensor<512x512xf16>
+}
+
+func.func @matmul_1000x4xf16_times_512x4xf16_into_1000x512xf16(%lhs: tensor<1000x4xf16>, %rhs: tensor<512x4xf16>) -> tensor<1000x512xf16> {
+ %init_acc = tensor.empty() : tensor<1000x512xf16>
+ %c0_acc_type = arith.constant 0.0: f16
+ %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<1000x512xf16>) -> tensor<1000x512xf16>
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x4xf16>, tensor<512x4xf16>) outs(%acc: tensor<1000x512xf16>) -> tensor<1000x512xf16>
+ return %result: tensor<1000x512xf16>
+}
+
+func.func @matmul_4x1000xf16_times_512x1000xf16_into_4x512xf16(%lhs: tensor<4x1000xf16>, %rhs: tensor<512x1000xf16>) -> tensor<4x512xf16> {
+ %init_acc = tensor.empty() : tensor<4x512xf16>
+ %c0_acc_type = arith.constant 0.0: f16
+ %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<4x512xf16>) -> tensor<4x512xf16>
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<4x1000xf16>, tensor<512x1000xf16>) outs(%acc: tensor<4x512xf16>) -> tensor<4x512xf16>
+ return %result: tensor<4x512xf16>
+}
+
+func.func @matmul_512x1000xf16_times_4x1000xf16_into_512x4xf16(%lhs: tensor<512x1000xf16>, %rhs: tensor<4x1000xf16>) -> tensor<512x4xf16> {
+ %init_acc = tensor.empty() : tensor<512x4xf16>
+ %c0_acc_type = arith.constant 0.0: f16
+ %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<512x4xf16>) -> tensor<512x4xf16>
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x1000xf16>, tensor<4x1000xf16>) outs(%acc: tensor<512x4xf16>) -> tensor<512x4xf16>
+ return %result: tensor<512x4xf16>
+}
+
+func.func @matmul_512x128xf16_times_500x128xf16_into_512x500xf16(%lhs: tensor<512x128xf16>, %rhs: tensor<500x128xf16>) -> tensor<512x500xf16> {
+ %init_acc = tensor.empty() : tensor<512x500xf16>
+ %c0_acc_type = arith.constant 0.0: f16
+ %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<512x500xf16>) -> tensor<512x500xf16>
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xf16>, tensor<500x128xf16>) outs(%acc: tensor<512x500xf16>) -> tensor<512x500xf16>
+ return %result: tensor<512x500xf16>
+}
+
+func.func @matmul_457x330xf16_times_512x330xf16_into_457x512xf16(%lhs: tensor<457x330xf16>, %rhs: tensor<512x330xf16>) -> tensor<457x512xf16> {
+ %init_acc = tensor.empty() : tensor<457x512xf16>
+ %c0_acc_type = arith.constant 0.0: f16
+ %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<457x512xf16>) -> tensor<457x512xf16>
+
%result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x330xf16>, tensor<512x330xf16>) outs(%acc: tensor<457x512xf16>) -> tensor<457x512xf16> + return %result: tensor<457x512xf16> +} + +func.func @matmul_457x330xf16_times_514x330xf16_into_457x514xf16(%lhs: tensor<457x330xf16>, %rhs: tensor<514x330xf16>) -> tensor<457x514xf16> { + %init_acc = tensor.empty() : tensor<457x514xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<457x514xf16>) -> tensor<457x514xf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x330xf16>, tensor<514x330xf16>) outs(%acc: tensor<457x514xf16>) -> tensor<457x514xf16> + return %result: tensor<457x514xf16> +} + +func.func @matmul_438x330xf16_times_514x330xf16_into_438x514xf16(%lhs: tensor<438x330xf16>, %rhs: tensor<514x330xf16>) -> tensor<438x514xf16> { + %init_acc = tensor.empty() : tensor<438x514xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<438x514xf16>) -> tensor<438x514xf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<438x330xf16>, tensor<514x330xf16>) outs(%acc: tensor<438x514xf16>) -> tensor<438x514xf16> + return %result: tensor<438x514xf16> +} + +func.func @matmul_540x332xf16_times_516x332xf16_into_540x516xf16(%lhs: tensor<540x332xf16>, %rhs: tensor<516x332xf16>) -> tensor<540x516xf16> { + %init_acc = tensor.empty() : tensor<540x516xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<540x516xf16>) -> tensor<540x516xf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<540x332xf16>, tensor<516x332xf16>) outs(%acc: tensor<540x516xf16>) -> tensor<540x516xf16> + return %result: tensor<540x516xf16> +} + +func.func @matmul_654x321xf16_times_234x321xf16_into_654x234xf16(%lhs: tensor<654x321xf16>, %rhs: tensor<234x321xf16>) -> tensor<654x234xf16> { + %init_acc = tensor.empty() : tensor<654x234xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<654x234xf16>) -> tensor<654x234xf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<654x321xf16>, tensor<234x321xf16>) outs(%acc: tensor<654x234xf16>) -> tensor<654x234xf16> + return %result: tensor<654x234xf16> +} + +func.func @matmul_457x160xf16_times_512x160xf16_into_457x512xf16(%lhs: tensor<457x160xf16>, %rhs: tensor<512x160xf16>) -> tensor<457x512xf16> { + %init_acc = tensor.empty() : tensor<457x512xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<457x512xf16>) -> tensor<457x512xf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x160xf16>, tensor<512x160xf16>) outs(%acc: tensor<457x512xf16>) -> tensor<457x512xf16> + return %result: tensor<457x512xf16> +} + +func.func @matmul_512x330xf16_times_512x330xf16_into_512x512xf16(%lhs: tensor<512x330xf16>, %rhs: tensor<512x330xf16>) -> tensor<512x512xf16> { + %init_acc = tensor.empty() : tensor<512x512xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<512x512xf16>) -> tensor<512x512xf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x330xf16>, tensor<512x330xf16>) outs(%acc: tensor<512x512xf16>) -> tensor<512x512xf16> + return %result: tensor<512x512xf16> +} + +func.func @matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf16(%lhs: tensor<1x1000xf16>, %rhs: tensor<1000x1000xf16>, %acc: 
tensor<1x1000xf16>) -> tensor<1x1000xf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1000xf16>, tensor<1000x1000xf16>) outs(%acc: tensor<1x1000xf16>) -> tensor<1x1000xf16> + return %result: tensor<1x1000xf16> +} + +func.func @matmul_accumulate_1000x1000xf16_times_1x1000xf16_into_1000x1xf16(%lhs: tensor<1000x1000xf16>, %rhs: tensor<1x1000xf16>, %acc: tensor<1000x1xf16>) -> tensor<1000x1xf16> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x1000xf16>, tensor<1x1000xf16>) outs(%acc: tensor<1000x1xf16>) -> tensor<1000x1xf16> + return %result: tensor<1000x1xf16> +} + +func.func @matmul_1000x1000xf16_times_1x1000xf16_into_1000x1xf16(%lhs: tensor<1000x1000xf16>, %rhs: tensor<1x1000xf16>) -> tensor<1000x1xf16> { + %init_acc = tensor.empty() : tensor<1000x1xf16> + %c0_acc_type = arith.constant 0.0: f16 + %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<1000x1xf16>) -> tensor<1000x1xf16> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x1000xf16>, tensor<1x1000xf16>) outs(%acc: tensor<1000x1xf16>) -> tensor<1000x1xf16> + return %result: tensor<1000x1xf16> +} + diff --git a/linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_large_calls.mlir b/linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_large_calls.mlir new file mode 100644 index 0000000..fad0e2b --- /dev/null +++ b/linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_large_calls.mlir @@ -0,0 +1,882 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_512x128xf16_times_512x128xf16_into_512x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf16_times_512x128xf16_into_512x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x4xf16_times_512x4xf16_into_1000x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_4x1000xf16_times_512x1000xf16_into_4x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x1000xf16_times_4x1000xf16_into_512x4xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf16_times_500x128xf16_into_512x500xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf16_times_512x330xf16_into_457x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf16_times_514x330xf16_into_457x514xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private 
@module.matmul_438x330xf16_times_514x330xf16_into_438x514xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_540x332xf16_times_516x332xf16_into_540x516xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_654x321xf16_times_234x321xf16_into_654x234xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x160xf16_times_512x160xf16_into_457x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x330xf16_times_512x330xf16_into_512x512xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1000x1000xf16_times_1x1000xf16_into_1000x1xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x1000xf16_times_1x1000xf16_into_1000x1xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_512_128_512_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xf16_times_512x128xf16_into_512x512xf16_512_128_512_acc_1() attributes { 
+ iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xf16_times_512x128xf16_into_512x512xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) 
-> () + return +} + +func.func @matmul_512x128xf16_times_512x128xf16_into_512x512xf16_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf16_times_512x128xf16_into_512x512xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xf16_times_512x4xf16_into_1000x512xf16_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, 
i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xf16_times_512x4xf16_into_1000x512xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xf16_times_512x1000xf16_into_4x512xf16_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xf16_times_512x1000xf16_into_4x512xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 
1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 21 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x1000xf16_times_4x1000xf16_into_512x4xf16_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x1000xf16_times_4x1000xf16_into_512x4xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + 
%lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xf16_times_500x128xf16_into_512x500xf16_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf16_times_500x128xf16_into_512x500xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, 
%rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf16_times_512x330xf16_into_457x512xf16_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf16_times_512x330xf16_into_457x512xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf16_times_514x330xf16_into_457x514xf16_457_330_514_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf16_times_514x330xf16_into_457x514xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xf16_times_514x330xf16_into_438x514xf16_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, 
%lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xf16_times_514x330xf16_into_438x514xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xf16_times_516x332xf16_into_540x516xf16_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xf16_times_516x332xf16_into_540x516xf16(%lhs, %rhs) : (!hal.buffer_view, 
!hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xf16_times_234x321xf16_into_654x234xf16_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xf16_times_234x321xf16_into_654x234xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} 
{ + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xf16_times_512x160xf16_into_457x512xf16_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 50 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xf16_times_512x160xf16_into_457x512xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 
53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x330xf16_times_512x330xf16_into_512x512xf16_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x330xf16_times_512x330xf16_into_512x512xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = 
arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf16_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1000x1000xf16_times_1x1000xf16_into_1000x1xf16_1000_1000_1_acc_29() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xf16_times_1x1000xf16_into_1000x1xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) 
-> !hal.buffer_view
+  %m = arith.constant 1000 : i64
+  %k = arith.constant 1000 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1000_1000_1_30() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 1000 : i64
+  %lhs_dim1 = arith.constant 1000 : i64
+  %lhs_element_type = hal.element_type<f16> : i32
+  %lhs_seed = arith.constant 68 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 1000 : i64
+  %rhs_element_type = hal.element_type<f16> : i32
+  %rhs_seed = arith.constant 69 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 1000 : i64
+  %k = arith.constant 1000 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @matmul_1000x1000xf16_times_1x1000xf16_into_1000x1xf16_1000_1000_1_31() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 1000 : i64
+  %lhs_dim1 = arith.constant 1000 : i64
+  %lhs_element_type = hal.element_type<f16> : i32
+  %lhs_seed = arith.constant 70 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 1000 : i64
+  %rhs_element_type = hal.element_type<f16> : i32
+  %rhs_seed = arith.constant 71 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_1000x1000xf16_times_1x1000xf16_into_1000x1xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 1000 : i64
+  %k = arith.constant 1000 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
diff --git a/linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_small.mlir b/linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_small.mlir
new file mode 100644
index 0000000..fb93898
--- /dev/null
+++ b/linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_small.mlir
@@ -0,0 +1,99 @@
+func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %acc: tensor<?x?xf16>) -> tensor<?x?xf16> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf16>) -> tensor<?x?xf16>
+  return %result: tensor<?x?xf16>
+}
+
+func.func @matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf16(%lhs: tensor<1x1xf16>, %rhs: tensor<1x1xf16>, %acc: tensor<1x1xf16>) -> tensor<1x1xf16> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1xf16>, tensor<1x1xf16>) outs(%acc: tensor<1x1xf16>) -> tensor<1x1xf16>
+  return %result: tensor<1x1xf16>
+}
+
+func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>) -> tensor<?x?xf16> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf16>
+  %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf16>
+  %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf16>
+  %c0_acc_type = arith.constant 0.0: f16
+  %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<?x?xf16>) -> tensor<?x?xf16>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf16>) -> tensor<?x?xf16>
+  return %result: tensor<?x?xf16>
+}
+
+func.func @matmul_1x1xf16_times_1x1xf16_into_1x1xf16(%lhs: tensor<1x1xf16>, %rhs: tensor<1x1xf16>) -> tensor<1x1xf16> {
+  %init_acc = tensor.empty() : tensor<1x1xf16>
+  %c0_acc_type = arith.constant 0.0: f16
+  %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<1x1xf16>) -> tensor<1x1xf16>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1xf16>, tensor<1x1xf16>) outs(%acc: tensor<1x1xf16>) -> tensor<1x1xf16>
+  return %result: tensor<1x1xf16>
+}
+
+func.func @matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf16(%lhs: tensor<2x2xf16>, %rhs: tensor<2x2xf16>, %acc: tensor<2x2xf16>) -> tensor<2x2xf16> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<2x2xf16>, tensor<2x2xf16>) outs(%acc: tensor<2x2xf16>) -> tensor<2x2xf16>
+  return %result: tensor<2x2xf16>
+}
+
+func.func @matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf16(%lhs: tensor<4x4xf16>, %rhs: tensor<4x4xf16>, %acc: tensor<4x4xf16>) -> tensor<4x4xf16> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<4x4xf16>, tensor<4x4xf16>) outs(%acc: tensor<4x4xf16>) -> tensor<4x4xf16>
+  return %result: tensor<4x4xf16>
+}
+
+func.func @matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf16(%lhs: tensor<8x8xf16>, %rhs: tensor<8x8xf16>, %acc: tensor<8x8xf16>) -> tensor<8x8xf16> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<8x8xf16>, tensor<8x8xf16>) outs(%acc: tensor<8x8xf16>) -> tensor<8x8xf16>
+  return %result: tensor<8x8xf16>
+}
+
+func.func @matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf16(%lhs: tensor<9x9xf16>, %rhs: tensor<9x9xf16>, %acc: tensor<9x9xf16>) -> tensor<9x9xf16> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<9x9xf16>, tensor<9x9xf16>) outs(%acc: tensor<9x9xf16>) -> tensor<9x9xf16>
+  return %result: tensor<9x9xf16>
+}
+
+func.func @matmul_accumulate_6x13xf16_times_3x13xf16_into_6x3xf16(%lhs: tensor<6x13xf16>, %rhs: tensor<3x13xf16>, %acc: tensor<6x3xf16>) -> tensor<6x3xf16> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<6x13xf16>, tensor<3x13xf16>) outs(%acc: tensor<6x3xf16>) -> tensor<6x3xf16>
+  return %result: tensor<6x3xf16>
+}
+
+func.func @matmul_15x37xf16_times_7x37xf16_into_15x7xf16(%lhs: tensor<15x37xf16>, %rhs: tensor<7x37xf16>) -> tensor<15x7xf16> {
+  %init_acc = tensor.empty() : tensor<15x7xf16>
+  %c0_acc_type = arith.constant 0.0: f16
+  %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<15x7xf16>) -> tensor<15x7xf16>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<15x37xf16>, tensor<7x37xf16>) outs(%acc: tensor<15x7xf16>) -> tensor<15x7xf16>
+  return %result: tensor<15x7xf16>
+}
+
+func.func @matmul_accumulate_81x19xf16_times_41x19xf16_into_81x41xf16(%lhs: tensor<81x19xf16>, %rhs: tensor<41x19xf16>, %acc: tensor<81x41xf16>) -> tensor<81x41xf16> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<81x19xf16>, tensor<41x19xf16>) outs(%acc: tensor<81x41xf16>) -> tensor<81x41xf16>
+  return %result: tensor<81x41xf16>
+}
+
+func.func @matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf16(%lhs: tensor<1x10xf16>, %rhs: tensor<10x10xf16>, %acc: tensor<1x10xf16>) -> tensor<1x10xf16> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x10xf16>, tensor<10x10xf16>) outs(%acc: tensor<1x10xf16>) -> tensor<1x10xf16>
+  return %result: tensor<1x10xf16>
+}
+
+func.func @matmul_1x10xf16_times_10x10xf16_into_1x10xf16(%lhs: tensor<1x10xf16>, %rhs: tensor<10x10xf16>) -> tensor<1x10xf16> {
+  %init_acc = tensor.empty() : tensor<1x10xf16>
+  %c0_acc_type = arith.constant 0.0: f16
+  %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<1x10xf16>) -> tensor<1x10xf16>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x10xf16>, tensor<10x10xf16>) outs(%acc: tensor<1x10xf16>) -> tensor<1x10xf16>
+  return %result: tensor<1x10xf16>
+}
+
+func.func @matmul_accumulate_10x1xf16_times_10x1xf16_into_10x10xf16(%lhs: tensor<10x1xf16>, %rhs: tensor<10x1xf16>, %acc: tensor<10x10xf16>) -> tensor<10x10xf16> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x1xf16>, tensor<10x1xf16>) outs(%acc: tensor<10x10xf16>) -> tensor<10x10xf16>
+  return %result: tensor<10x10xf16>
+}
+
+func.func @matmul_accumulate_10x10xf16_times_1x10xf16_into_10x1xf16(%lhs: tensor<10x10xf16>, %rhs: tensor<1x10xf16>, %acc: tensor<10x1xf16>) -> tensor<10x1xf16> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x10xf16>, tensor<1x10xf16>) outs(%acc: tensor<10x1xf16>) -> tensor<10x1xf16>
+  return %result: tensor<10x1xf16>
+}
+
+func.func @matmul_10x10xf16_times_1x10xf16_into_10x1xf16(%lhs: tensor<10x10xf16>, %rhs: tensor<1x10xf16>) -> tensor<10x1xf16> {
+  %init_acc = tensor.empty() : tensor<10x1xf16>
+  %c0_acc_type = arith.constant 0.0: f16
+  %acc = linalg.fill ins(%c0_acc_type : f16) outs(%init_acc : tensor<10x1xf16>) -> tensor<10x1xf16>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x10xf16>, tensor<1x10xf16>) outs(%acc: tensor<10x1xf16>) -> tensor<10x1xf16>
+  return %result: tensor<10x1xf16>
+}
+
diff --git a/linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_small_calls.mlir b/linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_small_calls.mlir
new file mode 100644
index 0000000..90bdf06
--- /dev/null
+++ b/linalg_ops/matmul/generated/f16_into_f16/matmul_transpose_b_f16_into_f16_small_calls.mlir
@@ -0,0 +1,906 @@
+builtin.module @calls attributes {
+
+} {
+
+func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+
+func.func private @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_1x1xf16_times_1x1xf16_into_1x1xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_6x13xf16_times_3x13xf16_into_6x3xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_15x37xf16_times_7x37xf16_into_15x7xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_81x19xf16_times_41x19xf16_into_81x41xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_1x10xf16_times_10x10xf16_into_1x10xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_10x1xf16_times_10x1xf16_into_10x10xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_10x10xf16_times_1x10xf16_into_10x1xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_10x10xf16_times_1x10xf16_into_10x1xf16(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1_1_1_acc_0() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 1 : i64
+  %lhs_dim1 = arith.constant 1 : i64
+  %lhs_element_type = hal.element_type<f16> : i32
+  %lhs_seed = arith.constant 2 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 1 : i64
+  %rhs_element_type = hal.element_type<f16> : i32
+  %rhs_seed = arith.constant 3 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device,
i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf16_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1_1_1_2() attributes { + iree.reflection = {description = 
"Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xf16_times_1x1xf16_into_1x1xf16_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x1xf16_times_1x1xf16_into_1x1xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call 
@matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf16_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + 
+func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf16_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed 
= arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_8_8_8_acc_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf16_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 
+ %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, 
%acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf16_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xf16_times_3x13xf16_into_6x3xf16_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_6x13xf16_times_3x13xf16_into_6x3xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xf16_times_7x37xf16_into_15x7xf16_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_15x37xf16_times_7x37xf16_into_15x7xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = 
arith.constant 48 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xf16_times_41x19xf16_into_81x41xf16_81_19_41_acc_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xf16_times_41x19xf16_into_81x41xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1_10_10_acc_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get 
%device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf16_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%result = call @module.matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_1_10_10_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xf16_times_10x10xf16_into_1x10xf16_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xf16_times_10x10xf16_into_1x10xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x1xf16_times_10x1xf16_into_10x10xf16_10_1_10_acc_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 
+ %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x1xf16_times_10x1xf16_into_10x10xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_10_10_1_acc_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 70 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 70 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x10xf16_times_1x10xf16_into_10x1xf16_10_10_1_acc_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 71 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 72 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 73 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 73 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x10xf16_times_1x10xf16_into_10x1xf16(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16_10_10_1_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 74 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 75 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_10x10xf16_times_1x10xf16_into_10x1xf16_10_10_1_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 76 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, 
%lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 10 : i64
+  %rhs_element_type = hal.element_type<f16> : i32
+  %rhs_seed = arith.constant 77 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_10x10xf16_times_1x10xf16_into_10x1xf16(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 10 : i64
+  %k = arith.constant 10 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
diff --git a/linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_large.mlir b/linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_large.mlir
new file mode 100644
index 0000000..a5662d7
--- /dev/null
+++ b/linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_large.mlir
@@ -0,0 +1,136 @@
+func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_accumulate_512x128xf16_times_128x512xf16_into_512x512xf32(%lhs: tensor<512x128xf16>, %rhs: tensor<128x512xf16>, %acc: tensor<512x512xf32>) -> tensor<512x512xf32> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xf16>, tensor<128x512xf16>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32>
+  return %result: tensor<512x512xf32>
+}
+
+func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf16>
+  %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf16>
+  %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_512x128xf16_times_128x512xf16_into_512x512xf32(%lhs: tensor<512x128xf16>, %rhs: tensor<128x512xf16>) -> tensor<512x512xf32> {
+  %init_acc = tensor.empty() : tensor<512x512xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xf16>, tensor<128x512xf16>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32>
+  return %result: tensor<512x512xf32>
+}
+
+func.func @matmul_1000x4xf16_times_4x512xf16_into_1000x512xf32(%lhs: tensor<1000x4xf16>, %rhs: tensor<4x512xf16>) -> tensor<1000x512xf32> {
+  %init_acc = tensor.empty() : tensor<1000x512xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x512xf32>) -> tensor<1000x512xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x4xf16>, tensor<4x512xf16>) outs(%acc: tensor<1000x512xf32>) -> tensor<1000x512xf32>
+  return %result: tensor<1000x512xf32>
+}
+
+func.func @matmul_4x1000xf16_times_1000x512xf16_into_4x512xf32(%lhs: tensor<4x1000xf16>, %rhs: tensor<1000x512xf16>) -> tensor<4x512xf32> {
+  %init_acc = tensor.empty() : tensor<4x512xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<4x512xf32>) -> tensor<4x512xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<4x1000xf16>, tensor<1000x512xf16>) outs(%acc: tensor<4x512xf32>) -> tensor<4x512xf32>
+  return %result: tensor<4x512xf32>
+}
+
+func.func @matmul_512x1000xf16_times_1000x4xf16_into_512x4xf32(%lhs: tensor<512x1000xf16>, %rhs: tensor<1000x4xf16>) -> tensor<512x4xf32> {
+  %init_acc = tensor.empty() : tensor<512x4xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x4xf32>) -> tensor<512x4xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<512x1000xf16>, tensor<1000x4xf16>) outs(%acc: tensor<512x4xf32>) -> tensor<512x4xf32>
+  return %result: tensor<512x4xf32>
+}
+
+func.func @matmul_512x128xf16_times_128x500xf16_into_512x500xf32(%lhs: tensor<512x128xf16>, %rhs: tensor<128x500xf16>) -> tensor<512x500xf32> {
+  %init_acc = tensor.empty() : tensor<512x500xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x500xf32>) -> tensor<512x500xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xf16>, tensor<128x500xf16>) outs(%acc: tensor<512x500xf32>) -> tensor<512x500xf32>
+  return %result: tensor<512x500xf32>
+}
+
+func.func @matmul_457x330xf16_times_330x512xf16_into_457x512xf32(%lhs: tensor<457x330xf16>, %rhs: tensor<330x512xf16>) -> tensor<457x512xf32> {
+  %init_acc = tensor.empty() : tensor<457x512xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<457x330xf16>, tensor<330x512xf16>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32>
+  return %result: tensor<457x512xf32>
+}
+
+func.func @matmul_457x330xf16_times_330x514xf16_into_457x514xf32(%lhs: tensor<457x330xf16>, %rhs: tensor<330x514xf16>) -> tensor<457x514xf32> {
+  %init_acc = tensor.empty() : tensor<457x514xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x514xf32>) -> tensor<457x514xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<457x330xf16>, tensor<330x514xf16>) outs(%acc: tensor<457x514xf32>) -> tensor<457x514xf32>
+  return %result: tensor<457x514xf32>
+}
+
+func.func @matmul_438x330xf16_times_330x514xf16_into_438x514xf32(%lhs: tensor<438x330xf16>, %rhs: tensor<330x514xf16>) -> tensor<438x514xf32> {
+  %init_acc = tensor.empty() : tensor<438x514xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<438x514xf32>) -> tensor<438x514xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<438x330xf16>, tensor<330x514xf16>) outs(%acc: tensor<438x514xf32>) -> tensor<438x514xf32>
+  return %result: tensor<438x514xf32>
+}
+
+func.func @matmul_540x332xf16_times_332x516xf16_into_540x516xf32(%lhs: tensor<540x332xf16>, %rhs: tensor<332x516xf16>) -> tensor<540x516xf32> {
+  %init_acc = tensor.empty() : tensor<540x516xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<540x516xf32>) -> tensor<540x516xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<540x332xf16>, tensor<332x516xf16>) outs(%acc: tensor<540x516xf32>) -> tensor<540x516xf32>
+  return %result: tensor<540x516xf32>
+}
+
+func.func @matmul_654x321xf16_times_321x234xf16_into_654x234xf32(%lhs: tensor<654x321xf16>, %rhs: tensor<321x234xf16>) -> tensor<654x234xf32> {
+  %init_acc = tensor.empty() : tensor<654x234xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<654x234xf32>) -> tensor<654x234xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<654x321xf16>, tensor<321x234xf16>) outs(%acc: tensor<654x234xf32>) -> tensor<654x234xf32>
+  return %result: tensor<654x234xf32>
+}
+
+func.func @matmul_457x160xf16_times_160x512xf16_into_457x512xf32(%lhs: tensor<457x160xf16>, %rhs: tensor<160x512xf16>) -> tensor<457x512xf32> {
+  %init_acc = tensor.empty() : tensor<457x512xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<457x160xf16>, tensor<160x512xf16>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32>
+  return %result: tensor<457x512xf32>
+}
+
+func.func @matmul_512x330xf16_times_330x512xf16_into_512x512xf32(%lhs: tensor<512x330xf16>, %rhs: tensor<330x512xf16>) -> tensor<512x512xf32> {
+  %init_acc = tensor.empty() : tensor<512x512xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<512x330xf16>, tensor<330x512xf16>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32>
+  return %result: tensor<512x512xf32>
+}
+
+func.func @matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf32(%lhs: tensor<1x1000xf16>, %rhs: tensor<1000x1000xf16>, %acc: tensor<1x1000xf32>) -> tensor<1x1000xf32> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1000xf16>, tensor<1000x1000xf16>) outs(%acc: tensor<1x1000xf32>) -> tensor<1x1000xf32>
+  return %result: tensor<1x1000xf32>
+}
+
+func.func @matmul_accumulate_1000x1000xf16_times_1000x1xf16_into_1000x1xf32(%lhs: tensor<1000x1000xf16>, %rhs: tensor<1000x1xf16>, %acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x1000xf16>, tensor<1000x1xf16>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32>
+  return %result: tensor<1000x1xf32>
+}
+
+func.func @matmul_1000x1000xf16_times_1000x1xf16_into_1000x1xf32(%lhs: tensor<1000x1000xf16>, %rhs: tensor<1000x1xf16>) -> tensor<1000x1xf32> {
+  %init_acc = tensor.empty() : tensor<1000x1xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x1xf32>) -> tensor<1000x1xf32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x1000xf16>, tensor<1000x1xf16>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32>
+  return %result: tensor<1000x1xf32>
+}
+
diff --git a/linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_large_calls.mlir b/linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_large_calls.mlir
new file mode 100644
index 0000000..eb94823
--- /dev/null
+++ b/linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_large_calls.mlir
@@ -0,0 +1,882 @@
+builtin.module @calls attributes {
+
+} {
+
+func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+
+func.func private @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_512x128xf16_times_128x512xf16_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_512x128xf16_times_128x512xf16_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_1000x4xf16_times_4x512xf16_into_1000x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_4x1000xf16_times_1000x512xf16_into_4x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_512x1000xf16_times_1000x4xf16_into_512x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_512x128xf16_times_128x500xf16_into_512x500xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_457x330xf16_times_330x512xf16_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_457x330xf16_times_330x514xf16_into_457x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_438x330xf16_times_330x514xf16_into_438x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_540x332xf16_times_332x516xf16_into_540x516xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_654x321xf16_times_321x234xf16_into_654x234xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_457x160xf16_times_160x512xf16_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_512x330xf16_times_330x512xf16_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_1000x1000xf16_times_1000x1xf16_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_1000x1000xf16_times_1000x1xf16_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_512_128_512_acc_0() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 512 : i64
+  %lhs_dim1 = arith.constant 128 : i64
+  %lhs_element_type = hal.element_type<f16> : i32
+  %lhs_seed = arith.constant 2 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+
%rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xf16_times_128x512xf16_into_512x512xf32_512_128_512_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xf16_times_128x512xf16_into_512x512xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + 
call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xf16_times_128x512xf16_into_512x512xf32_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf16_times_128x512xf16_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + 
%lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xf16_times_4x512xf16_into_1000x512xf32_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xf16_times_4x512xf16_into_1000x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : 
!hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xf16_times_1000x512xf16_into_4x512xf32_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xf16_times_1000x512xf16_into_4x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 21 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_512x1000xf16_times_1000x4xf16_into_512x4xf32_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x1000xf16_times_1000x4xf16_into_512x4xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xf16_times_128x500xf16_into_512x500xf32_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view 
+ %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf16_times_128x500xf16_into_512x500xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf16_times_330x512xf16_into_457x512xf32_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf16_times_330x512xf16_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 
+ %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf16_times_330x514xf16_into_457x514xf32_457_330_514_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf16_times_330x514xf16_into_457x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + 
%lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xf16_times_330x514xf16_into_438x514xf32_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xf16_times_330x514xf16_into_438x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, 
%rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xf16_times_332x516xf16_into_540x516xf32_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xf16_times_332x516xf16_into_540x516xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xf16_times_321x234xf16_into_654x234xf32_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xf16_times_321x234xf16_into_654x234xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xf16_times_160x512xf16_into_457x512xf32_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 50 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, 
%lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xf16_times_160x512xf16_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x330xf16_times_330x512xf16_into_512x512xf32_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x330xf16_times_330x512xf16_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, 
!hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf32_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_accumulate_1000x1000xf16_times_1000x1xf16_into_1000x1xf32_1000_1000_1_acc_29() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xf16_times_1000x1xf16_into_1000x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1000_1000_1_30() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : 
(!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x1000xf16_times_1000x1xf16_into_1000x1xf32_1000_1000_1_31() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type<f16> : i32 + %lhs_seed = arith.constant 70 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type<f16> : i32 + %rhs_seed = arith.constant 71 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x1000xf16_times_1000x1xf16_into_1000x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_small.mlir b/linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_small.mlir new file mode 100644 index 0000000..e70d950 --- /dev/null +++ b/linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_small.mlir @@ -0,0 +1,99 @@ +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf32(%lhs: tensor<1x1xf16>, %rhs: tensor<1x1xf16>, %acc: tensor<1x1xf32>) -> tensor<1x1xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1xf16>, tensor<1x1xf16>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32> + return %result: tensor<1x1xf32> +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>) -> tensor<?x?xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf16> + %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf16> + %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_1x1xf16_times_1x1xf16_into_1x1xf32(%lhs: tensor<1x1xf16>, %rhs: tensor<1x1xf16>) -> tensor<1x1xf32> { + %init_acc = tensor.empty() : tensor<1x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x1xf32>) -> tensor<1x1xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1xf16>, tensor<1x1xf16>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32> + return %result: tensor<1x1xf32> +} + +func.func
@matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf32(%lhs: tensor<2x2xf16>, %rhs: tensor<2x2xf16>, %acc: tensor<2x2xf32>) -> tensor<2x2xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<2x2xf16>, tensor<2x2xf16>) outs(%acc: tensor<2x2xf32>) -> tensor<2x2xf32> + return %result: tensor<2x2xf32> +} + +func.func @matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf32(%lhs: tensor<4x4xf16>, %rhs: tensor<4x4xf16>, %acc: tensor<4x4xf32>) -> tensor<4x4xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<4x4xf16>, tensor<4x4xf16>) outs(%acc: tensor<4x4xf32>) -> tensor<4x4xf32> + return %result: tensor<4x4xf32> +} + +func.func @matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf32(%lhs: tensor<8x8xf16>, %rhs: tensor<8x8xf16>, %acc: tensor<8x8xf32>) -> tensor<8x8xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<8x8xf16>, tensor<8x8xf16>) outs(%acc: tensor<8x8xf32>) -> tensor<8x8xf32> + return %result: tensor<8x8xf32> +} + +func.func @matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf32(%lhs: tensor<9x9xf16>, %rhs: tensor<9x9xf16>, %acc: tensor<9x9xf32>) -> tensor<9x9xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<9x9xf16>, tensor<9x9xf16>) outs(%acc: tensor<9x9xf32>) -> tensor<9x9xf32> + return %result: tensor<9x9xf32> +} + +func.func @matmul_accumulate_6x13xf16_times_13x3xf16_into_6x3xf32(%lhs: tensor<6x13xf16>, %rhs: tensor<13x3xf16>, %acc: tensor<6x3xf32>) -> tensor<6x3xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<6x13xf16>, tensor<13x3xf16>) outs(%acc: tensor<6x3xf32>) -> tensor<6x3xf32> + return %result: tensor<6x3xf32> +} + +func.func @matmul_15x37xf16_times_37x7xf16_into_15x7xf32(%lhs: tensor<15x37xf16>, %rhs: tensor<37x7xf16>) -> tensor<15x7xf32> { + %init_acc = tensor.empty() : tensor<15x7xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<15x7xf32>) -> tensor<15x7xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<15x37xf16>, tensor<37x7xf16>) outs(%acc: tensor<15x7xf32>) -> tensor<15x7xf32> + return %result: tensor<15x7xf32> +} + +func.func @matmul_accumulate_81x19xf16_times_19x41xf16_into_81x41xf32(%lhs: tensor<81x19xf16>, %rhs: tensor<19x41xf16>, %acc: tensor<81x41xf32>) -> tensor<81x41xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<81x19xf16>, tensor<19x41xf16>) outs(%acc: tensor<81x41xf32>) -> tensor<81x41xf32> + return %result: tensor<81x41xf32> +} + +func.func @matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf32(%lhs: tensor<1x10xf16>, %rhs: tensor<10x10xf16>, %acc: tensor<1x10xf32>) -> tensor<1x10xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x10xf16>, tensor<10x10xf16>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32> + return %result: tensor<1x10xf32> +} + +func.func @matmul_1x10xf16_times_10x10xf16_into_1x10xf32(%lhs: tensor<1x10xf16>, %rhs: tensor<10x10xf16>) -> tensor<1x10xf32> { + %init_acc = tensor.empty() : tensor<1x10xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x10xf32>) -> tensor<1x10xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x10xf16>, tensor<10x10xf16>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32> + return %result: tensor<1x10xf32> +} + +func.func @matmul_accumulate_10x1xf16_times_1x10xf16_into_10x10xf32(%lhs: tensor<10x1xf16>, %rhs: tensor<1x10xf16>, %acc: tensor<10x10xf32>) -> tensor<10x10xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x1xf16>, tensor<1x10xf16>) outs(%acc: tensor<10x10xf32>) -> tensor<10x10xf32> + return %result: tensor<10x10xf32> 
+} + +func.func @matmul_accumulate_10x10xf16_times_10x1xf16_into_10x1xf32(%lhs: tensor<10x10xf16>, %rhs: tensor<10x1xf16>, %acc: tensor<10x1xf32>) -> tensor<10x1xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x10xf16>, tensor<10x1xf16>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32> + return %result: tensor<10x1xf32> +} + +func.func @matmul_10x10xf16_times_10x1xf16_into_10x1xf32(%lhs: tensor<10x10xf16>, %rhs: tensor<10x1xf16>) -> tensor<10x1xf32> { + %init_acc = tensor.empty() : tensor<10x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<10x1xf32>) -> tensor<10x1xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x10xf16>, tensor<10x1xf16>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32> + return %result: tensor<10x1xf32> +} + diff --git a/linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_small_calls.mlir b/linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_small_calls.mlir new file mode 100644 index 0000000..051ac2b --- /dev/null +++ b/linalg_ops/matmul/generated/f16_into_f32/matmul_f16_into_f32_small_calls.mlir @@ -0,0 +1,906 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x1xf16_times_1x1xf16_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_6x13xf16_times_13x3xf16_into_6x3xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_15x37xf16_times_37x7xf16_into_15x7xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_81x19xf16_times_19x41xf16_into_81x41xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: 
!hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x10xf16_times_10x10xf16_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x1xf16_times_1x10xf16_into_10x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x10xf16_times_10x1xf16_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_10x10xf16_times_10x1xf16_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1_1_1_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf32_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + 
%rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1_1_1_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xf16_times_1x1xf16_into_1x1xf32_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + 
%rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x1xf16_times_1x1xf16_into_1x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf32_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, 
%n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf32_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_8_8_8_acc_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf32_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf32_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = 
arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xf16_times_13x3xf16_into_6x3xf32_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = 
arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_6x13xf16_times_13x3xf16_into_6x3xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xf16_times_37x7xf16_into_15x7xf32_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call 
@module.matmul_15x37xf16_times_37x7xf16_into_15x7xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 48 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xf16_times_19x41xf16_into_81x41xf32_81_19_41_acc_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : 
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xf16_times_19x41xf16_into_81x41xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1_10_10_acc_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf32_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1_10_10_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xf16_times_10x10xf16_into_1x10xf32_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xf16_times_10x10xf16_into_1x10xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, 
%transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x1xf16_times_1x10xf16_into_10x10xf32_10_1_10_acc_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x1xf16_times_1x10xf16_into_10x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_10_10_1_acc_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 70 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, 
i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 70 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x10xf16_times_10x1xf16_into_10x1xf32_10_10_1_acc_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 71 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 72 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 73 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 73 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x10xf16_times_10x1xf16_into_10x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_10_10_1_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 74 : 
i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 75 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_10x10xf16_times_10x1xf16_into_10x1xf32_10_10_1_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 76 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 77 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_10x10xf16_times_10x1xf16_into_10x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_large.mlir b/linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_large.mlir new file mode 100644 index 0000000..fb930f7 --- /dev/null +++ b/linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_large.mlir @@ -0,0 +1,136 @@ +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_accumulate_512x128xf16_times_512x128xf16_into_512x512xf32(%lhs: tensor<512x128xf16>, %rhs: tensor<512x128xf16>, %acc: tensor<512x512xf32>) -> tensor<512x512xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xf16>, tensor<512x128xf16>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func
@matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>) -> tensor<?x?xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf16> + %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf16> + %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_512x128xf16_times_512x128xf16_into_512x512xf32(%lhs: tensor<512x128xf16>, %rhs: tensor<512x128xf16>) -> tensor<512x512xf32> { + %init_acc = tensor.empty() : tensor<512x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xf16>, tensor<512x128xf16>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_1000x4xf16_times_512x4xf16_into_1000x512xf32(%lhs: tensor<1000x4xf16>, %rhs: tensor<512x4xf16>) -> tensor<1000x512xf32> { + %init_acc = tensor.empty() : tensor<1000x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x512xf32>) -> tensor<1000x512xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x4xf16>, tensor<512x4xf16>) outs(%acc: tensor<1000x512xf32>) -> tensor<1000x512xf32> + return %result: tensor<1000x512xf32> +} + +func.func @matmul_4x1000xf16_times_512x1000xf16_into_4x512xf32(%lhs: tensor<4x1000xf16>, %rhs: tensor<512x1000xf16>) -> tensor<4x512xf32> { + %init_acc = tensor.empty() : tensor<4x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<4x512xf32>) -> tensor<4x512xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<4x1000xf16>, tensor<512x1000xf16>) outs(%acc: tensor<4x512xf32>) -> tensor<4x512xf32> + return %result: tensor<4x512xf32> +} + +func.func @matmul_512x1000xf16_times_4x1000xf16_into_512x4xf32(%lhs: tensor<512x1000xf16>, %rhs: tensor<4x1000xf16>) -> tensor<512x4xf32> { + %init_acc = tensor.empty() : tensor<512x4xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x4xf32>) -> tensor<512x4xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x1000xf16>, tensor<4x1000xf16>) outs(%acc: tensor<512x4xf32>) -> tensor<512x4xf32> + return %result: tensor<512x4xf32> +} + +func.func @matmul_512x128xf16_times_500x128xf16_into_512x500xf32(%lhs: tensor<512x128xf16>, %rhs: tensor<500x128xf16>) -> tensor<512x500xf32> { + %init_acc = tensor.empty() : tensor<512x500xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x500xf32>) -> tensor<512x500xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xf16>, tensor<500x128xf16>) outs(%acc: tensor<512x500xf32>) -> tensor<512x500xf32> + return %result: tensor<512x500xf32> +} + +func.func @matmul_457x330xf16_times_512x330xf16_into_457x512xf32(%lhs: tensor<457x330xf16>, %rhs: tensor<512x330xf16>) -> tensor<457x512xf32> { + %init_acc = tensor.empty() : tensor<457x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32> +
%result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x330xf16>, tensor<512x330xf16>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32> + return %result: tensor<457x512xf32> +} + +func.func @matmul_457x330xf16_times_514x330xf16_into_457x514xf32(%lhs: tensor<457x330xf16>, %rhs: tensor<514x330xf16>) -> tensor<457x514xf32> { + %init_acc = tensor.empty() : tensor<457x514xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x514xf32>) -> tensor<457x514xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x330xf16>, tensor<514x330xf16>) outs(%acc: tensor<457x514xf32>) -> tensor<457x514xf32> + return %result: tensor<457x514xf32> +} + +func.func @matmul_438x330xf16_times_514x330xf16_into_438x514xf32(%lhs: tensor<438x330xf16>, %rhs: tensor<514x330xf16>) -> tensor<438x514xf32> { + %init_acc = tensor.empty() : tensor<438x514xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<438x514xf32>) -> tensor<438x514xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<438x330xf16>, tensor<514x330xf16>) outs(%acc: tensor<438x514xf32>) -> tensor<438x514xf32> + return %result: tensor<438x514xf32> +} + +func.func @matmul_540x332xf16_times_516x332xf16_into_540x516xf32(%lhs: tensor<540x332xf16>, %rhs: tensor<516x332xf16>) -> tensor<540x516xf32> { + %init_acc = tensor.empty() : tensor<540x516xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<540x516xf32>) -> tensor<540x516xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<540x332xf16>, tensor<516x332xf16>) outs(%acc: tensor<540x516xf32>) -> tensor<540x516xf32> + return %result: tensor<540x516xf32> +} + +func.func @matmul_654x321xf16_times_234x321xf16_into_654x234xf32(%lhs: tensor<654x321xf16>, %rhs: tensor<234x321xf16>) -> tensor<654x234xf32> { + %init_acc = tensor.empty() : tensor<654x234xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<654x234xf32>) -> tensor<654x234xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<654x321xf16>, tensor<234x321xf16>) outs(%acc: tensor<654x234xf32>) -> tensor<654x234xf32> + return %result: tensor<654x234xf32> +} + +func.func @matmul_457x160xf16_times_512x160xf16_into_457x512xf32(%lhs: tensor<457x160xf16>, %rhs: tensor<512x160xf16>) -> tensor<457x512xf32> { + %init_acc = tensor.empty() : tensor<457x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x160xf16>, tensor<512x160xf16>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32> + return %result: tensor<457x512xf32> +} + +func.func @matmul_512x330xf16_times_512x330xf16_into_512x512xf32(%lhs: tensor<512x330xf16>, %rhs: tensor<512x330xf16>) -> tensor<512x512xf32> { + %init_acc = tensor.empty() : tensor<512x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x330xf16>, tensor<512x330xf16>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf32(%lhs: tensor<1x1000xf16>, %rhs: tensor<1000x1000xf16>, %acc: 
tensor<1x1000xf32>) -> tensor<1x1000xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1000xf16>, tensor<1000x1000xf16>) outs(%acc: tensor<1x1000xf32>) -> tensor<1x1000xf32> + return %result: tensor<1x1000xf32> +} + +func.func @matmul_accumulate_1000x1000xf16_times_1x1000xf16_into_1000x1xf32(%lhs: tensor<1000x1000xf16>, %rhs: tensor<1x1000xf16>, %acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x1000xf16>, tensor<1x1000xf16>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> + return %result: tensor<1000x1xf32> +} + +func.func @matmul_1000x1000xf16_times_1x1000xf16_into_1000x1xf32(%lhs: tensor<1000x1000xf16>, %rhs: tensor<1x1000xf16>) -> tensor<1000x1xf32> { + %init_acc = tensor.empty() : tensor<1000x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x1xf32>) -> tensor<1000x1xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x1000xf16>, tensor<1x1000xf16>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> + return %result: tensor<1000x1xf32> +} + diff --git a/linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_large_calls.mlir b/linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_large_calls.mlir new file mode 100644 index 0000000..e644cc3 --- /dev/null +++ b/linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_large_calls.mlir @@ -0,0 +1,882 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_512x128xf16_times_512x128xf16_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf16_times_512x128xf16_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x4xf16_times_512x4xf16_into_1000x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_4x1000xf16_times_512x1000xf16_into_4x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x1000xf16_times_4x1000xf16_into_512x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf16_times_500x128xf16_into_512x500xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf16_times_512x330xf16_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf16_times_514x330xf16_into_457x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private 
@module.matmul_438x330xf16_times_514x330xf16_into_438x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_540x332xf16_times_516x332xf16_into_540x516xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_654x321xf16_times_234x321xf16_into_654x234xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x160xf16_times_512x160xf16_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x330xf16_times_512x330xf16_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1000x1000xf16_times_1x1000xf16_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x1000xf16_times_1x1000xf16_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_512_128_512_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xf16_times_512x128xf16_into_512x512xf32_512_128_512_acc_1() attributes { 
+ iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xf16_times_512x128xf16_into_512x512xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) 
-> () + return +} + +func.func @matmul_512x128xf16_times_512x128xf16_into_512x512xf32_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf16_times_512x128xf16_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xf16_times_512x4xf16_into_1000x512xf32_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, 
i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xf16_times_512x4xf16_into_1000x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xf16_times_512x1000xf16_into_4x512xf32_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xf16_times_512x1000xf16_into_4x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 
1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 21 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x1000xf16_times_4x1000xf16_into_512x4xf32_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x1000xf16_times_4x1000xf16_into_512x4xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + 
%lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xf16_times_500x128xf16_into_512x500xf32_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf16_times_500x128xf16_into_512x500xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, 
%rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf16_times_512x330xf16_into_457x512xf32_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf16_times_512x330xf16_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf16_times_514x330xf16_into_457x514xf32_457_330_514_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf16_times_514x330xf16_into_457x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xf16_times_514x330xf16_into_438x514xf32_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, 
%lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xf16_times_514x330xf16_into_438x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xf16_times_516x332xf16_into_540x516xf32_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xf16_times_516x332xf16_into_540x516xf32(%lhs, %rhs) : (!hal.buffer_view, 
!hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xf16_times_234x321xf16_into_654x234xf32_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xf16_times_234x321xf16_into_654x234xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} 
{ + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xf16_times_512x160xf16_into_457x512xf32_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 50 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xf16_times_512x160xf16_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 
53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x330xf16_times_512x330xf16_into_512x512xf32_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x330xf16_times_512x330xf16_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = 
arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf32_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xf16_times_1000x1000xf16_into_1x1000xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1000x1000xf16_times_1x1000xf16_into_1000x1xf32_1000_1000_1_acc_29() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xf16_times_1x1000xf16_into_1000x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) 
-> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1000_1000_1_30() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x1000xf16_times_1x1000xf16_into_1000x1xf32_1000_1000_1_31() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 70 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 71 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x1000xf16_times_1x1000xf16_into_1000x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_small.mlir b/linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_small.mlir 
new file mode 100644 index 0000000..70ed2e4 --- /dev/null +++ b/linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_small.mlir @@ -0,0 +1,99 @@ +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf32(%lhs: tensor<1x1xf16>, %rhs: tensor<1x1xf16>, %acc: tensor<1x1xf32>) -> tensor<1x1xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1xf16>, tensor<1x1xf16>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32> + return %result: tensor<1x1xf32> +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>) -> tensor<?x?xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf16> + %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf16> + %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xf16>, tensor<?x?xf16>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_1x1xf16_times_1x1xf16_into_1x1xf32(%lhs: tensor<1x1xf16>, %rhs: tensor<1x1xf16>) -> tensor<1x1xf32> { + %init_acc = tensor.empty() : tensor<1x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x1xf32>) -> tensor<1x1xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1xf16>, tensor<1x1xf16>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32> + return %result: tensor<1x1xf32> +} + +func.func @matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf32(%lhs: tensor<2x2xf16>, %rhs: tensor<2x2xf16>, %acc: tensor<2x2xf32>) -> tensor<2x2xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<2x2xf16>, tensor<2x2xf16>) outs(%acc: tensor<2x2xf32>) -> tensor<2x2xf32> + return %result: tensor<2x2xf32> +} + +func.func @matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf32(%lhs: tensor<4x4xf16>, %rhs: tensor<4x4xf16>, %acc: tensor<4x4xf32>) -> tensor<4x4xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<4x4xf16>, tensor<4x4xf16>) outs(%acc: tensor<4x4xf32>) -> tensor<4x4xf32> + return %result: tensor<4x4xf32> +} + +func.func @matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf32(%lhs: tensor<8x8xf16>, %rhs: tensor<8x8xf16>, %acc: tensor<8x8xf32>) -> tensor<8x8xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<8x8xf16>, tensor<8x8xf16>) outs(%acc: tensor<8x8xf32>) -> tensor<8x8xf32> + return %result: tensor<8x8xf32> +} + +func.func @matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf32(%lhs: tensor<9x9xf16>, %rhs: tensor<9x9xf16>, %acc: tensor<9x9xf32>) -> tensor<9x9xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<9x9xf16>, tensor<9x9xf16>) outs(%acc: tensor<9x9xf32>) -> tensor<9x9xf32> + return %result: tensor<9x9xf32> +} + +func.func @matmul_accumulate_6x13xf16_times_3x13xf16_into_6x3xf32(%lhs: tensor<6x13xf16>, %rhs: tensor<3x13xf16>, %acc: tensor<6x3xf32>) -> tensor<6x3xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<6x13xf16>, tensor<3x13xf16>) outs(%acc: tensor<6x3xf32>) -> tensor<6x3xf32> + return %result: tensor<6x3xf32> +} + +func.func @matmul_15x37xf16_times_7x37xf16_into_15x7xf32(%lhs: tensor<15x37xf16>, %rhs: 
tensor<7x37xf16>) -> tensor<15x7xf32> { + %init_acc = tensor.empty() : tensor<15x7xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<15x7xf32>) -> tensor<15x7xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<15x37xf16>, tensor<7x37xf16>) outs(%acc: tensor<15x7xf32>) -> tensor<15x7xf32> + return %result: tensor<15x7xf32> +} + +func.func @matmul_accumulate_81x19xf16_times_41x19xf16_into_81x41xf32(%lhs: tensor<81x19xf16>, %rhs: tensor<41x19xf16>, %acc: tensor<81x41xf32>) -> tensor<81x41xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<81x19xf16>, tensor<41x19xf16>) outs(%acc: tensor<81x41xf32>) -> tensor<81x41xf32> + return %result: tensor<81x41xf32> +} + +func.func @matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf32(%lhs: tensor<1x10xf16>, %rhs: tensor<10x10xf16>, %acc: tensor<1x10xf32>) -> tensor<1x10xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x10xf16>, tensor<10x10xf16>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32> + return %result: tensor<1x10xf32> +} + +func.func @matmul_1x10xf16_times_10x10xf16_into_1x10xf32(%lhs: tensor<1x10xf16>, %rhs: tensor<10x10xf16>) -> tensor<1x10xf32> { + %init_acc = tensor.empty() : tensor<1x10xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x10xf32>) -> tensor<1x10xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x10xf16>, tensor<10x10xf16>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32> + return %result: tensor<1x10xf32> +} + +func.func @matmul_accumulate_10x1xf16_times_10x1xf16_into_10x10xf32(%lhs: tensor<10x1xf16>, %rhs: tensor<10x1xf16>, %acc: tensor<10x10xf32>) -> tensor<10x10xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x1xf16>, tensor<10x1xf16>) outs(%acc: tensor<10x10xf32>) -> tensor<10x10xf32> + return %result: tensor<10x10xf32> +} + +func.func @matmul_accumulate_10x10xf16_times_1x10xf16_into_10x1xf32(%lhs: tensor<10x10xf16>, %rhs: tensor<1x10xf16>, %acc: tensor<10x1xf32>) -> tensor<10x1xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x10xf16>, tensor<1x10xf16>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32> + return %result: tensor<10x1xf32> +} + +func.func @matmul_10x10xf16_times_1x10xf16_into_10x1xf32(%lhs: tensor<10x10xf16>, %rhs: tensor<1x10xf16>) -> tensor<10x1xf32> { + %init_acc = tensor.empty() : tensor<10x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<10x1xf32>) -> tensor<10x1xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x10xf16>, tensor<1x10xf16>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32> + return %result: tensor<10x1xf32> +} + diff --git a/linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_small_calls.mlir b/linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_small_calls.mlir new file mode 100644 index 0000000..27ee342 --- /dev/null +++ b/linalg_ops/matmul/generated/f16_into_f32/matmul_transpose_b_f16_into_f32_small_calls.mlir @@ -0,0 +1,906 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, 
%actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x1xf16_times_1x1xf16_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_6x13xf16_times_3x13xf16_into_6x3xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_15x37xf16_times_7x37xf16_into_15x7xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_81x19xf16_times_41x19xf16_into_81x41xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x10xf16_times_10x10xf16_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x1xf16_times_10x1xf16_into_10x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x10xf16_times_1x10xf16_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_10x10xf16_times_1x10xf16_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1_1_1_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, 
i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf32_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xf16_times_1x1xf16_into_1x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1_1_1_2() attributes { + iree.reflection = {description = 
"Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xf16_times_1x1xf16_into_1x1xf32_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x1xf16_times_1x1xf16_into_1x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call 
@matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf32_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xf16_times_2x2xf16_into_2x2xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + 
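+// Every test entry point below follows the same generated pattern: select device 0,
+// build deterministically seeded inputs with @matmul_test.generate_random_matrix,
+// call the compiled matmul under test from @module, and verify the returned buffer
+// with @matmul_test.check_matmul_results (transpose_rhs = 1, i.e. the rhs buffer is
+// laid out NxK). The *_acc_* accumulate variants generate the accumulator twice with
+// the same seed so that the checker receives an untouched copy of the input %acc.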
+func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf32_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed 
= arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_4x4xf16_times_4x4xf16_into_4x4xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_8_8_8_acc_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf32_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 
+ %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xf16_times_8x8xf16_into_8x8xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, 
%acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf32_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xf16_times_9x9xf16_into_9x9xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xf16_times_3x13xf16_into_6x3xf32_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_6x13xf16_times_3x13xf16_into_6x3xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xf16_times_7x37xf16_into_15x7xf32_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_15x37xf16_times_7x37xf16_into_15x7xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = 
arith.constant 48 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xf16_times_41x19xf16_into_81x41xf32_81_19_41_acc_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xf16_times_41x19xf16_into_81x41xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1_10_10_acc_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get 
%device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf32_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%result = call @module.matmul_accumulate_1x10xf16_times_10x10xf16_into_1x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_1_10_10_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xf16_times_10x10xf16_into_1x10xf32_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xf16_times_10x10xf16_into_1x10xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x1xf16_times_10x1xf16_into_10x10xf32_10_1_10_acc_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 
+ %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x1xf16_times_10x1xf16_into_10x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_10_10_1_acc_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 70 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 70 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x10xf16_times_1x10xf16_into_10x1xf32_10_10_1_acc_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 71 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 72 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 73 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 73 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x10xf16_times_1x10xf16_into_10x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32_10_10_1_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 74 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 75 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf16_times_DYNxDYNxf16_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_10x10xf16_times_1x10xf16_into_10x1xf32_10_10_1_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 76 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, 
%lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type<f16> : i32 + %rhs_seed = arith.constant 77 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_10x10xf16_times_1x10xf16_into_10x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_large.mlir b/linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_large.mlir new file mode 100644 index 0000000..c68c1d1 --- /dev/null +++ b/linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_large.mlir @@ -0,0 +1,136 @@ +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf32>, tensor<?x?xf32>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_accumulate_512x128xf32_times_128x512xf32_into_512x512xf32(%lhs: tensor<512x128xf32>, %rhs: tensor<128x512xf32>, %acc: tensor<512x512xf32>) -> tensor<512x512xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xf32>, tensor<128x512xf32>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>) -> tensor<?x?xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf32> + %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf32> + %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf32>, tensor<?x?xf32>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_512x128xf32_times_128x512xf32_into_512x512xf32(%lhs: tensor<512x128xf32>, %rhs: tensor<128x512xf32>) -> tensor<512x512xf32> { + %init_acc = tensor.empty() : tensor<512x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xf32>, tensor<128x512xf32>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_1000x4xf32_times_4x512xf32_into_1000x512xf32(%lhs: tensor<1000x4xf32>, %rhs: tensor<4x512xf32>) -> tensor<1000x512xf32> { + %init_acc = tensor.empty() : tensor<1000x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x512xf32>) -> tensor<1000x512xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x4xf32>, tensor<4x512xf32>) outs(%acc: tensor<1000x512xf32>) -> tensor<1000x512xf32> + return %result: tensor<1000x512xf32> +} + +func.func
@matmul_4x1000xf32_times_1000x512xf32_into_4x512xf32(%lhs: tensor<4x1000xf32>, %rhs: tensor<1000x512xf32>) -> tensor<4x512xf32> { + %init_acc = tensor.empty() : tensor<4x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<4x512xf32>) -> tensor<4x512xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<4x1000xf32>, tensor<1000x512xf32>) outs(%acc: tensor<4x512xf32>) -> tensor<4x512xf32> + return %result: tensor<4x512xf32> +} + +func.func @matmul_512x1000xf32_times_1000x4xf32_into_512x4xf32(%lhs: tensor<512x1000xf32>, %rhs: tensor<1000x4xf32>) -> tensor<512x4xf32> { + %init_acc = tensor.empty() : tensor<512x4xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x4xf32>) -> tensor<512x4xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x1000xf32>, tensor<1000x4xf32>) outs(%acc: tensor<512x4xf32>) -> tensor<512x4xf32> + return %result: tensor<512x4xf32> +} + +func.func @matmul_512x128xf32_times_128x500xf32_into_512x500xf32(%lhs: tensor<512x128xf32>, %rhs: tensor<128x500xf32>) -> tensor<512x500xf32> { + %init_acc = tensor.empty() : tensor<512x500xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x500xf32>) -> tensor<512x500xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xf32>, tensor<128x500xf32>) outs(%acc: tensor<512x500xf32>) -> tensor<512x500xf32> + return %result: tensor<512x500xf32> +} + +func.func @matmul_457x330xf32_times_330x512xf32_into_457x512xf32(%lhs: tensor<457x330xf32>, %rhs: tensor<330x512xf32>) -> tensor<457x512xf32> { + %init_acc = tensor.empty() : tensor<457x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x330xf32>, tensor<330x512xf32>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32> + return %result: tensor<457x512xf32> +} + +func.func @matmul_457x330xf32_times_330x514xf32_into_457x514xf32(%lhs: tensor<457x330xf32>, %rhs: tensor<330x514xf32>) -> tensor<457x514xf32> { + %init_acc = tensor.empty() : tensor<457x514xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x514xf32>) -> tensor<457x514xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x330xf32>, tensor<330x514xf32>) outs(%acc: tensor<457x514xf32>) -> tensor<457x514xf32> + return %result: tensor<457x514xf32> +} + +func.func @matmul_438x330xf32_times_330x514xf32_into_438x514xf32(%lhs: tensor<438x330xf32>, %rhs: tensor<330x514xf32>) -> tensor<438x514xf32> { + %init_acc = tensor.empty() : tensor<438x514xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<438x514xf32>) -> tensor<438x514xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<438x330xf32>, tensor<330x514xf32>) outs(%acc: tensor<438x514xf32>) -> tensor<438x514xf32> + return %result: tensor<438x514xf32> +} + +func.func @matmul_540x332xf32_times_332x516xf32_into_540x516xf32(%lhs: tensor<540x332xf32>, %rhs: tensor<332x516xf32>) -> tensor<540x516xf32> { + %init_acc = tensor.empty() : tensor<540x516xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<540x516xf32>) -> tensor<540x516xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<540x332xf32>, tensor<332x516xf32>) outs(%acc: 
tensor<540x516xf32>) -> tensor<540x516xf32> + return %result: tensor<540x516xf32> +} + +func.func @matmul_654x321xf32_times_321x234xf32_into_654x234xf32(%lhs: tensor<654x321xf32>, %rhs: tensor<321x234xf32>) -> tensor<654x234xf32> { + %init_acc = tensor.empty() : tensor<654x234xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<654x234xf32>) -> tensor<654x234xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<654x321xf32>, tensor<321x234xf32>) outs(%acc: tensor<654x234xf32>) -> tensor<654x234xf32> + return %result: tensor<654x234xf32> +} + +func.func @matmul_457x160xf32_times_160x512xf32_into_457x512xf32(%lhs: tensor<457x160xf32>, %rhs: tensor<160x512xf32>) -> tensor<457x512xf32> { + %init_acc = tensor.empty() : tensor<457x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x160xf32>, tensor<160x512xf32>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32> + return %result: tensor<457x512xf32> +} + +func.func @matmul_512x330xf32_times_330x512xf32_into_512x512xf32(%lhs: tensor<512x330xf32>, %rhs: tensor<330x512xf32>) -> tensor<512x512xf32> { + %init_acc = tensor.empty() : tensor<512x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x330xf32>, tensor<330x512xf32>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_accumulate_1x1000xf32_times_1000x1000xf32_into_1x1000xf32(%lhs: tensor<1x1000xf32>, %rhs: tensor<1000x1000xf32>, %acc: tensor<1x1000xf32>) -> tensor<1x1000xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1000xf32>, tensor<1000x1000xf32>) outs(%acc: tensor<1x1000xf32>) -> tensor<1x1000xf32> + return %result: tensor<1x1000xf32> +} + +func.func @matmul_accumulate_1000x1000xf32_times_1000x1xf32_into_1000x1xf32(%lhs: tensor<1000x1000xf32>, %rhs: tensor<1000x1xf32>, %acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x1000xf32>, tensor<1000x1xf32>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> + return %result: tensor<1000x1xf32> +} + +func.func @matmul_1000x1000xf32_times_1000x1xf32_into_1000x1xf32(%lhs: tensor<1000x1000xf32>, %rhs: tensor<1000x1xf32>) -> tensor<1000x1xf32> { + %init_acc = tensor.empty() : tensor<1000x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x1xf32>) -> tensor<1000x1xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x1000xf32>, tensor<1000x1xf32>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> + return %result: tensor<1000x1xf32> +} + diff --git a/linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_large_calls.mlir b/linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_large_calls.mlir new file mode 100644 index 0000000..28136c6 --- /dev/null +++ b/linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_large_calls.mlir @@ -0,0 +1,882 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: 
i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_512x128xf32_times_128x512xf32_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf32_times_128x512xf32_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x4xf32_times_4x512xf32_into_1000x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_4x1000xf32_times_1000x512xf32_into_4x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x1000xf32_times_1000x4xf32_into_512x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf32_times_128x500xf32_into_512x500xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf32_times_330x512xf32_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf32_times_330x514xf32_into_457x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_438x330xf32_times_330x514xf32_into_438x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_540x332xf32_times_332x516xf32_into_540x516xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_654x321xf32_times_321x234xf32_into_654x234xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x160xf32_times_160x512xf32_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x330xf32_times_330x512xf32_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1000xf32_times_1000x1000xf32_into_1x1000xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1000x1000xf32_times_1000x1xf32_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x1000xf32_times_1000x1xf32_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_512_128_512_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xf32_times_128x512xf32_into_512x512xf32_512_128_512_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xf32_times_128x512xf32_into_512x512xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + 
call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xf32_times_128x512xf32_into_512x512xf32_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf32_times_128x512xf32_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + 
%lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xf32_times_4x512xf32_into_1000x512xf32_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xf32_times_4x512xf32_into_1000x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : 
!hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xf32_times_1000x512xf32_into_4x512xf32_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xf32_times_1000x512xf32_into_4x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 21 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_512x1000xf32_times_1000x4xf32_into_512x4xf32_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x1000xf32_times_1000x4xf32_into_512x4xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xf32_times_128x500xf32_into_512x500xf32_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view 
+ %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf32_times_128x500xf32_into_512x500xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf32_times_330x512xf32_into_457x512xf32_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf32_times_330x512xf32_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 
+ %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf32_times_330x514xf32_into_457x514xf32_457_330_514_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf32_times_330x514xf32_into_457x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + 
%lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xf32_times_330x514xf32_into_438x514xf32_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xf32_times_330x514xf32_into_438x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, 
%rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xf32_times_332x516xf32_into_540x516xf32_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xf32_times_332x516xf32_into_540x516xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xf32_times_321x234xf32_into_654x234xf32_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xf32_times_321x234xf32_into_654x234xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xf32_times_160x512xf32_into_457x512xf32_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 50 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, 
%lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xf32_times_160x512xf32_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x330xf32_times_330x512xf32_into_512x512xf32_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x330xf32_times_330x512xf32_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, 
!hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1000xf32_times_1000x1000xf32_into_1x1000xf32_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xf32_times_1000x1000xf32_into_1x1000xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_accumulate_1000x1000xf32_times_1000x1xf32_into_1000x1xf32_1000_1000_1_acc_29() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xf32_times_1000x1xf32_into_1000x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1000_1000_1_30() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : 
(!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x1000xf32_times_1000x1xf32_into_1000x1xf32_1000_1000_1_31() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type<f32> : i32 + %lhs_seed = arith.constant 70 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type<f32> : i32 + %rhs_seed = arith.constant 71 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x1000xf32_times_1000x1xf32_into_1000x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_small.mlir b/linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_small.mlir new file mode 100644 index 0000000..ff5e347 --- /dev/null +++ b/linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_small.mlir @@ -0,0 +1,99 @@ +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf32>, tensor<?x?xf32>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_accumulate_1x1xf32_times_1x1xf32_into_1x1xf32(%lhs: tensor<1x1xf32>, %rhs: tensor<1x1xf32>, %acc: tensor<1x1xf32>) -> tensor<1x1xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1xf32>, tensor<1x1xf32>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32> + return %result: tensor<1x1xf32> +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>) -> tensor<?x?xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf32> + %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf32> + %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf32>, tensor<?x?xf32>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_1x1xf32_times_1x1xf32_into_1x1xf32(%lhs: tensor<1x1xf32>, %rhs: tensor<1x1xf32>) -> tensor<1x1xf32> { + %init_acc = tensor.empty() : tensor<1x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x1xf32>) -> tensor<1x1xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1xf32>, tensor<1x1xf32>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32> + return %result: tensor<1x1xf32> +} + +func.func
@matmul_accumulate_2x2xf32_times_2x2xf32_into_2x2xf32(%lhs: tensor<2x2xf32>, %rhs: tensor<2x2xf32>, %acc: tensor<2x2xf32>) -> tensor<2x2xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<2x2xf32>, tensor<2x2xf32>) outs(%acc: tensor<2x2xf32>) -> tensor<2x2xf32> + return %result: tensor<2x2xf32> +} + +func.func @matmul_accumulate_4x4xf32_times_4x4xf32_into_4x4xf32(%lhs: tensor<4x4xf32>, %rhs: tensor<4x4xf32>, %acc: tensor<4x4xf32>) -> tensor<4x4xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<4x4xf32>, tensor<4x4xf32>) outs(%acc: tensor<4x4xf32>) -> tensor<4x4xf32> + return %result: tensor<4x4xf32> +} + +func.func @matmul_accumulate_8x8xf32_times_8x8xf32_into_8x8xf32(%lhs: tensor<8x8xf32>, %rhs: tensor<8x8xf32>, %acc: tensor<8x8xf32>) -> tensor<8x8xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<8x8xf32>, tensor<8x8xf32>) outs(%acc: tensor<8x8xf32>) -> tensor<8x8xf32> + return %result: tensor<8x8xf32> +} + +func.func @matmul_accumulate_9x9xf32_times_9x9xf32_into_9x9xf32(%lhs: tensor<9x9xf32>, %rhs: tensor<9x9xf32>, %acc: tensor<9x9xf32>) -> tensor<9x9xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<9x9xf32>, tensor<9x9xf32>) outs(%acc: tensor<9x9xf32>) -> tensor<9x9xf32> + return %result: tensor<9x9xf32> +} + +func.func @matmul_accumulate_6x13xf32_times_13x3xf32_into_6x3xf32(%lhs: tensor<6x13xf32>, %rhs: tensor<13x3xf32>, %acc: tensor<6x3xf32>) -> tensor<6x3xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<6x13xf32>, tensor<13x3xf32>) outs(%acc: tensor<6x3xf32>) -> tensor<6x3xf32> + return %result: tensor<6x3xf32> +} + +func.func @matmul_15x37xf32_times_37x7xf32_into_15x7xf32(%lhs: tensor<15x37xf32>, %rhs: tensor<37x7xf32>) -> tensor<15x7xf32> { + %init_acc = tensor.empty() : tensor<15x7xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<15x7xf32>) -> tensor<15x7xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<15x37xf32>, tensor<37x7xf32>) outs(%acc: tensor<15x7xf32>) -> tensor<15x7xf32> + return %result: tensor<15x7xf32> +} + +func.func @matmul_accumulate_81x19xf32_times_19x41xf32_into_81x41xf32(%lhs: tensor<81x19xf32>, %rhs: tensor<19x41xf32>, %acc: tensor<81x41xf32>) -> tensor<81x41xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<81x19xf32>, tensor<19x41xf32>) outs(%acc: tensor<81x41xf32>) -> tensor<81x41xf32> + return %result: tensor<81x41xf32> +} + +func.func @matmul_accumulate_1x10xf32_times_10x10xf32_into_1x10xf32(%lhs: tensor<1x10xf32>, %rhs: tensor<10x10xf32>, %acc: tensor<1x10xf32>) -> tensor<1x10xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x10xf32>, tensor<10x10xf32>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32> + return %result: tensor<1x10xf32> +} + +func.func @matmul_1x10xf32_times_10x10xf32_into_1x10xf32(%lhs: tensor<1x10xf32>, %rhs: tensor<10x10xf32>) -> tensor<1x10xf32> { + %init_acc = tensor.empty() : tensor<1x10xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x10xf32>) -> tensor<1x10xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x10xf32>, tensor<10x10xf32>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32> + return %result: tensor<1x10xf32> +} + +func.func @matmul_accumulate_10x1xf32_times_1x10xf32_into_10x10xf32(%lhs: tensor<10x1xf32>, %rhs: tensor<1x10xf32>, %acc: tensor<10x10xf32>) -> tensor<10x10xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x1xf32>, tensor<1x10xf32>) outs(%acc: tensor<10x10xf32>) -> tensor<10x10xf32> + return %result: tensor<10x10xf32> 
+} + +func.func @matmul_accumulate_10x10xf32_times_10x1xf32_into_10x1xf32(%lhs: tensor<10x10xf32>, %rhs: tensor<10x1xf32>, %acc: tensor<10x1xf32>) -> tensor<10x1xf32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x10xf32>, tensor<10x1xf32>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32> + return %result: tensor<10x1xf32> +} + +func.func @matmul_10x10xf32_times_10x1xf32_into_10x1xf32(%lhs: tensor<10x10xf32>, %rhs: tensor<10x1xf32>) -> tensor<10x1xf32> { + %init_acc = tensor.empty() : tensor<10x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<10x1xf32>) -> tensor<10x1xf32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x10xf32>, tensor<10x1xf32>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32> + return %result: tensor<10x1xf32> +} + diff --git a/linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_small_calls.mlir b/linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_small_calls.mlir new file mode 100644 index 0000000..e764f76 --- /dev/null +++ b/linalg_ops/matmul/generated/f32_into_f32/matmul_f32_into_f32_small_calls.mlir @@ -0,0 +1,906 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1xf32_times_1x1xf32_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x1xf32_times_1x1xf32_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_2x2xf32_times_2x2xf32_into_2x2xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_4x4xf32_times_4x4xf32_into_4x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_8x8xf32_times_8x8xf32_into_8x8xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_9x9xf32_times_9x9xf32_into_9x9xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_6x13xf32_times_13x3xf32_into_6x3xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_15x37xf32_times_37x7xf32_into_15x7xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_81x19xf32_times_19x41xf32_into_81x41xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x10xf32_times_10x10xf32_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: 
!hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x10xf32_times_10x10xf32_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x1xf32_times_1x10xf32_into_10x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x10xf32_times_10x1xf32_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_10x10xf32_times_10x1xf32_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1_1_1_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1xf32_times_1x1xf32_into_1x1xf32_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + 
%rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xf32_times_1x1xf32_into_1x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1_1_1_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xf32_times_1x1xf32_into_1x1xf32_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + 
%rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x1xf32_times_1x1xf32_into_1x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xf32_times_2x2xf32_into_2x2xf32_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xf32_times_2x2xf32_into_2x2xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, 
%n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xf32_times_4x4xf32_into_4x4xf32_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_4x4xf32_times_4x4xf32_into_4x4xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_8_8_8_acc_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xf32_times_8x8xf32_into_8x8xf32_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xf32_times_8x8xf32_into_8x8xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_9x9xf32_times_9x9xf32_into_9x9xf32_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xf32_times_9x9xf32_into_9x9xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = 
arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xf32_times_13x3xf32_into_6x3xf32_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = 
arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_6x13xf32_times_13x3xf32_into_6x3xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xf32_times_37x7xf32_into_15x7xf32_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call 
@module.matmul_15x37xf32_times_37x7xf32_into_15x7xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 48 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xf32_times_19x41xf32_into_81x41xf32_81_19_41_acc_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : 
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xf32_times_19x41xf32_into_81x41xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1_10_10_acc_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_accumulate_1x10xf32_times_10x10xf32_into_1x10xf32_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x10xf32_times_10x10xf32_into_1x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1_10_10_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xf32_times_10x10xf32_into_1x10xf32_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xf32_times_10x10xf32_into_1x10xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, 
%transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x1xf32_times_1x10xf32_into_10x10xf32_10_1_10_acc_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x1xf32_times_1x10xf32_into_10x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_10_10_1_acc_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 70 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, 
i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 70 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x10xf32_times_10x1xf32_into_10x1xf32_10_10_1_acc_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 71 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 72 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 73 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 73 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x10xf32_times_10x1xf32_into_10x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_10_10_1_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 74 : 
i32
+ %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+ %rhs_dim0 = arith.constant 10 : i64
+ %rhs_dim1 = arith.constant 1 : i64
+ %rhs_element_type = hal.element_type<f32> : i32
+ %rhs_seed = arith.constant 75 : i32
+ %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+ %acc = util.null : !hal.buffer_view
+ %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+ %m = arith.constant 10 : i64
+ %k = arith.constant 10 : i64
+ %n = arith.constant 1 : i64
+ %transpose_rhs = arith.constant 0 : i32
+ call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+ return
+}
+
+func.func @matmul_10x10xf32_times_10x1xf32_into_10x1xf32_10_10_1_27() attributes {
+ iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"}
+} {
+ %device_index = arith.constant 0 : index
+ %device = hal.devices.get %device_index : !hal.device
+ %lhs_dim0 = arith.constant 10 : i64
+ %lhs_dim1 = arith.constant 10 : i64
+ %lhs_element_type = hal.element_type<f32> : i32
+ %lhs_seed = arith.constant 76 : i32
+ %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+ %rhs_dim0 = arith.constant 10 : i64
+ %rhs_dim1 = arith.constant 1 : i64
+ %rhs_element_type = hal.element_type<f32> : i32
+ %rhs_seed = arith.constant 77 : i32
+ %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+ %acc = util.null : !hal.buffer_view
+ %result = call @module.matmul_10x10xf32_times_10x1xf32_into_10x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+ %m = arith.constant 10 : i64
+ %k = arith.constant 10 : i64
+ %n = arith.constant 1 : i64
+ %transpose_rhs = arith.constant 0 : i32
+ call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+ return
+}
+
+
+}
diff --git a/linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_large.mlir b/linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_large.mlir
new file mode 100644
index 0000000..63c71a7
--- /dev/null
+++ b/linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_large.mlir
@@ -0,0 +1,136 @@
+func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> {
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xf32>, tensor<?x?xf32>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+ return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_accumulate_512x128xf32_times_512x128xf32_into_512x512xf32(%lhs: tensor<512x128xf32>, %rhs: tensor<512x128xf32>, %acc: tensor<512x512xf32>) -> tensor<512x512xf32> {
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xf32>, tensor<512x128xf32>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32>
+ return %result: tensor<512x512xf32>
+}
+
+func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>) -> tensor<?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf32>
+ %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf32>
+ %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32>
+ %c0_acc_type = arith.constant 0.0: f32
+ %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32>
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xf32>, tensor<?x?xf32>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+ return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_512x128xf32_times_512x128xf32_into_512x512xf32(%lhs: tensor<512x128xf32>, %rhs: tensor<512x128xf32>) -> tensor<512x512xf32> {
+ %init_acc = tensor.empty() : tensor<512x512xf32>
+ %c0_acc_type = arith.constant 0.0: f32
+ %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32>
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xf32>, tensor<512x128xf32>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32>
+ return %result: tensor<512x512xf32>
+}
+
+func.func @matmul_1000x4xf32_times_512x4xf32_into_1000x512xf32(%lhs: tensor<1000x4xf32>, %rhs: tensor<512x4xf32>) -> tensor<1000x512xf32> {
+ %init_acc = tensor.empty() : tensor<1000x512xf32>
+ %c0_acc_type = arith.constant 0.0: f32
+ %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x512xf32>) -> tensor<1000x512xf32>
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x4xf32>, tensor<512x4xf32>) outs(%acc: tensor<1000x512xf32>) -> tensor<1000x512xf32>
+ return %result: tensor<1000x512xf32>
+}
+
+func.func @matmul_4x1000xf32_times_512x1000xf32_into_4x512xf32(%lhs: tensor<4x1000xf32>, %rhs: tensor<512x1000xf32>) -> tensor<4x512xf32> {
+ %init_acc = tensor.empty() : tensor<4x512xf32>
+ %c0_acc_type = arith.constant 0.0: f32
+ %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<4x512xf32>) -> tensor<4x512xf32>
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<4x1000xf32>, tensor<512x1000xf32>) outs(%acc: tensor<4x512xf32>) -> tensor<4x512xf32>
+ return %result: tensor<4x512xf32>
+}
+
+func.func @matmul_512x1000xf32_times_4x1000xf32_into_512x4xf32(%lhs: tensor<512x1000xf32>, %rhs: tensor<4x1000xf32>) -> tensor<512x4xf32> {
+ %init_acc = tensor.empty() : tensor<512x4xf32>
+ %c0_acc_type = arith.constant 0.0: f32
+ %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x4xf32>) -> tensor<512x4xf32>
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x1000xf32>, tensor<4x1000xf32>) outs(%acc: tensor<512x4xf32>) -> tensor<512x4xf32>
+ return %result: tensor<512x4xf32>
+}
+
+func.func @matmul_512x128xf32_times_500x128xf32_into_512x500xf32(%lhs: tensor<512x128xf32>, %rhs: tensor<500x128xf32>) -> tensor<512x500xf32> {
+ %init_acc = tensor.empty() : tensor<512x500xf32>
+ %c0_acc_type = arith.constant 0.0: f32
+ %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x500xf32>) -> tensor<512x500xf32>
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xf32>, tensor<500x128xf32>) outs(%acc: tensor<512x500xf32>) -> tensor<512x500xf32>
+ return %result: tensor<512x500xf32>
+}
+
+func.func @matmul_457x330xf32_times_512x330xf32_into_457x512xf32(%lhs: tensor<457x330xf32>, %rhs: tensor<512x330xf32>) -> tensor<457x512xf32> {
+ %init_acc = tensor.empty() : tensor<457x512xf32>
+ %c0_acc_type = arith.constant 0.0: f32
+ %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32>
+
%result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x330xf32>, tensor<512x330xf32>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32> + return %result: tensor<457x512xf32> +} + +func.func @matmul_457x330xf32_times_514x330xf32_into_457x514xf32(%lhs: tensor<457x330xf32>, %rhs: tensor<514x330xf32>) -> tensor<457x514xf32> { + %init_acc = tensor.empty() : tensor<457x514xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x514xf32>) -> tensor<457x514xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x330xf32>, tensor<514x330xf32>) outs(%acc: tensor<457x514xf32>) -> tensor<457x514xf32> + return %result: tensor<457x514xf32> +} + +func.func @matmul_438x330xf32_times_514x330xf32_into_438x514xf32(%lhs: tensor<438x330xf32>, %rhs: tensor<514x330xf32>) -> tensor<438x514xf32> { + %init_acc = tensor.empty() : tensor<438x514xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<438x514xf32>) -> tensor<438x514xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<438x330xf32>, tensor<514x330xf32>) outs(%acc: tensor<438x514xf32>) -> tensor<438x514xf32> + return %result: tensor<438x514xf32> +} + +func.func @matmul_540x332xf32_times_516x332xf32_into_540x516xf32(%lhs: tensor<540x332xf32>, %rhs: tensor<516x332xf32>) -> tensor<540x516xf32> { + %init_acc = tensor.empty() : tensor<540x516xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<540x516xf32>) -> tensor<540x516xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<540x332xf32>, tensor<516x332xf32>) outs(%acc: tensor<540x516xf32>) -> tensor<540x516xf32> + return %result: tensor<540x516xf32> +} + +func.func @matmul_654x321xf32_times_234x321xf32_into_654x234xf32(%lhs: tensor<654x321xf32>, %rhs: tensor<234x321xf32>) -> tensor<654x234xf32> { + %init_acc = tensor.empty() : tensor<654x234xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<654x234xf32>) -> tensor<654x234xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<654x321xf32>, tensor<234x321xf32>) outs(%acc: tensor<654x234xf32>) -> tensor<654x234xf32> + return %result: tensor<654x234xf32> +} + +func.func @matmul_457x160xf32_times_512x160xf32_into_457x512xf32(%lhs: tensor<457x160xf32>, %rhs: tensor<512x160xf32>) -> tensor<457x512xf32> { + %init_acc = tensor.empty() : tensor<457x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x160xf32>, tensor<512x160xf32>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32> + return %result: tensor<457x512xf32> +} + +func.func @matmul_512x330xf32_times_512x330xf32_into_512x512xf32(%lhs: tensor<512x330xf32>, %rhs: tensor<512x330xf32>) -> tensor<512x512xf32> { + %init_acc = tensor.empty() : tensor<512x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x330xf32>, tensor<512x330xf32>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_accumulate_1x1000xf32_times_1000x1000xf32_into_1x1000xf32(%lhs: tensor<1x1000xf32>, %rhs: tensor<1000x1000xf32>, %acc: 
tensor<1x1000xf32>) -> tensor<1x1000xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1000xf32>, tensor<1000x1000xf32>) outs(%acc: tensor<1x1000xf32>) -> tensor<1x1000xf32> + return %result: tensor<1x1000xf32> +} + +func.func @matmul_accumulate_1000x1000xf32_times_1x1000xf32_into_1000x1xf32(%lhs: tensor<1000x1000xf32>, %rhs: tensor<1x1000xf32>, %acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x1000xf32>, tensor<1x1000xf32>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> + return %result: tensor<1000x1xf32> +} + +func.func @matmul_1000x1000xf32_times_1x1000xf32_into_1000x1xf32(%lhs: tensor<1000x1000xf32>, %rhs: tensor<1x1000xf32>) -> tensor<1000x1xf32> { + %init_acc = tensor.empty() : tensor<1000x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x1xf32>) -> tensor<1000x1xf32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x1000xf32>, tensor<1x1000xf32>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> + return %result: tensor<1000x1xf32> +} + diff --git a/linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_large_calls.mlir b/linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_large_calls.mlir new file mode 100644 index 0000000..1ecf2c6 --- /dev/null +++ b/linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_large_calls.mlir @@ -0,0 +1,882 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_512x128xf32_times_512x128xf32_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf32_times_512x128xf32_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x4xf32_times_512x4xf32_into_1000x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_4x1000xf32_times_512x1000xf32_into_4x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x1000xf32_times_4x1000xf32_into_512x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf32_times_500x128xf32_into_512x500xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf32_times_512x330xf32_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf32_times_514x330xf32_into_457x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private 
@module.matmul_438x330xf32_times_514x330xf32_into_438x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_540x332xf32_times_516x332xf32_into_540x516xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_654x321xf32_times_234x321xf32_into_654x234xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x160xf32_times_512x160xf32_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x330xf32_times_512x330xf32_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1000xf32_times_1000x1000xf32_into_1x1000xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1000x1000xf32_times_1x1000xf32_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x1000xf32_times_1x1000xf32_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_512_128_512_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xf32_times_512x128xf32_into_512x512xf32_512_128_512_acc_1() attributes { 
+ iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xf32_times_512x128xf32_into_512x512xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) 
-> () + return +} + +func.func @matmul_512x128xf32_times_512x128xf32_into_512x512xf32_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf32_times_512x128xf32_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xf32_times_512x4xf32_into_1000x512xf32_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, 
i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xf32_times_512x4xf32_into_1000x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xf32_times_512x1000xf32_into_4x512xf32_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xf32_times_512x1000xf32_into_4x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 
1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 21 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x1000xf32_times_4x1000xf32_into_512x4xf32_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x1000xf32_times_4x1000xf32_into_512x4xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + 
%lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xf32_times_500x128xf32_into_512x500xf32_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf32_times_500x128xf32_into_512x500xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, 
%rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf32_times_512x330xf32_into_457x512xf32_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf32_times_512x330xf32_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf32_times_514x330xf32_into_457x514xf32_457_330_514_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf32_times_514x330xf32_into_457x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xf32_times_514x330xf32_into_438x514xf32_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, 
%lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xf32_times_514x330xf32_into_438x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xf32_times_516x332xf32_into_540x516xf32_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xf32_times_516x332xf32_into_540x516xf32(%lhs, %rhs) : (!hal.buffer_view, 
!hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xf32_times_234x321xf32_into_654x234xf32_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xf32_times_234x321xf32_into_654x234xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} 
{ + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xf32_times_512x160xf32_into_457x512xf32_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 50 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xf32_times_512x160xf32_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 
53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x330xf32_times_512x330xf32_into_512x512xf32_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x330xf32_times_512x330xf32_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = 
arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1000xf32_times_1000x1000xf32_into_1x1000xf32_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xf32_times_1000x1000xf32_into_1x1000xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1000x1000xf32_times_1x1000xf32_into_1000x1xf32_1000_1000_1_acc_29() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xf32_times_1x1000xf32_into_1000x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) 
-> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1000_1000_1_30() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x1000xf32_times_1x1000xf32_into_1000x1xf32_1000_1000_1_31() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 70 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 71 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x1000xf32_times_1x1000xf32_into_1000x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_small.mlir b/linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_small.mlir 
new file mode 100644
index 0000000..4a31253
--- /dev/null
+++ b/linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_small.mlir
@@ -0,0 +1,99 @@
+func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xf32>, tensor<?x?xf32>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_accumulate_1x1xf32_times_1x1xf32_into_1x1xf32(%lhs: tensor<1x1xf32>, %rhs: tensor<1x1xf32>, %acc: tensor<1x1xf32>) -> tensor<1x1xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1xf32>, tensor<1x1xf32>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32>
+  return %result: tensor<1x1xf32>
+}
+
+func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf32>
+  %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf32>
+  %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xf32>, tensor<?x?xf32>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_1x1xf32_times_1x1xf32_into_1x1xf32(%lhs: tensor<1x1xf32>, %rhs: tensor<1x1xf32>) -> tensor<1x1xf32> {
+  %init_acc = tensor.empty() : tensor<1x1xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x1xf32>) -> tensor<1x1xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1xf32>, tensor<1x1xf32>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32>
+  return %result: tensor<1x1xf32>
+}
+
+func.func @matmul_accumulate_2x2xf32_times_2x2xf32_into_2x2xf32(%lhs: tensor<2x2xf32>, %rhs: tensor<2x2xf32>, %acc: tensor<2x2xf32>) -> tensor<2x2xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<2x2xf32>, tensor<2x2xf32>) outs(%acc: tensor<2x2xf32>) -> tensor<2x2xf32>
+  return %result: tensor<2x2xf32>
+}
+
+func.func @matmul_accumulate_4x4xf32_times_4x4xf32_into_4x4xf32(%lhs: tensor<4x4xf32>, %rhs: tensor<4x4xf32>, %acc: tensor<4x4xf32>) -> tensor<4x4xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<4x4xf32>, tensor<4x4xf32>) outs(%acc: tensor<4x4xf32>) -> tensor<4x4xf32>
+  return %result: tensor<4x4xf32>
+}
+
+func.func @matmul_accumulate_8x8xf32_times_8x8xf32_into_8x8xf32(%lhs: tensor<8x8xf32>, %rhs: tensor<8x8xf32>, %acc: tensor<8x8xf32>) -> tensor<8x8xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<8x8xf32>, tensor<8x8xf32>) outs(%acc: tensor<8x8xf32>) -> tensor<8x8xf32>
+  return %result: tensor<8x8xf32>
+}
+
+func.func @matmul_accumulate_9x9xf32_times_9x9xf32_into_9x9xf32(%lhs: tensor<9x9xf32>, %rhs: tensor<9x9xf32>, %acc: tensor<9x9xf32>) -> tensor<9x9xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<9x9xf32>, tensor<9x9xf32>) outs(%acc: tensor<9x9xf32>) -> tensor<9x9xf32>
+  return %result: tensor<9x9xf32>
+}
+
+func.func @matmul_accumulate_6x13xf32_times_3x13xf32_into_6x3xf32(%lhs: tensor<6x13xf32>, %rhs: tensor<3x13xf32>, %acc: tensor<6x3xf32>) -> tensor<6x3xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<6x13xf32>, tensor<3x13xf32>) outs(%acc: tensor<6x3xf32>) -> tensor<6x3xf32>
+  return %result: tensor<6x3xf32>
+}
+
+func.func @matmul_15x37xf32_times_7x37xf32_into_15x7xf32(%lhs: tensor<15x37xf32>, %rhs: tensor<7x37xf32>) -> tensor<15x7xf32> {
+  %init_acc = tensor.empty() : tensor<15x7xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<15x7xf32>) -> tensor<15x7xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<15x37xf32>, tensor<7x37xf32>) outs(%acc: tensor<15x7xf32>) -> tensor<15x7xf32>
+  return %result: tensor<15x7xf32>
+}
+
+func.func @matmul_accumulate_81x19xf32_times_41x19xf32_into_81x41xf32(%lhs: tensor<81x19xf32>, %rhs: tensor<41x19xf32>, %acc: tensor<81x41xf32>) -> tensor<81x41xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<81x19xf32>, tensor<41x19xf32>) outs(%acc: tensor<81x41xf32>) -> tensor<81x41xf32>
+  return %result: tensor<81x41xf32>
+}
+
+func.func @matmul_accumulate_1x10xf32_times_10x10xf32_into_1x10xf32(%lhs: tensor<1x10xf32>, %rhs: tensor<10x10xf32>, %acc: tensor<1x10xf32>) -> tensor<1x10xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x10xf32>, tensor<10x10xf32>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32>
+  return %result: tensor<1x10xf32>
+}
+
+func.func @matmul_1x10xf32_times_10x10xf32_into_1x10xf32(%lhs: tensor<1x10xf32>, %rhs: tensor<10x10xf32>) -> tensor<1x10xf32> {
+  %init_acc = tensor.empty() : tensor<1x10xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x10xf32>) -> tensor<1x10xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x10xf32>, tensor<10x10xf32>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32>
+  return %result: tensor<1x10xf32>
+}
+
+func.func @matmul_accumulate_10x1xf32_times_10x1xf32_into_10x10xf32(%lhs: tensor<10x1xf32>, %rhs: tensor<10x1xf32>, %acc: tensor<10x10xf32>) -> tensor<10x10xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x1xf32>, tensor<10x1xf32>) outs(%acc: tensor<10x10xf32>) -> tensor<10x10xf32>
+  return %result: tensor<10x10xf32>
+}
+
+func.func @matmul_accumulate_10x10xf32_times_1x10xf32_into_10x1xf32(%lhs: tensor<10x10xf32>, %rhs: tensor<1x10xf32>, %acc: tensor<10x1xf32>) -> tensor<10x1xf32> {
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x10xf32>, tensor<1x10xf32>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32>
+  return %result: tensor<10x1xf32>
+}
+
+func.func @matmul_10x10xf32_times_1x10xf32_into_10x1xf32(%lhs: tensor<10x10xf32>, %rhs: tensor<1x10xf32>) -> tensor<10x1xf32> {
+  %init_acc = tensor.empty() : tensor<10x1xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<10x1xf32>) -> tensor<10x1xf32>
+  %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x10xf32>, tensor<1x10xf32>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32>
+  return %result: tensor<10x1xf32>
+}
+
diff --git a/linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_small_calls.mlir b/linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_small_calls.mlir
new file mode 100644
index 0000000..d10f372
--- /dev/null
+++ b/linalg_ops/matmul/generated/f32_into_f32/matmul_transpose_b_f32_into_f32_small_calls.mlir
@@ -0,0 +1,906 @@
+builtin.module @calls attributes {
+
+} {
+
+func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view,
%actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1xf32_times_1x1xf32_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x1xf32_times_1x1xf32_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_2x2xf32_times_2x2xf32_into_2x2xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_4x4xf32_times_4x4xf32_into_4x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_8x8xf32_times_8x8xf32_into_8x8xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_9x9xf32_times_9x9xf32_into_9x9xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_6x13xf32_times_3x13xf32_into_6x3xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_15x37xf32_times_7x37xf32_into_15x7xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_81x19xf32_times_41x19xf32_into_81x41xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x10xf32_times_10x10xf32_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x10xf32_times_10x10xf32_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x1xf32_times_10x1xf32_into_10x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x10xf32_times_1x10xf32_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_10x10xf32_times_1x10xf32_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1_1_1_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, 
i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1xf32_times_1x1xf32_into_1x1xf32_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xf32_times_1x1xf32_into_1x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1_1_1_2() attributes { + iree.reflection = {description = 
"Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xf32_times_1x1xf32_into_1x1xf32_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x1xf32_times_1x1xf32_into_1x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call 
@matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xf32_times_2x2xf32_into_2x2xf32_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xf32_times_2x2xf32_into_2x2xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + 
+func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xf32_times_4x4xf32_into_4x4xf32_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed 
= arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_4x4xf32_times_4x4xf32_into_4x4xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_8_8_8_acc_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xf32_times_8x8xf32_into_8x8xf32_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 
+ %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xf32_times_8x8xf32_into_8x8xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, 
%acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_9x9xf32_times_9x9xf32_into_9x9xf32_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xf32_times_9x9xf32_into_9x9xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xf32_times_3x13xf32_into_6x3xf32_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_6x13xf32_times_3x13xf32_into_6x3xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xf32_times_7x37xf32_into_15x7xf32_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_15x37xf32_times_7x37xf32_into_15x7xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = 
arith.constant 48 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xf32_times_41x19xf32_into_81x41xf32_81_19_41_acc_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xf32_times_41x19xf32_into_81x41xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1_10_10_acc_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get 
%device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x10xf32_times_10x10xf32_into_1x10xf32_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%result = call @module.matmul_accumulate_1x10xf32_times_10x10xf32_into_1x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_1_10_10_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xf32_times_10x10xf32_into_1x10xf32_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xf32_times_10x10xf32_into_1x10xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x1xf32_times_10x1xf32_into_10x10xf32_10_1_10_acc_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 
+ %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x1xf32_times_10x1xf32_into_10x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_10_10_1_acc_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 70 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 70 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x10xf32_times_1x10xf32_into_10x1xf32_10_10_1_acc_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 71 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 72 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 73 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 73 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x10xf32_times_1x10xf32_into_10x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32_10_10_1_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 74 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 75 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf32_times_DYNxDYNxf32_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_10x10xf32_times_1x10xf32_into_10x1xf32_10_10_1_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 76 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, 
%lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type<f32> : i32 + %rhs_seed = arith.constant 77 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_10x10xf32_times_1x10xf32_into_10x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_large.mlir b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_large.mlir new file mode 100644 index 0000000..5e8b633 --- /dev/null +++ b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_large.mlir @@ -0,0 +1,172 @@ +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> { + %lhs_casted = arith.truncf %lhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<?x?xf8E4M3FNUZ>, tensor<?x?xf8E4M3FNUZ>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_accumulate_512x128xf8E4M3FNUZ_times_128x512xf8E4M3FNUZ_into_512x512xf32(%lhs: tensor<512x128xf32>, %rhs: tensor<128x512xf32>, %acc: tensor<512x512xf32>) -> tensor<512x512xf32> { + %lhs_casted = arith.truncf %lhs: tensor<512x128xf32> to tensor<512x128xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<128x512xf32> to tensor<128x512xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<512x128xf8E4M3FNUZ>, tensor<128x512xf8E4M3FNUZ>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>) -> tensor<?x?xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf32> + %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf32> + %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32> + %lhs_casted = arith.truncf %lhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<?x?xf8E4M3FNUZ>, tensor<?x?xf8E4M3FNUZ>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_512x128xf8E4M3FNUZ_times_128x512xf8E4M3FNUZ_into_512x512xf32(%lhs: tensor<512x128xf32>, %rhs: tensor<128x512xf32>) -> tensor<512x512xf32> { + %init_acc = tensor.empty() : tensor<512x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32> + %lhs_casted = arith.truncf %lhs: tensor<512x128xf32> to tensor<512x128xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<128x512xf32> to tensor<128x512xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted:
tensor<512x128xf8E4M3FNUZ>, tensor<128x512xf8E4M3FNUZ>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_1000x4xf8E4M3FNUZ_times_4x512xf8E4M3FNUZ_into_1000x512xf32(%lhs: tensor<1000x4xf32>, %rhs: tensor<4x512xf32>) -> tensor<1000x512xf32> { + %init_acc = tensor.empty() : tensor<1000x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x512xf32>) -> tensor<1000x512xf32> + %lhs_casted = arith.truncf %lhs: tensor<1000x4xf32> to tensor<1000x4xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<4x512xf32> to tensor<4x512xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<1000x4xf8E4M3FNUZ>, tensor<4x512xf8E4M3FNUZ>) outs(%acc: tensor<1000x512xf32>) -> tensor<1000x512xf32> + return %result: tensor<1000x512xf32> +} + +func.func @matmul_4x1000xf8E4M3FNUZ_times_1000x512xf8E4M3FNUZ_into_4x512xf32(%lhs: tensor<4x1000xf32>, %rhs: tensor<1000x512xf32>) -> tensor<4x512xf32> { + %init_acc = tensor.empty() : tensor<4x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<4x512xf32>) -> tensor<4x512xf32> + %lhs_casted = arith.truncf %lhs: tensor<4x1000xf32> to tensor<4x1000xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<1000x512xf32> to tensor<1000x512xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<4x1000xf8E4M3FNUZ>, tensor<1000x512xf8E4M3FNUZ>) outs(%acc: tensor<4x512xf32>) -> tensor<4x512xf32> + return %result: tensor<4x512xf32> +} + +func.func @matmul_512x1000xf8E4M3FNUZ_times_1000x4xf8E4M3FNUZ_into_512x4xf32(%lhs: tensor<512x1000xf32>, %rhs: tensor<1000x4xf32>) -> tensor<512x4xf32> { + %init_acc = tensor.empty() : tensor<512x4xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x4xf32>) -> tensor<512x4xf32> + %lhs_casted = arith.truncf %lhs: tensor<512x1000xf32> to tensor<512x1000xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<1000x4xf32> to tensor<1000x4xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<512x1000xf8E4M3FNUZ>, tensor<1000x4xf8E4M3FNUZ>) outs(%acc: tensor<512x4xf32>) -> tensor<512x4xf32> + return %result: tensor<512x4xf32> +} + +func.func @matmul_512x128xf8E4M3FNUZ_times_128x500xf8E4M3FNUZ_into_512x500xf32(%lhs: tensor<512x128xf32>, %rhs: tensor<128x500xf32>) -> tensor<512x500xf32> { + %init_acc = tensor.empty() : tensor<512x500xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x500xf32>) -> tensor<512x500xf32> + %lhs_casted = arith.truncf %lhs: tensor<512x128xf32> to tensor<512x128xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<128x500xf32> to tensor<128x500xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<512x128xf8E4M3FNUZ>, tensor<128x500xf8E4M3FNUZ>) outs(%acc: tensor<512x500xf32>) -> tensor<512x500xf32> + return %result: tensor<512x500xf32> +} + +func.func @matmul_457x330xf8E4M3FNUZ_times_330x512xf8E4M3FNUZ_into_457x512xf32(%lhs: tensor<457x330xf32>, %rhs: tensor<330x512xf32>) -> tensor<457x512xf32> { + %init_acc = tensor.empty() : tensor<457x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32> + %lhs_casted = arith.truncf %lhs: tensor<457x330xf32> to tensor<457x330xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: 
tensor<330x512xf32> to tensor<330x512xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<457x330xf8E4M3FNUZ>, tensor<330x512xf8E4M3FNUZ>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32> + return %result: tensor<457x512xf32> +} + +func.func @matmul_457x330xf8E4M3FNUZ_times_330x514xf8E4M3FNUZ_into_457x514xf32(%lhs: tensor<457x330xf32>, %rhs: tensor<330x514xf32>) -> tensor<457x514xf32> { + %init_acc = tensor.empty() : tensor<457x514xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x514xf32>) -> tensor<457x514xf32> + %lhs_casted = arith.truncf %lhs: tensor<457x330xf32> to tensor<457x330xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<330x514xf32> to tensor<330x514xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<457x330xf8E4M3FNUZ>, tensor<330x514xf8E4M3FNUZ>) outs(%acc: tensor<457x514xf32>) -> tensor<457x514xf32> + return %result: tensor<457x514xf32> +} + +func.func @matmul_438x330xf8E4M3FNUZ_times_330x514xf8E4M3FNUZ_into_438x514xf32(%lhs: tensor<438x330xf32>, %rhs: tensor<330x514xf32>) -> tensor<438x514xf32> { + %init_acc = tensor.empty() : tensor<438x514xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<438x514xf32>) -> tensor<438x514xf32> + %lhs_casted = arith.truncf %lhs: tensor<438x330xf32> to tensor<438x330xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<330x514xf32> to tensor<330x514xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<438x330xf8E4M3FNUZ>, tensor<330x514xf8E4M3FNUZ>) outs(%acc: tensor<438x514xf32>) -> tensor<438x514xf32> + return %result: tensor<438x514xf32> +} + +func.func @matmul_540x332xf8E4M3FNUZ_times_332x516xf8E4M3FNUZ_into_540x516xf32(%lhs: tensor<540x332xf32>, %rhs: tensor<332x516xf32>) -> tensor<540x516xf32> { + %init_acc = tensor.empty() : tensor<540x516xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<540x516xf32>) -> tensor<540x516xf32> + %lhs_casted = arith.truncf %lhs: tensor<540x332xf32> to tensor<540x332xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<332x516xf32> to tensor<332x516xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<540x332xf8E4M3FNUZ>, tensor<332x516xf8E4M3FNUZ>) outs(%acc: tensor<540x516xf32>) -> tensor<540x516xf32> + return %result: tensor<540x516xf32> +} + +func.func @matmul_654x321xf8E4M3FNUZ_times_321x234xf8E4M3FNUZ_into_654x234xf32(%lhs: tensor<654x321xf32>, %rhs: tensor<321x234xf32>) -> tensor<654x234xf32> { + %init_acc = tensor.empty() : tensor<654x234xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<654x234xf32>) -> tensor<654x234xf32> + %lhs_casted = arith.truncf %lhs: tensor<654x321xf32> to tensor<654x321xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<321x234xf32> to tensor<321x234xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<654x321xf8E4M3FNUZ>, tensor<321x234xf8E4M3FNUZ>) outs(%acc: tensor<654x234xf32>) -> tensor<654x234xf32> + return %result: tensor<654x234xf32> +} + +func.func @matmul_457x160xf8E4M3FNUZ_times_160x512xf8E4M3FNUZ_into_457x512xf32(%lhs: tensor<457x160xf32>, %rhs: tensor<160x512xf32>) -> tensor<457x512xf32> { + %init_acc = tensor.empty() : tensor<457x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> 
tensor<457x512xf32> + %lhs_casted = arith.truncf %lhs: tensor<457x160xf32> to tensor<457x160xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<160x512xf32> to tensor<160x512xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<457x160xf8E4M3FNUZ>, tensor<160x512xf8E4M3FNUZ>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32> + return %result: tensor<457x512xf32> +} + +func.func @matmul_512x330xf8E4M3FNUZ_times_330x512xf8E4M3FNUZ_into_512x512xf32(%lhs: tensor<512x330xf32>, %rhs: tensor<330x512xf32>) -> tensor<512x512xf32> { + %init_acc = tensor.empty() : tensor<512x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32> + %lhs_casted = arith.truncf %lhs: tensor<512x330xf32> to tensor<512x330xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<330x512xf32> to tensor<330x512xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<512x330xf8E4M3FNUZ>, tensor<330x512xf8E4M3FNUZ>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_accumulate_1x1000xf8E4M3FNUZ_times_1000x1000xf8E4M3FNUZ_into_1x1000xf32(%lhs: tensor<1x1000xf32>, %rhs: tensor<1000x1000xf32>, %acc: tensor<1x1000xf32>) -> tensor<1x1000xf32> { + %lhs_casted = arith.truncf %lhs: tensor<1x1000xf32> to tensor<1x1000xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<1000x1000xf32> to tensor<1000x1000xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<1x1000xf8E4M3FNUZ>, tensor<1000x1000xf8E4M3FNUZ>) outs(%acc: tensor<1x1000xf32>) -> tensor<1x1000xf32> + return %result: tensor<1x1000xf32> +} + +func.func @matmul_accumulate_1000x1000xf8E4M3FNUZ_times_1000x1xf8E4M3FNUZ_into_1000x1xf32(%lhs: tensor<1000x1000xf32>, %rhs: tensor<1000x1xf32>, %acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> { + %lhs_casted = arith.truncf %lhs: tensor<1000x1000xf32> to tensor<1000x1000xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<1000x1xf32> to tensor<1000x1xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<1000x1000xf8E4M3FNUZ>, tensor<1000x1xf8E4M3FNUZ>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> + return %result: tensor<1000x1xf32> +} + +func.func @matmul_1000x1000xf8E4M3FNUZ_times_1000x1xf8E4M3FNUZ_into_1000x1xf32(%lhs: tensor<1000x1000xf32>, %rhs: tensor<1000x1xf32>) -> tensor<1000x1xf32> { + %init_acc = tensor.empty() : tensor<1000x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x1xf32>) -> tensor<1000x1xf32> + %lhs_casted = arith.truncf %lhs: tensor<1000x1000xf32> to tensor<1000x1000xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<1000x1xf32> to tensor<1000x1xf8E4M3FNUZ> + %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<1000x1000xf8E4M3FNUZ>, tensor<1000x1xf8E4M3FNUZ>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> + return %result: tensor<1000x1xf32> +} + diff --git a/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_large_calls.mlir b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_large_calls.mlir new file mode 100644 index 0000000..5317e96 --- /dev/null +++ b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_large_calls.mlir @@ -0,0 +1,882 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, 
%element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_512x128xf8E4M3FNUZ_times_128x512xf8E4M3FNUZ_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf8E4M3FNUZ_times_128x512xf8E4M3FNUZ_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x4xf8E4M3FNUZ_times_4x512xf8E4M3FNUZ_into_1000x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_4x1000xf8E4M3FNUZ_times_1000x512xf8E4M3FNUZ_into_4x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x1000xf8E4M3FNUZ_times_1000x4xf8E4M3FNUZ_into_512x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf8E4M3FNUZ_times_128x500xf8E4M3FNUZ_into_512x500xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf8E4M3FNUZ_times_330x512xf8E4M3FNUZ_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf8E4M3FNUZ_times_330x514xf8E4M3FNUZ_into_457x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_438x330xf8E4M3FNUZ_times_330x514xf8E4M3FNUZ_into_438x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_540x332xf8E4M3FNUZ_times_332x516xf8E4M3FNUZ_into_540x516xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_654x321xf8E4M3FNUZ_times_321x234xf8E4M3FNUZ_into_654x234xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x160xf8E4M3FNUZ_times_160x512xf8E4M3FNUZ_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x330xf8E4M3FNUZ_times_330x512xf8E4M3FNUZ_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1000xf8E4M3FNUZ_times_1000x1000xf8E4M3FNUZ_into_1x1000xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1000x1000xf8E4M3FNUZ_times_1000x1xf8E4M3FNUZ_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x1000xf8E4M3FNUZ_times_1000x1xf8E4M3FNUZ_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_512_128_512_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { 
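+ // Descriptive comment: this test case, like those that follow, builds pseudo-random LHS/RHS (and accumulator) buffers from fixed seeds via matmul_test.generate_random_matrix, invokes the compiled matmul under test, and verifies the result with matmul_test.check_matmul_results.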
+ %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xf8E4M3FNUZ_times_128x512xf8E4M3FNUZ_into_512x512xf32_512_128_512_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, 
%acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xf8E4M3FNUZ_times_128x512xf8E4M3FNUZ_into_512x512xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xf8E4M3FNUZ_times_128x512xf8E4M3FNUZ_into_512x512xf32_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf8E4M3FNUZ_times_128x512xf8E4M3FNUZ_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, 
%n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xf8E4M3FNUZ_times_4x512xf8E4M3FNUZ_into_1000x512xf32_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xf8E4M3FNUZ_times_4x512xf8E4M3FNUZ_into_1000x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + 
%lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xf8E4M3FNUZ_times_1000x512xf8E4M3FNUZ_into_4x512xf32_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xf8E4M3FNUZ_times_1000x512xf8E4M3FNUZ_into_4x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 21 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, 
i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x1000xf8E4M3FNUZ_times_1000x4xf8E4M3FNUZ_into_512x4xf32_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x1000xf8E4M3FNUZ_times_1000x4xf8E4M3FNUZ_into_512x4xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, 
!hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xf8E4M3FNUZ_times_128x500xf8E4M3FNUZ_into_512x500xf32_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf8E4M3FNUZ_times_128x500xf8E4M3FNUZ_into_512x500xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf8E4M3FNUZ_times_330x512xf8E4M3FNUZ_into_457x512xf32_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = 
arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf8E4M3FNUZ_times_330x512xf8E4M3FNUZ_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf8E4M3FNUZ_times_330x514xf8E4M3FNUZ_into_457x514xf32_457_330_514_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
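// Descriptive comment: the non-accumulating variants pass a null accumulator view below; the checker presumably treats a null %acc as a zero-initialized accumulator, matching the linalg.fill in the generated function under test. +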
%acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf8E4M3FNUZ_times_330x514xf8E4M3FNUZ_into_457x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xf8E4M3FNUZ_times_330x514xf8E4M3FNUZ_into_438x514xf32_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xf8E4M3FNUZ_times_330x514xf8E4M3FNUZ_into_438x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xf8E4M3FNUZ_times_332x516xf8E4M3FNUZ_into_540x516xf32_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xf8E4M3FNUZ_times_332x516xf8E4M3FNUZ_into_540x516xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call 
@matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xf8E4M3FNUZ_times_321x234xf8E4M3FNUZ_into_654x234xf32_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xf8E4M3FNUZ_times_321x234xf8E4M3FNUZ_into_654x234xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + 
%result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xf8E4M3FNUZ_times_160x512xf8E4M3FNUZ_into_457x512xf32_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 50 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xf8E4M3FNUZ_times_160x512xf8E4M3FNUZ_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + 
return +} + +func.func @matmul_512x330xf8E4M3FNUZ_times_330x512xf8E4M3FNUZ_into_512x512xf32_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x330xf8E4M3FNUZ_times_330x512xf8E4M3FNUZ_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call 
@matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1000xf8E4M3FNUZ_times_1000x1000xf8E4M3FNUZ_into_1x1000xf32_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xf8E4M3FNUZ_times_1000x1000xf8E4M3FNUZ_into_1x1000xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = 
arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1000x1000xf8E4M3FNUZ_times_1000x1xf8E4M3FNUZ_into_1000x1xf32_1000_1000_1_acc_29() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xf8E4M3FNUZ_times_1000x1xf8E4M3FNUZ_into_1000x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1000_1000_1_30() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} 
{
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 1000 : i64
+  %lhs_dim1 = arith.constant 1000 : i64
+  %lhs_element_type = hal.element_type : i32
+  %lhs_seed = arith.constant 68 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1000 : i64
+  %rhs_dim1 = arith.constant 1 : i64
+  %rhs_element_type = hal.element_type : i32
+  %rhs_seed = arith.constant 69 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 1000 : i64
+  %k = arith.constant 1000 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 0 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @matmul_1000x1000xf8E4M3FNUZ_times_1000x1xf8E4M3FNUZ_into_1000x1xf32_1000_1000_1_31() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 1000 : i64
+  %lhs_dim1 = arith.constant 1000 : i64
+  %lhs_element_type = hal.element_type : i32
+  %lhs_seed = arith.constant 70 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1000 : i64
+  %rhs_dim1 = arith.constant 1 : i64
+  %rhs_element_type = hal.element_type : i32
+  %rhs_seed = arith.constant 71 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_1000x1000xf8E4M3FNUZ_times_1000x1xf8E4M3FNUZ_into_1000x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 1000 : i64
+  %k = arith.constant 1000 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 0 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
diff --git a/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_small.mlir b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_small.mlir
new file mode 100644
index 0000000..c5a2caa
--- /dev/null
+++ b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_small.mlir
@@ -0,0 +1,131 @@
+func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<?x?xf8E4M3FNUZ>, tensor<?x?xf8E4M3FNUZ>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_accumulate_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32(%lhs: tensor<1x1xf32>, %rhs: tensor<1x1xf32>, %acc: tensor<1x1xf32>) -> tensor<1x1xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<1x1xf32> to tensor<1x1xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<1x1xf32> to tensor<1x1xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<1x1xf8E4M3FNUZ>, tensor<1x1xf8E4M3FNUZ>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32>
+  return %result: tensor<1x1xf32>
+}
+
+func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf32>
+  %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf32>
+  %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %lhs_casted = arith.truncf %lhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<?x?xf8E4M3FNUZ>, tensor<?x?xf8E4M3FNUZ>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32(%lhs: tensor<1x1xf32>, %rhs: tensor<1x1xf32>) -> tensor<1x1xf32> {
+  %init_acc = tensor.empty() : tensor<1x1xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x1xf32>) -> tensor<1x1xf32>
+  %lhs_casted = arith.truncf %lhs: tensor<1x1xf32> to tensor<1x1xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<1x1xf32> to tensor<1x1xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<1x1xf8E4M3FNUZ>, tensor<1x1xf8E4M3FNUZ>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32>
+  return %result: tensor<1x1xf32>
+}
+
+func.func @matmul_accumulate_2x2xf8E4M3FNUZ_times_2x2xf8E4M3FNUZ_into_2x2xf32(%lhs: tensor<2x2xf32>, %rhs: tensor<2x2xf32>, %acc: tensor<2x2xf32>) -> tensor<2x2xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<2x2xf32> to tensor<2x2xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<2x2xf32> to tensor<2x2xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<2x2xf8E4M3FNUZ>, tensor<2x2xf8E4M3FNUZ>) outs(%acc: tensor<2x2xf32>) -> tensor<2x2xf32>
+  return %result: tensor<2x2xf32>
+}
+
+func.func @matmul_accumulate_4x4xf8E4M3FNUZ_times_4x4xf8E4M3FNUZ_into_4x4xf32(%lhs: tensor<4x4xf32>, %rhs: tensor<4x4xf32>, %acc: tensor<4x4xf32>) -> tensor<4x4xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<4x4xf32> to tensor<4x4xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<4x4xf32> to tensor<4x4xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<4x4xf8E4M3FNUZ>, tensor<4x4xf8E4M3FNUZ>) outs(%acc: tensor<4x4xf32>) -> tensor<4x4xf32>
+  return %result: tensor<4x4xf32>
+}
+
+func.func @matmul_accumulate_8x8xf8E4M3FNUZ_times_8x8xf8E4M3FNUZ_into_8x8xf32(%lhs: tensor<8x8xf32>, %rhs: tensor<8x8xf32>, %acc: tensor<8x8xf32>) -> tensor<8x8xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<8x8xf32> to tensor<8x8xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<8x8xf32> to tensor<8x8xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<8x8xf8E4M3FNUZ>, tensor<8x8xf8E4M3FNUZ>) outs(%acc: tensor<8x8xf32>) -> tensor<8x8xf32>
+  return %result: tensor<8x8xf32>
+}
+
+func.func @matmul_accumulate_9x9xf8E4M3FNUZ_times_9x9xf8E4M3FNUZ_into_9x9xf32(%lhs: tensor<9x9xf32>, %rhs: tensor<9x9xf32>, %acc: tensor<9x9xf32>) -> tensor<9x9xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<9x9xf32> to tensor<9x9xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<9x9xf32> to tensor<9x9xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<9x9xf8E4M3FNUZ>, tensor<9x9xf8E4M3FNUZ>) outs(%acc: tensor<9x9xf32>) -> tensor<9x9xf32>
+  return %result: tensor<9x9xf32>
+}
+
+func.func @matmul_accumulate_6x13xf8E4M3FNUZ_times_13x3xf8E4M3FNUZ_into_6x3xf32(%lhs: tensor<6x13xf32>, %rhs: tensor<13x3xf32>, %acc: tensor<6x3xf32>) -> tensor<6x3xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<6x13xf32> to tensor<6x13xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<13x3xf32> to tensor<13x3xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<6x13xf8E4M3FNUZ>, tensor<13x3xf8E4M3FNUZ>) outs(%acc: tensor<6x3xf32>) -> tensor<6x3xf32>
+  return %result: tensor<6x3xf32>
+}
+
+func.func @matmul_15x37xf8E4M3FNUZ_times_37x7xf8E4M3FNUZ_into_15x7xf32(%lhs: tensor<15x37xf32>, %rhs: tensor<37x7xf32>) -> tensor<15x7xf32> {
+  %init_acc = tensor.empty() : tensor<15x7xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<15x7xf32>) -> tensor<15x7xf32>
+  %lhs_casted = arith.truncf %lhs: tensor<15x37xf32> to tensor<15x37xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<37x7xf32> to tensor<37x7xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<15x37xf8E4M3FNUZ>, tensor<37x7xf8E4M3FNUZ>) outs(%acc: tensor<15x7xf32>) -> tensor<15x7xf32>
+  return %result: tensor<15x7xf32>
+}
+
+func.func @matmul_accumulate_81x19xf8E4M3FNUZ_times_19x41xf8E4M3FNUZ_into_81x41xf32(%lhs: tensor<81x19xf32>, %rhs: tensor<19x41xf32>, %acc: tensor<81x41xf32>) -> tensor<81x41xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<81x19xf32> to tensor<81x19xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<19x41xf32> to tensor<19x41xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<81x19xf8E4M3FNUZ>, tensor<19x41xf8E4M3FNUZ>) outs(%acc: tensor<81x41xf32>) -> tensor<81x41xf32>
+  return %result: tensor<81x41xf32>
+}
+
+func.func @matmul_accumulate_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32(%lhs: tensor<1x10xf32>, %rhs: tensor<10x10xf32>, %acc: tensor<1x10xf32>) -> tensor<1x10xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<1x10xf32> to tensor<1x10xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<10x10xf32> to tensor<10x10xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<1x10xf8E4M3FNUZ>, tensor<10x10xf8E4M3FNUZ>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32>
+  return %result: tensor<1x10xf32>
+}
+
+func.func @matmul_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32(%lhs: tensor<1x10xf32>, %rhs: tensor<10x10xf32>) -> tensor<1x10xf32> {
+  %init_acc = tensor.empty() : tensor<1x10xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x10xf32>) -> tensor<1x10xf32>
+  %lhs_casted = arith.truncf %lhs: tensor<1x10xf32> to tensor<1x10xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<10x10xf32> to tensor<10x10xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<1x10xf8E4M3FNUZ>, tensor<10x10xf8E4M3FNUZ>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32>
+  return %result: tensor<1x10xf32>
+}
+
+func.func @matmul_accumulate_10x1xf8E4M3FNUZ_times_1x10xf8E4M3FNUZ_into_10x10xf32(%lhs: tensor<10x1xf32>, %rhs: tensor<1x10xf32>, %acc: tensor<10x10xf32>) -> tensor<10x10xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<10x1xf32> to tensor<10x1xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<1x10xf32> to tensor<1x10xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<10x1xf8E4M3FNUZ>, tensor<1x10xf8E4M3FNUZ>) outs(%acc: tensor<10x10xf32>) -> tensor<10x10xf32>
+  return %result: tensor<10x10xf32>
+}
+
+func.func @matmul_accumulate_10x10xf8E4M3FNUZ_times_10x1xf8E4M3FNUZ_into_10x1xf32(%lhs: tensor<10x10xf32>, %rhs: tensor<10x1xf32>, %acc: tensor<10x1xf32>) -> tensor<10x1xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<10x10xf32> to tensor<10x10xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<10x1xf32> to tensor<10x1xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<10x10xf8E4M3FNUZ>, tensor<10x1xf8E4M3FNUZ>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32>
+  return %result: tensor<10x1xf32>
+}
+
+func.func @matmul_10x10xf8E4M3FNUZ_times_10x1xf8E4M3FNUZ_into_10x1xf32(%lhs: tensor<10x10xf32>, %rhs: tensor<10x1xf32>) -> tensor<10x1xf32> {
+  %init_acc = tensor.empty() : tensor<10x1xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<10x1xf32>) -> tensor<10x1xf32>
+  %lhs_casted = arith.truncf %lhs: tensor<10x10xf32> to tensor<10x10xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<10x1xf32> to tensor<10x1xf8E4M3FNUZ>
+  %result = linalg.matmul ins(%lhs_casted, %rhs_casted: tensor<10x10xf8E4M3FNUZ>, tensor<10x1xf8E4M3FNUZ>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32>
+  return %result: tensor<10x1xf32>
+}
+
diff --git a/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_small_calls.mlir b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_small_calls.mlir
new file mode 100644
index 0000000..758ec4a
--- /dev/null
+++ b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_f8E4M3FNUZ_into_f32_small_calls.mlir
@@ -0,0 +1,906 @@
+builtin.module @calls attributes {
+
+} {
+
+func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+
+func.func private @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_2x2xf8E4M3FNUZ_times_2x2xf8E4M3FNUZ_into_2x2xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_4x4xf8E4M3FNUZ_times_4x4xf8E4M3FNUZ_into_4x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private 
@module.matmul_accumulate_8x8xf8E4M3FNUZ_times_8x8xf8E4M3FNUZ_into_8x8xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_9x9xf8E4M3FNUZ_times_9x9xf8E4M3FNUZ_into_9x9xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_6x13xf8E4M3FNUZ_times_13x3xf8E4M3FNUZ_into_6x3xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_15x37xf8E4M3FNUZ_times_37x7xf8E4M3FNUZ_into_15x7xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_81x19xf8E4M3FNUZ_times_19x41xf8E4M3FNUZ_into_81x41xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x1xf8E4M3FNUZ_times_1x10xf8E4M3FNUZ_into_10x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x10xf8E4M3FNUZ_times_10x1xf8E4M3FNUZ_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_10x10xf8E4M3FNUZ_times_10x1xf8E4M3FNUZ_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1_1_1_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1_1_1_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + 
%result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, 
%acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xf8E4M3FNUZ_times_2x2xf8E4M3FNUZ_into_2x2xf32_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xf8E4M3FNUZ_times_2x2xf8E4M3FNUZ_into_2x2xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + 
%rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xf8E4M3FNUZ_times_4x4xf8E4M3FNUZ_into_4x4xf32_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_4x4xf8E4M3FNUZ_times_4x4xf8E4M3FNUZ_into_4x4xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, 
!hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_8_8_8_acc_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xf8E4M3FNUZ_times_8x8xf8E4M3FNUZ_into_8x8xf32_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 
= arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xf8E4M3FNUZ_times_8x8xf8E4M3FNUZ_into_8x8xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_9x9xf8E4M3FNUZ_times_9x9xf8E4M3FNUZ_into_9x9xf32_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = 
call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xf8E4M3FNUZ_times_9x9xf8E4M3FNUZ_into_9x9xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> 
!hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xf8E4M3FNUZ_times_13x3xf8E4M3FNUZ_into_6x3xf32_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_6x13xf8E4M3FNUZ_times_13x3xf8E4M3FNUZ_into_6x3xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call 
@module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xf8E4M3FNUZ_times_37x7xf8E4M3FNUZ_into_15x7xf32_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_15x37xf8E4M3FNUZ_times_37x7xf8E4M3FNUZ_into_15x7xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 48 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, 
%acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xf8E4M3FNUZ_times_19x41xf8E4M3FNUZ_into_81x41xf32_81_19_41_acc_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xf8E4M3FNUZ_times_19x41xf8E4M3FNUZ_into_81x41xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1_10_10_acc_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = 
arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, 
%k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1_10_10_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = 
hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x1xf8E4M3FNUZ_times_1x10xf8E4M3FNUZ_into_10x10xf32_10_1_10_acc_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x1xf8E4M3FNUZ_times_1x10xf8E4M3FNUZ_into_10x10xf32(%lhs, 
%rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_10_10_1_acc_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 70 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 70 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x10xf8E4M3FNUZ_times_10x1xf8E4M3FNUZ_into_10x1xf32_10_10_1_acc_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 71 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 72 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : 
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 73 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 73 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x10xf8E4M3FNUZ_times_10x1xf8E4M3FNUZ_into_10x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_10_10_1_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 74 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 75 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_10x10xf8E4M3FNUZ_times_10x1xf8E4M3FNUZ_into_10x1xf32_10_10_1_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 76 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 
77 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_10x10xf8E4M3FNUZ_times_10x1xf8E4M3FNUZ_into_10x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_large.mlir b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_large.mlir new file mode 100644 index 0000000..c2c9702 --- /dev/null +++ b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_large.mlir @@ -0,0 +1,172 @@ +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> { + %lhs_casted = arith.truncf %lhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<?x?xf8E4M3FNUZ>, tensor<?x?xf8E4M3FNUZ>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_accumulate_512x128xf8E4M3FNUZ_times_512x128xf8E4M3FNUZ_into_512x512xf32(%lhs: tensor<512x128xf32>, %rhs: tensor<512x128xf32>, %acc: tensor<512x512xf32>) -> tensor<512x512xf32> { + %lhs_casted = arith.truncf %lhs: tensor<512x128xf32> to tensor<512x128xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<512x128xf32> to tensor<512x128xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<512x128xf8E4M3FNUZ>, tensor<512x128xf8E4M3FNUZ>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>) -> tensor<?x?xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf32> + %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf32> + %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32> + %lhs_casted = arith.truncf %lhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<?x?xf8E4M3FNUZ>, tensor<?x?xf8E4M3FNUZ>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32> + return %result: tensor<?x?xf32> +} + +func.func @matmul_512x128xf8E4M3FNUZ_times_512x128xf8E4M3FNUZ_into_512x512xf32(%lhs: tensor<512x128xf32>, %rhs: tensor<512x128xf32>) -> tensor<512x512xf32> { + %init_acc = tensor.empty() : tensor<512x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32> + %lhs_casted = arith.truncf %lhs: tensor<512x128xf32> to tensor<512x128xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<512x128xf32> to tensor<512x128xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<512x128xf8E4M3FNUZ>, tensor<512x128xf8E4M3FNUZ>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result:
tensor<512x512xf32> +} + +func.func @matmul_1000x4xf8E4M3FNUZ_times_512x4xf8E4M3FNUZ_into_1000x512xf32(%lhs: tensor<1000x4xf32>, %rhs: tensor<512x4xf32>) -> tensor<1000x512xf32> { + %init_acc = tensor.empty() : tensor<1000x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x512xf32>) -> tensor<1000x512xf32> + %lhs_casted = arith.truncf %lhs: tensor<1000x4xf32> to tensor<1000x4xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<512x4xf32> to tensor<512x4xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<1000x4xf8E4M3FNUZ>, tensor<512x4xf8E4M3FNUZ>) outs(%acc: tensor<1000x512xf32>) -> tensor<1000x512xf32> + return %result: tensor<1000x512xf32> +} + +func.func @matmul_4x1000xf8E4M3FNUZ_times_512x1000xf8E4M3FNUZ_into_4x512xf32(%lhs: tensor<4x1000xf32>, %rhs: tensor<512x1000xf32>) -> tensor<4x512xf32> { + %init_acc = tensor.empty() : tensor<4x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<4x512xf32>) -> tensor<4x512xf32> + %lhs_casted = arith.truncf %lhs: tensor<4x1000xf32> to tensor<4x1000xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<512x1000xf32> to tensor<512x1000xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<4x1000xf8E4M3FNUZ>, tensor<512x1000xf8E4M3FNUZ>) outs(%acc: tensor<4x512xf32>) -> tensor<4x512xf32> + return %result: tensor<4x512xf32> +} + +func.func @matmul_512x1000xf8E4M3FNUZ_times_4x1000xf8E4M3FNUZ_into_512x4xf32(%lhs: tensor<512x1000xf32>, %rhs: tensor<4x1000xf32>) -> tensor<512x4xf32> { + %init_acc = tensor.empty() : tensor<512x4xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x4xf32>) -> tensor<512x4xf32> + %lhs_casted = arith.truncf %lhs: tensor<512x1000xf32> to tensor<512x1000xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<4x1000xf32> to tensor<4x1000xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<512x1000xf8E4M3FNUZ>, tensor<4x1000xf8E4M3FNUZ>) outs(%acc: tensor<512x4xf32>) -> tensor<512x4xf32> + return %result: tensor<512x4xf32> +} + +func.func @matmul_512x128xf8E4M3FNUZ_times_500x128xf8E4M3FNUZ_into_512x500xf32(%lhs: tensor<512x128xf32>, %rhs: tensor<500x128xf32>) -> tensor<512x500xf32> { + %init_acc = tensor.empty() : tensor<512x500xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x500xf32>) -> tensor<512x500xf32> + %lhs_casted = arith.truncf %lhs: tensor<512x128xf32> to tensor<512x128xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<500x128xf32> to tensor<500x128xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<512x128xf8E4M3FNUZ>, tensor<500x128xf8E4M3FNUZ>) outs(%acc: tensor<512x500xf32>) -> tensor<512x500xf32> + return %result: tensor<512x500xf32> +} + +func.func @matmul_457x330xf8E4M3FNUZ_times_512x330xf8E4M3FNUZ_into_457x512xf32(%lhs: tensor<457x330xf32>, %rhs: tensor<512x330xf32>) -> tensor<457x512xf32> { + %init_acc = tensor.empty() : tensor<457x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32> + %lhs_casted = arith.truncf %lhs: tensor<457x330xf32> to tensor<457x330xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<512x330xf32> to tensor<512x330xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b 
ins(%lhs_casted, %rhs_casted: tensor<457x330xf8E4M3FNUZ>, tensor<512x330xf8E4M3FNUZ>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32> + return %result: tensor<457x512xf32> +} + +func.func @matmul_457x330xf8E4M3FNUZ_times_514x330xf8E4M3FNUZ_into_457x514xf32(%lhs: tensor<457x330xf32>, %rhs: tensor<514x330xf32>) -> tensor<457x514xf32> { + %init_acc = tensor.empty() : tensor<457x514xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x514xf32>) -> tensor<457x514xf32> + %lhs_casted = arith.truncf %lhs: tensor<457x330xf32> to tensor<457x330xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<514x330xf32> to tensor<514x330xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<457x330xf8E4M3FNUZ>, tensor<514x330xf8E4M3FNUZ>) outs(%acc: tensor<457x514xf32>) -> tensor<457x514xf32> + return %result: tensor<457x514xf32> +} + +func.func @matmul_438x330xf8E4M3FNUZ_times_514x330xf8E4M3FNUZ_into_438x514xf32(%lhs: tensor<438x330xf32>, %rhs: tensor<514x330xf32>) -> tensor<438x514xf32> { + %init_acc = tensor.empty() : tensor<438x514xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<438x514xf32>) -> tensor<438x514xf32> + %lhs_casted = arith.truncf %lhs: tensor<438x330xf32> to tensor<438x330xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<514x330xf32> to tensor<514x330xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<438x330xf8E4M3FNUZ>, tensor<514x330xf8E4M3FNUZ>) outs(%acc: tensor<438x514xf32>) -> tensor<438x514xf32> + return %result: tensor<438x514xf32> +} + +func.func @matmul_540x332xf8E4M3FNUZ_times_516x332xf8E4M3FNUZ_into_540x516xf32(%lhs: tensor<540x332xf32>, %rhs: tensor<516x332xf32>) -> tensor<540x516xf32> { + %init_acc = tensor.empty() : tensor<540x516xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<540x516xf32>) -> tensor<540x516xf32> + %lhs_casted = arith.truncf %lhs: tensor<540x332xf32> to tensor<540x332xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<516x332xf32> to tensor<516x332xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<540x332xf8E4M3FNUZ>, tensor<516x332xf8E4M3FNUZ>) outs(%acc: tensor<540x516xf32>) -> tensor<540x516xf32> + return %result: tensor<540x516xf32> +} + +func.func @matmul_654x321xf8E4M3FNUZ_times_234x321xf8E4M3FNUZ_into_654x234xf32(%lhs: tensor<654x321xf32>, %rhs: tensor<234x321xf32>) -> tensor<654x234xf32> { + %init_acc = tensor.empty() : tensor<654x234xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<654x234xf32>) -> tensor<654x234xf32> + %lhs_casted = arith.truncf %lhs: tensor<654x321xf32> to tensor<654x321xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<234x321xf32> to tensor<234x321xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<654x321xf8E4M3FNUZ>, tensor<234x321xf8E4M3FNUZ>) outs(%acc: tensor<654x234xf32>) -> tensor<654x234xf32> + return %result: tensor<654x234xf32> +} + +func.func @matmul_457x160xf8E4M3FNUZ_times_512x160xf8E4M3FNUZ_into_457x512xf32(%lhs: tensor<457x160xf32>, %rhs: tensor<512x160xf32>) -> tensor<457x512xf32> { + %init_acc = tensor.empty() : tensor<457x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<457x512xf32>) -> tensor<457x512xf32> + %lhs_casted = 
arith.truncf %lhs: tensor<457x160xf32> to tensor<457x160xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<512x160xf32> to tensor<512x160xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<457x160xf8E4M3FNUZ>, tensor<512x160xf8E4M3FNUZ>) outs(%acc: tensor<457x512xf32>) -> tensor<457x512xf32> + return %result: tensor<457x512xf32> +} + +func.func @matmul_512x330xf8E4M3FNUZ_times_512x330xf8E4M3FNUZ_into_512x512xf32(%lhs: tensor<512x330xf32>, %rhs: tensor<512x330xf32>) -> tensor<512x512xf32> { + %init_acc = tensor.empty() : tensor<512x512xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<512x512xf32>) -> tensor<512x512xf32> + %lhs_casted = arith.truncf %lhs: tensor<512x330xf32> to tensor<512x330xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<512x330xf32> to tensor<512x330xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<512x330xf8E4M3FNUZ>, tensor<512x330xf8E4M3FNUZ>) outs(%acc: tensor<512x512xf32>) -> tensor<512x512xf32> + return %result: tensor<512x512xf32> +} + +func.func @matmul_accumulate_1x1000xf8E4M3FNUZ_times_1000x1000xf8E4M3FNUZ_into_1x1000xf32(%lhs: tensor<1x1000xf32>, %rhs: tensor<1000x1000xf32>, %acc: tensor<1x1000xf32>) -> tensor<1x1000xf32> { + %lhs_casted = arith.truncf %lhs: tensor<1x1000xf32> to tensor<1x1000xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<1000x1000xf32> to tensor<1000x1000xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<1x1000xf8E4M3FNUZ>, tensor<1000x1000xf8E4M3FNUZ>) outs(%acc: tensor<1x1000xf32>) -> tensor<1x1000xf32> + return %result: tensor<1x1000xf32> +} + +func.func @matmul_accumulate_1000x1000xf8E4M3FNUZ_times_1x1000xf8E4M3FNUZ_into_1000x1xf32(%lhs: tensor<1000x1000xf32>, %rhs: tensor<1x1000xf32>, %acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> { + %lhs_casted = arith.truncf %lhs: tensor<1000x1000xf32> to tensor<1000x1000xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<1x1000xf32> to tensor<1x1000xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<1000x1000xf8E4M3FNUZ>, tensor<1x1000xf8E4M3FNUZ>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> + return %result: tensor<1000x1xf32> +} + +func.func @matmul_1000x1000xf8E4M3FNUZ_times_1x1000xf8E4M3FNUZ_into_1000x1xf32(%lhs: tensor<1000x1000xf32>, %rhs: tensor<1x1000xf32>) -> tensor<1000x1xf32> { + %init_acc = tensor.empty() : tensor<1000x1xf32> + %c0_acc_type = arith.constant 0.0: f32 + %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1000x1xf32>) -> tensor<1000x1xf32> + %lhs_casted = arith.truncf %lhs: tensor<1000x1000xf32> to tensor<1000x1000xf8E4M3FNUZ> + %rhs_casted = arith.truncf %rhs: tensor<1x1000xf32> to tensor<1x1000xf8E4M3FNUZ> + %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<1000x1000xf8E4M3FNUZ>, tensor<1x1000xf8E4M3FNUZ>) outs(%acc: tensor<1000x1xf32>) -> tensor<1000x1xf32> + return %result: tensor<1000x1xf32> +} + diff --git a/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_large_calls.mlir b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_large_calls.mlir new file mode 100644 index 0000000..249c528 --- /dev/null +++ b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_large_calls.mlir @@ -0,0 +1,882 @@ +builtin.module @calls attributes { + +} { + +func.func private 
@matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_512x128xf8E4M3FNUZ_times_512x128xf8E4M3FNUZ_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf8E4M3FNUZ_times_512x128xf8E4M3FNUZ_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x4xf8E4M3FNUZ_times_512x4xf8E4M3FNUZ_into_1000x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_4x1000xf8E4M3FNUZ_times_512x1000xf8E4M3FNUZ_into_4x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x1000xf8E4M3FNUZ_times_4x1000xf8E4M3FNUZ_into_512x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xf8E4M3FNUZ_times_500x128xf8E4M3FNUZ_into_512x500xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf8E4M3FNUZ_times_512x330xf8E4M3FNUZ_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xf8E4M3FNUZ_times_514x330xf8E4M3FNUZ_into_457x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_438x330xf8E4M3FNUZ_times_514x330xf8E4M3FNUZ_into_438x514xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_540x332xf8E4M3FNUZ_times_516x332xf8E4M3FNUZ_into_540x516xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_654x321xf8E4M3FNUZ_times_234x321xf8E4M3FNUZ_into_654x234xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x160xf8E4M3FNUZ_times_512x160xf8E4M3FNUZ_into_457x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x330xf8E4M3FNUZ_times_512x330xf8E4M3FNUZ_into_512x512xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1000xf8E4M3FNUZ_times_1000x1000xf8E4M3FNUZ_into_1x1000xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1000x1000xf8E4M3FNUZ_times_1x1000xf8E4M3FNUZ_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x1000xf8E4M3FNUZ_times_1x1000xf8E4M3FNUZ_into_1000x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_512_128_512_acc_0() 
attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type<f8E4M3FNUZ> : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type<f8E4M3FNUZ> : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xf8E4M3FNUZ_times_512x128xf8E4M3FNUZ_into_512x512xf32_512_128_512_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type<f8E4M3FNUZ> : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type<f8E4M3FNUZ> : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed =
arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xf8E4M3FNUZ_times_512x128xf8E4M3FNUZ_into_512x512xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xf8E4M3FNUZ_times_512x128xf8E4M3FNUZ_into_512x512xf32_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf8E4M3FNUZ_times_512x128xf8E4M3FNUZ_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + 
%transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xf8E4M3FNUZ_times_512x4xf8E4M3FNUZ_into_1000x512xf32_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xf8E4M3FNUZ_times_512x4xf8E4M3FNUZ_into_1000x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index 
: !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xf8E4M3FNUZ_times_512x1000xf8E4M3FNUZ_into_4x512xf32_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xf8E4M3FNUZ_times_512x1000xf8E4M3FNUZ_into_4x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 21 : i32 + %rhs = call 
@matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x1000xf8E4M3FNUZ_times_4x1000xf8E4M3FNUZ_into_512x4xf32_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x1000xf8E4M3FNUZ_times_4x1000xf8E4M3FNUZ_into_512x4xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call 
@matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xf8E4M3FNUZ_times_500x128xf8E4M3FNUZ_into_512x500xf32_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xf8E4M3FNUZ_times_500x128xf8E4M3FNUZ_into_512x500xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf8E4M3FNUZ_times_512x330xf8E4M3FNUZ_into_457x512xf32_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 
= arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf8E4M3FNUZ_times_512x330xf8E4M3FNUZ_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xf8E4M3FNUZ_times_514x330xf8E4M3FNUZ_into_457x514xf32_457_330_514_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call 
@matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xf8E4M3FNUZ_times_514x330xf8E4M3FNUZ_into_457x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xf8E4M3FNUZ_times_514x330xf8E4M3FNUZ_into_438x514xf32_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xf8E4M3FNUZ_times_514x330xf8E4M3FNUZ_into_438x514xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call 
@matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xf8E4M3FNUZ_times_516x332xf8E4M3FNUZ_into_540x516xf32_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xf8E4M3FNUZ_times_516x332xf8E4M3FNUZ_into_540x516xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 
= arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xf8E4M3FNUZ_times_234x321xf8E4M3FNUZ_into_654x234xf32_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xf8E4M3FNUZ_times_234x321xf8E4M3FNUZ_into_654x234xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call 
@matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xf8E4M3FNUZ_times_512x160xf8E4M3FNUZ_into_457x512xf32_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 50 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xf8E4M3FNUZ_times_512x160xf8E4M3FNUZ_into_457x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call 
@matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x330xf8E4M3FNUZ_times_512x330xf8E4M3FNUZ_into_512x512xf32_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x330xf8E4M3FNUZ_times_512x330xf8E4M3FNUZ_into_512x512xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1000xf8E4M3FNUZ_times_1000x1000xf8E4M3FNUZ_into_1x1000xf32_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xf8E4M3FNUZ_times_1000x1000xf8E4M3FNUZ_into_1x1000xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, 
%rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1000x1000xf8E4M3FNUZ_times_1x1000xf8E4M3FNUZ_into_1000x1xf32_1000_1000_1_acc_29() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xf8E4M3FNUZ_times_1x1000xf8E4M3FNUZ_into_1000x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) 
-> ()
+  return
+}
+
+func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1000_1000_1_30() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 1000 : i64
+  %lhs_dim1 = arith.constant 1000 : i64
+  %lhs_element_type = hal.element_type<f8E4M3FNUZ> : i32
+  %lhs_seed = arith.constant 68 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 1000 : i64
+  %rhs_element_type = hal.element_type<f8E4M3FNUZ> : i32
+  %rhs_seed = arith.constant 69 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 1000 : i64
+  %k = arith.constant 1000 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @matmul_1000x1000xf8E4M3FNUZ_times_1x1000xf8E4M3FNUZ_into_1000x1xf32_1000_1000_1_31() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 1000 : i64
+  %lhs_dim1 = arith.constant 1000 : i64
+  %lhs_element_type = hal.element_type<f8E4M3FNUZ> : i32
+  %lhs_seed = arith.constant 70 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 1000 : i64
+  %rhs_element_type = hal.element_type<f8E4M3FNUZ> : i32
+  %rhs_seed = arith.constant 71 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_1000x1000xf8E4M3FNUZ_times_1x1000xf8E4M3FNUZ_into_1000x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 1000 : i64
+  %k = arith.constant 1000 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
diff --git a/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_small.mlir b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_small.mlir
new file mode 100644
index 0000000..6b56445
--- /dev/null
+++ b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_small.mlir
@@ -0,0 +1,131 @@
+func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<?x?xf8E4M3FNUZ>, tensor<?x?xf8E4M3FNUZ>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_accumulate_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32(%lhs: tensor<1x1xf32>, %rhs: tensor<1x1xf32>, %acc: tensor<1x1xf32>) -> tensor<1x1xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<1x1xf32> to tensor<1x1xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<1x1xf32> to tensor<1x1xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<1x1xf8E4M3FNUZ>, tensor<1x1xf8E4M3FNUZ>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32>
+  return %result: tensor<1x1xf32>
+}
+
+func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf32>
+  %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf32>
+  %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %lhs_casted = arith.truncf %lhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<?x?xf32> to tensor<?x?xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<?x?xf8E4M3FNUZ>, tensor<?x?xf8E4M3FNUZ>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %result: tensor<?x?xf32>
+}
+
+func.func @matmul_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32(%lhs: tensor<1x1xf32>, %rhs: tensor<1x1xf32>) -> tensor<1x1xf32> {
+  %init_acc = tensor.empty() : tensor<1x1xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x1xf32>) -> tensor<1x1xf32>
+  %lhs_casted = arith.truncf %lhs: tensor<1x1xf32> to tensor<1x1xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<1x1xf32> to tensor<1x1xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<1x1xf8E4M3FNUZ>, tensor<1x1xf8E4M3FNUZ>) outs(%acc: tensor<1x1xf32>) -> tensor<1x1xf32>
+  return %result: tensor<1x1xf32>
+}
+
+func.func @matmul_accumulate_2x2xf8E4M3FNUZ_times_2x2xf8E4M3FNUZ_into_2x2xf32(%lhs: tensor<2x2xf32>, %rhs: tensor<2x2xf32>, %acc: tensor<2x2xf32>) -> tensor<2x2xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<2x2xf32> to tensor<2x2xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<2x2xf32> to tensor<2x2xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<2x2xf8E4M3FNUZ>, tensor<2x2xf8E4M3FNUZ>) outs(%acc: tensor<2x2xf32>) -> tensor<2x2xf32>
+  return %result: tensor<2x2xf32>
+}
+
+func.func @matmul_accumulate_4x4xf8E4M3FNUZ_times_4x4xf8E4M3FNUZ_into_4x4xf32(%lhs: tensor<4x4xf32>, %rhs: tensor<4x4xf32>, %acc: tensor<4x4xf32>) -> tensor<4x4xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<4x4xf32> to tensor<4x4xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<4x4xf32> to tensor<4x4xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<4x4xf8E4M3FNUZ>, tensor<4x4xf8E4M3FNUZ>) outs(%acc: tensor<4x4xf32>) -> tensor<4x4xf32>
+  return %result: tensor<4x4xf32>
+}
+
+func.func @matmul_accumulate_8x8xf8E4M3FNUZ_times_8x8xf8E4M3FNUZ_into_8x8xf32(%lhs: tensor<8x8xf32>, %rhs: tensor<8x8xf32>, %acc: tensor<8x8xf32>) -> tensor<8x8xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<8x8xf32> to tensor<8x8xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<8x8xf32> to tensor<8x8xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<8x8xf8E4M3FNUZ>, tensor<8x8xf8E4M3FNUZ>) outs(%acc: tensor<8x8xf32>) -> tensor<8x8xf32>
+  return %result: tensor<8x8xf32>
+}
+
+func.func @matmul_accumulate_9x9xf8E4M3FNUZ_times_9x9xf8E4M3FNUZ_into_9x9xf32(%lhs: tensor<9x9xf32>, %rhs: tensor<9x9xf32>, %acc: tensor<9x9xf32>) -> tensor<9x9xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<9x9xf32> to tensor<9x9xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<9x9xf32> to tensor<9x9xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<9x9xf8E4M3FNUZ>, tensor<9x9xf8E4M3FNUZ>) outs(%acc: tensor<9x9xf32>) -> tensor<9x9xf32>
+  return %result: tensor<9x9xf32>
+}
+
+func.func @matmul_accumulate_6x13xf8E4M3FNUZ_times_3x13xf8E4M3FNUZ_into_6x3xf32(%lhs: tensor<6x13xf32>, %rhs: tensor<3x13xf32>, %acc: tensor<6x3xf32>) -> tensor<6x3xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<6x13xf32> to tensor<6x13xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<3x13xf32> to tensor<3x13xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<6x13xf8E4M3FNUZ>, tensor<3x13xf8E4M3FNUZ>) outs(%acc: tensor<6x3xf32>) -> tensor<6x3xf32>
+  return %result: tensor<6x3xf32>
+}
+
+func.func @matmul_15x37xf8E4M3FNUZ_times_7x37xf8E4M3FNUZ_into_15x7xf32(%lhs: tensor<15x37xf32>, %rhs: tensor<7x37xf32>) -> tensor<15x7xf32> {
+  %init_acc = tensor.empty() : tensor<15x7xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<15x7xf32>) -> tensor<15x7xf32>
+  %lhs_casted = arith.truncf %lhs: tensor<15x37xf32> to tensor<15x37xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<7x37xf32> to tensor<7x37xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<15x37xf8E4M3FNUZ>, tensor<7x37xf8E4M3FNUZ>) outs(%acc: tensor<15x7xf32>) -> tensor<15x7xf32>
+  return %result: tensor<15x7xf32>
+}
+
+func.func @matmul_accumulate_81x19xf8E4M3FNUZ_times_41x19xf8E4M3FNUZ_into_81x41xf32(%lhs: tensor<81x19xf32>, %rhs: tensor<41x19xf32>, %acc: tensor<81x41xf32>) -> tensor<81x41xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<81x19xf32> to tensor<81x19xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<41x19xf32> to tensor<41x19xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<81x19xf8E4M3FNUZ>, tensor<41x19xf8E4M3FNUZ>) outs(%acc: tensor<81x41xf32>) -> tensor<81x41xf32>
+  return %result: tensor<81x41xf32>
+}
+
+func.func @matmul_accumulate_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32(%lhs: tensor<1x10xf32>, %rhs: tensor<10x10xf32>, %acc: tensor<1x10xf32>) -> tensor<1x10xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<1x10xf32> to tensor<1x10xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<10x10xf32> to tensor<10x10xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<1x10xf8E4M3FNUZ>, tensor<10x10xf8E4M3FNUZ>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32>
+  return %result: tensor<1x10xf32>
+}
+
+func.func @matmul_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32(%lhs: tensor<1x10xf32>, %rhs: tensor<10x10xf32>) -> tensor<1x10xf32> {
+  %init_acc = tensor.empty() : tensor<1x10xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<1x10xf32>) -> tensor<1x10xf32>
+  %lhs_casted = arith.truncf %lhs: tensor<1x10xf32> to tensor<1x10xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<10x10xf32> to tensor<10x10xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<1x10xf8E4M3FNUZ>, tensor<10x10xf8E4M3FNUZ>) outs(%acc: tensor<1x10xf32>) -> tensor<1x10xf32>
+  return %result: tensor<1x10xf32>
+}
+
+func.func @matmul_accumulate_10x1xf8E4M3FNUZ_times_10x1xf8E4M3FNUZ_into_10x10xf32(%lhs: tensor<10x1xf32>, %rhs: tensor<10x1xf32>, %acc: tensor<10x10xf32>) -> tensor<10x10xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<10x1xf32> to tensor<10x1xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<10x1xf32> to tensor<10x1xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<10x1xf8E4M3FNUZ>, tensor<10x1xf8E4M3FNUZ>) outs(%acc: tensor<10x10xf32>) -> tensor<10x10xf32>
+  return %result: tensor<10x10xf32>
+}
+
+func.func @matmul_accumulate_10x10xf8E4M3FNUZ_times_1x10xf8E4M3FNUZ_into_10x1xf32(%lhs: tensor<10x10xf32>, %rhs: tensor<1x10xf32>, %acc: tensor<10x1xf32>) -> tensor<10x1xf32> {
+  %lhs_casted = arith.truncf %lhs: tensor<10x10xf32> to tensor<10x10xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<1x10xf32> to tensor<1x10xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<10x10xf8E4M3FNUZ>, tensor<1x10xf8E4M3FNUZ>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32>
+  return %result: tensor<10x1xf32>
+}
+
+func.func @matmul_10x10xf8E4M3FNUZ_times_1x10xf8E4M3FNUZ_into_10x1xf32(%lhs: tensor<10x10xf32>, %rhs: tensor<1x10xf32>) -> tensor<10x1xf32> {
+  %init_acc = tensor.empty() : tensor<10x1xf32>
+  %c0_acc_type = arith.constant 0.0: f32
+  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<10x1xf32>) -> tensor<10x1xf32>
+  %lhs_casted = arith.truncf %lhs: tensor<10x10xf32> to tensor<10x10xf8E4M3FNUZ>
+  %rhs_casted = arith.truncf %rhs: tensor<1x10xf32> to tensor<1x10xf8E4M3FNUZ>
+  %result = linalg.matmul_transpose_b ins(%lhs_casted, %rhs_casted: tensor<10x10xf8E4M3FNUZ>, tensor<1x10xf8E4M3FNUZ>) outs(%acc: tensor<10x1xf32>) -> tensor<10x1xf32>
+  return %result: tensor<10x1xf32>
+}
+
diff --git a/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_small_calls.mlir b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_small_calls.mlir
new file mode 100644
index 0000000..84f89f3
--- /dev/null
+++ b/linalg_ops/matmul/generated/f8E4M3FNUZ_into_f32/matmul_transpose_b_f8E4M3FNUZ_into_f32_small_calls.mlir
@@ -0,0 +1,906 @@
+builtin.module @calls attributes {
+
+} {
+
+func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+
+func.func private @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) ->
!hal.buffer_view +func.func private @module.matmul_accumulate_2x2xf8E4M3FNUZ_times_2x2xf8E4M3FNUZ_into_2x2xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_4x4xf8E4M3FNUZ_times_4x4xf8E4M3FNUZ_into_4x4xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_8x8xf8E4M3FNUZ_times_8x8xf8E4M3FNUZ_into_8x8xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_9x9xf8E4M3FNUZ_times_9x9xf8E4M3FNUZ_into_9x9xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_6x13xf8E4M3FNUZ_times_3x13xf8E4M3FNUZ_into_6x3xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_15x37xf8E4M3FNUZ_times_7x37xf8E4M3FNUZ_into_15x7xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_81x19xf8E4M3FNUZ_times_41x19xf8E4M3FNUZ_into_81x41xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x1xf8E4M3FNUZ_times_10x1xf8E4M3FNUZ_into_10x10xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x10xf8E4M3FNUZ_times_1x10xf8E4M3FNUZ_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_10x10xf8E4M3FNUZ_times_1x10xf8E4M3FNUZ_into_10x1xf32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1_1_1_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = 
hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1_1_1_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, 
i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x1xf8E4M3FNUZ_times_1x1xf8E4M3FNUZ_into_1x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, 
%acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xf8E4M3FNUZ_times_2x2xf8E4M3FNUZ_into_2x2xf32_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xf8E4M3FNUZ_times_2x2xf8E4M3FNUZ_into_2x2xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + 
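+  // Comment (descriptive of the generated code): the DYNxDYNx... ("dynamic shape") cases
+  // still materialize fixed-size buffers in the driver (4x4 for this test); only the
+  // compiled entry point under test declares its operands with dynamic dimensions.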
%lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xf8E4M3FNUZ_times_4x4xf8E4M3FNUZ_into_4x4xf32_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call 
@module.matmul_accumulate_4x4xf8E4M3FNUZ_times_4x4xf8E4M3FNUZ_into_4x4xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_8_8_8_acc_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xf8E4M3FNUZ_times_8x8xf8E4M3FNUZ_into_8x8xf32_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, 
%rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xf8E4M3FNUZ_times_8x8xf8E4M3FNUZ_into_8x8xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} 
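+
+// Note on the generated structure: each shape/variant below appears twice, once calling
+// the dynamic-shape (DYNxDYNx...) compiled entry point and once calling the statically
+// shaped entry point, with the trailing _acc_<N> / _<N> suffix acting as a running
+// test-case index. Accumulate tests seed an explicit %acc input, while non-accumulate
+// tests pass util.null as the accumulator argument to the checker.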
+ +func.func @matmul_accumulate_9x9xf8E4M3FNUZ_times_9x9xf8E4M3FNUZ_into_9x9xf32_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xf8E4M3FNUZ_times_9x9xf8E4M3FNUZ_into_9x9xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = 
hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xf8E4M3FNUZ_times_3x13xf8E4M3FNUZ_into_6x3xf32_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_6x13xf8E4M3FNUZ_times_3x13xf8E4M3FNUZ_into_6x3xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, 
%lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xf8E4M3FNUZ_times_7x37xf8E4M3FNUZ_into_15x7xf32_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_15x37xf8E4M3FNUZ_times_7x37xf8E4M3FNUZ_into_15x7xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 48 : i32 + 
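+  // %acc and %acc_copy are generated from the same seed (48) and so hold identical data:
+  // the accumulating module call receives %acc_copy, while the untouched %acc is passed
+  // to @matmul_test.check_matmul_results together with %lhs and %rhs.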
%acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xf8E4M3FNUZ_times_41x19xf8E4M3FNUZ_into_81x41xf32_81_19_41_acc_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xf8E4M3FNUZ_times_41x19xf8E4M3FNUZ_into_81x41xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1_10_10_acc_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + 
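+  // Transposed-RHS layout: %rhs below is generated as NxK (41x19) rather than KxN, and
+  // %transpose_rhs = 1 is forwarded to @matmul_test.check_matmul_results.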
%device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : 
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_1_10_10_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xf8E4M3FNUZ_times_10x10xf8E4M3FNUZ_into_1x10xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x1xf8E4M3FNUZ_times_10x1xf8E4M3FNUZ_into_10x10xf32_10_1_10_acc_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = 
arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x1xf8E4M3FNUZ_times_10x1xf8E4M3FNUZ_into_10x10xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_10_10_1_acc_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 70 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 70 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x10xf8E4M3FNUZ_times_1x10xf8E4M3FNUZ_into_10x1xf32_10_10_1_acc_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = 
arith.constant 71 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 72 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 73 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 73 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x10xf8E4M3FNUZ_times_1x10xf8E4M3FNUZ_into_10x1xf32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32_10_10_1_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 74 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 75 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxf8E4M3FNUZ_times_DYNxDYNxf8E4M3FNUZ_into_DYNxDYNxf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_10x10xf8E4M3FNUZ_times_1x10xf8E4M3FNUZ_into_10x1xf32_10_10_1_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 
= arith.constant 10 : i64
+  %lhs_dim1 = arith.constant 10 : i64
+  %lhs_element_type = hal.element_type<f8E4M3FNUZ> : i32
+  %lhs_seed = arith.constant 76 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 10 : i64
+  %rhs_element_type = hal.element_type<f8E4M3FNUZ> : i32
+  %rhs_seed = arith.constant 77 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_10x10xf8E4M3FNUZ_times_1x10xf8E4M3FNUZ_into_10x1xf32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 10 : i64
+  %k = arith.constant 10 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
diff --git a/linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_large.mlir b/linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_large.mlir
new file mode 100644
index 0000000..5fa3c90
--- /dev/null
+++ b/linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_large.mlir
@@ -0,0 +1,136 @@
+func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: tensor<?x?xi8>, %rhs: tensor<?x?xi8>, %acc: tensor<?x?xi32>) -> tensor<?x?xi32> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xi8>, tensor<?x?xi8>) outs(%acc: tensor<?x?xi32>) -> tensor<?x?xi32>
+  return %result: tensor<?x?xi32>
+}
+
+func.func @matmul_accumulate_512x128xi8_times_128x512xi8_into_512x512xi32(%lhs: tensor<512x128xi8>, %rhs: tensor<128x512xi8>, %acc: tensor<512x512xi32>) -> tensor<512x512xi32> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xi8>, tensor<128x512xi8>) outs(%acc: tensor<512x512xi32>) -> tensor<512x512xi32>
+  return %result: tensor<512x512xi32>
+}
+
+func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: tensor<?x?xi8>, %rhs: tensor<?x?xi8>) -> tensor<?x?xi32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xi8>
+  %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xi8>
+  %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xi32>
+  %c0_acc_type = arith.constant 0: i32
+  %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<?x?xi32>) -> tensor<?x?xi32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xi8>, tensor<?x?xi8>) outs(%acc: tensor<?x?xi32>) -> tensor<?x?xi32>
+  return %result: tensor<?x?xi32>
+}
+
+func.func @matmul_512x128xi8_times_128x512xi8_into_512x512xi32(%lhs: tensor<512x128xi8>, %rhs: tensor<128x512xi8>) -> tensor<512x512xi32> {
+  %init_acc = tensor.empty() : tensor<512x512xi32>
+  %c0_acc_type = arith.constant 0: i32
+  %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<512x512xi32>) -> tensor<512x512xi32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xi8>, tensor<128x512xi8>) outs(%acc: tensor<512x512xi32>) -> tensor<512x512xi32>
+  return %result: tensor<512x512xi32>
+}
+
+func.func @matmul_1000x4xi8_times_4x512xi8_into_1000x512xi32(%lhs: tensor<1000x4xi8>, %rhs: tensor<4x512xi8>) -> tensor<1000x512xi32> {
+  %init_acc = tensor.empty() : tensor<1000x512xi32>
+  %c0_acc_type = arith.constant 0: i32
+  %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<1000x512xi32>) -> tensor<1000x512xi32>
+  %result = linalg.matmul
ins(%lhs, %rhs: tensor<1000x4xi8>, tensor<4x512xi8>) outs(%acc: tensor<1000x512xi32>) -> tensor<1000x512xi32> + return %result: tensor<1000x512xi32> +} + +func.func @matmul_4x1000xi8_times_1000x512xi8_into_4x512xi32(%lhs: tensor<4x1000xi8>, %rhs: tensor<1000x512xi8>) -> tensor<4x512xi32> { + %init_acc = tensor.empty() : tensor<4x512xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<4x512xi32>) -> tensor<4x512xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<4x1000xi8>, tensor<1000x512xi8>) outs(%acc: tensor<4x512xi32>) -> tensor<4x512xi32> + return %result: tensor<4x512xi32> +} + +func.func @matmul_512x1000xi8_times_1000x4xi8_into_512x4xi32(%lhs: tensor<512x1000xi8>, %rhs: tensor<1000x4xi8>) -> tensor<512x4xi32> { + %init_acc = tensor.empty() : tensor<512x4xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<512x4xi32>) -> tensor<512x4xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x1000xi8>, tensor<1000x4xi8>) outs(%acc: tensor<512x4xi32>) -> tensor<512x4xi32> + return %result: tensor<512x4xi32> +} + +func.func @matmul_512x128xi8_times_128x500xi8_into_512x500xi32(%lhs: tensor<512x128xi8>, %rhs: tensor<128x500xi8>) -> tensor<512x500xi32> { + %init_acc = tensor.empty() : tensor<512x500xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<512x500xi32>) -> tensor<512x500xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x128xi8>, tensor<128x500xi8>) outs(%acc: tensor<512x500xi32>) -> tensor<512x500xi32> + return %result: tensor<512x500xi32> +} + +func.func @matmul_457x330xi8_times_330x512xi8_into_457x512xi32(%lhs: tensor<457x330xi8>, %rhs: tensor<330x512xi8>) -> tensor<457x512xi32> { + %init_acc = tensor.empty() : tensor<457x512xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<457x512xi32>) -> tensor<457x512xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x330xi8>, tensor<330x512xi8>) outs(%acc: tensor<457x512xi32>) -> tensor<457x512xi32> + return %result: tensor<457x512xi32> +} + +func.func @matmul_457x330xi8_times_330x514xi8_into_457x514xi32(%lhs: tensor<457x330xi8>, %rhs: tensor<330x514xi8>) -> tensor<457x514xi32> { + %init_acc = tensor.empty() : tensor<457x514xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<457x514xi32>) -> tensor<457x514xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x330xi8>, tensor<330x514xi8>) outs(%acc: tensor<457x514xi32>) -> tensor<457x514xi32> + return %result: tensor<457x514xi32> +} + +func.func @matmul_438x330xi8_times_330x514xi8_into_438x514xi32(%lhs: tensor<438x330xi8>, %rhs: tensor<330x514xi8>) -> tensor<438x514xi32> { + %init_acc = tensor.empty() : tensor<438x514xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<438x514xi32>) -> tensor<438x514xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<438x330xi8>, tensor<330x514xi8>) outs(%acc: tensor<438x514xi32>) -> tensor<438x514xi32> + return %result: tensor<438x514xi32> +} + +func.func @matmul_540x332xi8_times_332x516xi8_into_540x516xi32(%lhs: tensor<540x332xi8>, %rhs: tensor<332x516xi8>) -> tensor<540x516xi32> { + %init_acc = tensor.empty() : tensor<540x516xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<540x516xi32>) -> 
tensor<540x516xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<540x332xi8>, tensor<332x516xi8>) outs(%acc: tensor<540x516xi32>) -> tensor<540x516xi32> + return %result: tensor<540x516xi32> +} + +func.func @matmul_654x321xi8_times_321x234xi8_into_654x234xi32(%lhs: tensor<654x321xi8>, %rhs: tensor<321x234xi8>) -> tensor<654x234xi32> { + %init_acc = tensor.empty() : tensor<654x234xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<654x234xi32>) -> tensor<654x234xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<654x321xi8>, tensor<321x234xi8>) outs(%acc: tensor<654x234xi32>) -> tensor<654x234xi32> + return %result: tensor<654x234xi32> +} + +func.func @matmul_457x160xi8_times_160x512xi8_into_457x512xi32(%lhs: tensor<457x160xi8>, %rhs: tensor<160x512xi8>) -> tensor<457x512xi32> { + %init_acc = tensor.empty() : tensor<457x512xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<457x512xi32>) -> tensor<457x512xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<457x160xi8>, tensor<160x512xi8>) outs(%acc: tensor<457x512xi32>) -> tensor<457x512xi32> + return %result: tensor<457x512xi32> +} + +func.func @matmul_512x330xi8_times_330x512xi8_into_512x512xi32(%lhs: tensor<512x330xi8>, %rhs: tensor<330x512xi8>) -> tensor<512x512xi32> { + %init_acc = tensor.empty() : tensor<512x512xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<512x512xi32>) -> tensor<512x512xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<512x330xi8>, tensor<330x512xi8>) outs(%acc: tensor<512x512xi32>) -> tensor<512x512xi32> + return %result: tensor<512x512xi32> +} + +func.func @matmul_accumulate_1x1000xi8_times_1000x1000xi8_into_1x1000xi32(%lhs: tensor<1x1000xi8>, %rhs: tensor<1000x1000xi8>, %acc: tensor<1x1000xi32>) -> tensor<1x1000xi32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1000xi8>, tensor<1000x1000xi8>) outs(%acc: tensor<1x1000xi32>) -> tensor<1x1000xi32> + return %result: tensor<1x1000xi32> +} + +func.func @matmul_accumulate_1000x1000xi8_times_1000x1xi8_into_1000x1xi32(%lhs: tensor<1000x1000xi8>, %rhs: tensor<1000x1xi8>, %acc: tensor<1000x1xi32>) -> tensor<1000x1xi32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x1000xi8>, tensor<1000x1xi8>) outs(%acc: tensor<1000x1xi32>) -> tensor<1000x1xi32> + return %result: tensor<1000x1xi32> +} + +func.func @matmul_1000x1000xi8_times_1000x1xi8_into_1000x1xi32(%lhs: tensor<1000x1000xi8>, %rhs: tensor<1000x1xi8>) -> tensor<1000x1xi32> { + %init_acc = tensor.empty() : tensor<1000x1xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<1000x1xi32>) -> tensor<1000x1xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1000x1000xi8>, tensor<1000x1xi8>) outs(%acc: tensor<1000x1xi32>) -> tensor<1000x1xi32> + return %result: tensor<1000x1xi32> +} + diff --git a/linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_large_calls.mlir b/linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_large_calls.mlir new file mode 100644 index 0000000..575772e --- /dev/null +++ b/linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_large_calls.mlir @@ -0,0 +1,882 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private 
@matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+
+func.func private @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_512x128xi8_times_128x512xi8_into_512x512xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_512x128xi8_times_128x512xi8_into_512x512xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_1000x4xi8_times_4x512xi8_into_1000x512xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_4x1000xi8_times_1000x512xi8_into_4x512xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_512x1000xi8_times_1000x4xi8_into_512x4xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_512x128xi8_times_128x500xi8_into_512x500xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_457x330xi8_times_330x512xi8_into_457x512xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_457x330xi8_times_330x514xi8_into_457x514xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_438x330xi8_times_330x514xi8_into_438x514xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_540x332xi8_times_332x516xi8_into_540x516xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_654x321xi8_times_321x234xi8_into_654x234xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_457x160xi8_times_160x512xi8_into_457x512xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_512x330xi8_times_330x512xi8_into_512x512xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_1x1000xi8_times_1000x1000xi8_into_1x1000xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_accumulate_1000x1000xi8_times_1000x1xi8_into_1000x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.matmul_1000x1000xi8_times_1000x1xi8_into_1000x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_512_128_512_acc_0() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 512 : i64
+  %lhs_dim1 = arith.constant 128 : i64
+  %lhs_element_type = hal.element_type<i8> : i32
+  %lhs_seed = arith.constant 2 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) :
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xi8_times_128x512xi8_into_512x512xi32_512_128_512_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xi8_times_128x512xi8_into_512x512xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 
: i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xi8_times_128x512xi8_into_512x512xi32_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xi8_times_128x512xi8_into_512x512xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + 
%lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xi8_times_4x512xi8_into_1000x512xi32_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xi8_times_4x512xi8_into_1000x512xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xi8_times_1000x512xi8_into_4x512xi32_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xi8_times_1000x512xi8_into_4x512xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 21 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_512x1000xi8_times_1000x4xi8_into_512x4xi32_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x1000xi8_times_1000x4xi8_into_512x4xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xi8_times_128x500xi8_into_512x500xi32_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%rhs_dim0 = arith.constant 128 : i64 + %rhs_dim1 = arith.constant 500 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xi8_times_128x500xi8_into_512x500xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xi8_times_330x512xi8_into_457x512xi32_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xi8_times_330x512xi8_into_457x512xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = 
arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xi8_times_330x514xi8_into_457x514xi32_457_330_514_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xi8_times_330x514xi8_into_457x514xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = 
arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xi8_times_330x514xi8_into_438x514xi32_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 514 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xi8_times_330x514xi8_into_438x514xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : 
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xi8_times_332x516xi8_into_540x516xi32_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 332 : i64 + %rhs_dim1 = arith.constant 516 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xi8_times_332x516xi8_into_540x516xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, 
!hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xi8_times_321x234xi8_into_654x234xi32_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 321 : i64 + %rhs_dim1 = arith.constant 234 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xi8_times_321x234xi8_into_654x234xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xi8_times_160x512xi8_into_457x512xi32_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 50 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : 
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 160 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xi8_times_160x512xi8_into_457x512xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x330xi8_times_330x512xi8_into_512x512xi32_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 330 : i64 + %rhs_dim1 = arith.constant 512 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x330xi8_times_330x512xi8_into_512x512xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = 
arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1000xi8_times_1000x1000xi8_into_1x1000xi32_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 
= arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xi8_times_1000x1000xi8_into_1x1000xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1000x1000xi8_times_1000x1xi8_into_1000x1xi32_1000_1000_1_acc_29() attributes { + iree.reflection = {description = 
"Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xi8_times_1000x1xi8_into_1000x1xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1000_1000_1_30() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_1000x1000xi8_times_1000x1xi8_into_1000x1xi32_1000_1000_1_31() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 1000 : i64
+  %lhs_dim1 = arith.constant 1000 : i64
+  %lhs_element_type = hal.element_type<i8> : i32
+  %lhs_seed = arith.constant 70 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1000 : i64
+  %rhs_dim1 = arith.constant 1 : i64
+  %rhs_element_type = hal.element_type<i8> : i32
+  %rhs_seed = arith.constant 71 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_1000x1000xi8_times_1000x1xi8_into_1000x1xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 1000 : i64
+  %k = arith.constant 1000 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 0 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
diff --git a/linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_small.mlir b/linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_small.mlir
new file mode 100644
index 0000000..b89d848
--- /dev/null
+++ b/linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_small.mlir
@@ -0,0 +1,99 @@
+func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: tensor<?x?xi8>, %rhs: tensor<?x?xi8>, %acc: tensor<?x?xi32>) -> tensor<?x?xi32> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xi8>, tensor<?x?xi8>) outs(%acc: tensor<?x?xi32>) -> tensor<?x?xi32>
+  return %result: tensor<?x?xi32>
+}
+
+func.func @matmul_accumulate_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs: tensor<1x1xi8>, %rhs: tensor<1x1xi8>, %acc: tensor<1x1xi32>) -> tensor<1x1xi32> {
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1xi8>, tensor<1x1xi8>) outs(%acc: tensor<1x1xi32>) -> tensor<1x1xi32>
+  return %result: tensor<1x1xi32>
+}
+
+func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: tensor<?x?xi8>, %rhs: tensor<?x?xi8>) -> tensor<?x?xi32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xi8>
+  %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xi8>
+  %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xi32>
+  %c0_acc_type = arith.constant 0: i32
+  %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<?x?xi32>) -> tensor<?x?xi32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xi8>, tensor<?x?xi8>) outs(%acc: tensor<?x?xi32>) -> tensor<?x?xi32>
+  return %result: tensor<?x?xi32>
+}
+
+func.func @matmul_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs: tensor<1x1xi8>, %rhs: tensor<1x1xi8>) -> tensor<1x1xi32> {
+  %init_acc = tensor.empty() : tensor<1x1xi32>
+  %c0_acc_type = arith.constant 0: i32
+  %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<1x1xi32>) -> tensor<1x1xi32>
+  %result = linalg.matmul ins(%lhs, %rhs: tensor<1x1xi8>, tensor<1x1xi8>) outs(%acc: tensor<1x1xi32>) -> tensor<1x1xi32>
+  return %result: tensor<1x1xi32>
+}
+
+func.func @matmul_accumulate_2x2xi8_times_2x2xi8_into_2x2xi32(%lhs: tensor<2x2xi8>, %rhs: tensor<2x2xi8>, %acc: tensor<2x2xi32>) -> tensor<2x2xi32> {
+  %result = linalg.matmul ins(%lhs, %rhs:
tensor<2x2xi8>, tensor<2x2xi8>) outs(%acc: tensor<2x2xi32>) -> tensor<2x2xi32> + return %result: tensor<2x2xi32> +} + +func.func @matmul_accumulate_4x4xi8_times_4x4xi8_into_4x4xi32(%lhs: tensor<4x4xi8>, %rhs: tensor<4x4xi8>, %acc: tensor<4x4xi32>) -> tensor<4x4xi32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<4x4xi8>, tensor<4x4xi8>) outs(%acc: tensor<4x4xi32>) -> tensor<4x4xi32> + return %result: tensor<4x4xi32> +} + +func.func @matmul_accumulate_8x8xi8_times_8x8xi8_into_8x8xi32(%lhs: tensor<8x8xi8>, %rhs: tensor<8x8xi8>, %acc: tensor<8x8xi32>) -> tensor<8x8xi32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<8x8xi8>, tensor<8x8xi8>) outs(%acc: tensor<8x8xi32>) -> tensor<8x8xi32> + return %result: tensor<8x8xi32> +} + +func.func @matmul_accumulate_9x9xi8_times_9x9xi8_into_9x9xi32(%lhs: tensor<9x9xi8>, %rhs: tensor<9x9xi8>, %acc: tensor<9x9xi32>) -> tensor<9x9xi32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<9x9xi8>, tensor<9x9xi8>) outs(%acc: tensor<9x9xi32>) -> tensor<9x9xi32> + return %result: tensor<9x9xi32> +} + +func.func @matmul_accumulate_6x13xi8_times_13x3xi8_into_6x3xi32(%lhs: tensor<6x13xi8>, %rhs: tensor<13x3xi8>, %acc: tensor<6x3xi32>) -> tensor<6x3xi32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<6x13xi8>, tensor<13x3xi8>) outs(%acc: tensor<6x3xi32>) -> tensor<6x3xi32> + return %result: tensor<6x3xi32> +} + +func.func @matmul_15x37xi8_times_37x7xi8_into_15x7xi32(%lhs: tensor<15x37xi8>, %rhs: tensor<37x7xi8>) -> tensor<15x7xi32> { + %init_acc = tensor.empty() : tensor<15x7xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<15x7xi32>) -> tensor<15x7xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<15x37xi8>, tensor<37x7xi8>) outs(%acc: tensor<15x7xi32>) -> tensor<15x7xi32> + return %result: tensor<15x7xi32> +} + +func.func @matmul_accumulate_81x19xi8_times_19x41xi8_into_81x41xi32(%lhs: tensor<81x19xi8>, %rhs: tensor<19x41xi8>, %acc: tensor<81x41xi32>) -> tensor<81x41xi32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<81x19xi8>, tensor<19x41xi8>) outs(%acc: tensor<81x41xi32>) -> tensor<81x41xi32> + return %result: tensor<81x41xi32> +} + +func.func @matmul_accumulate_1x10xi8_times_10x10xi8_into_1x10xi32(%lhs: tensor<1x10xi8>, %rhs: tensor<10x10xi8>, %acc: tensor<1x10xi32>) -> tensor<1x10xi32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x10xi8>, tensor<10x10xi8>) outs(%acc: tensor<1x10xi32>) -> tensor<1x10xi32> + return %result: tensor<1x10xi32> +} + +func.func @matmul_1x10xi8_times_10x10xi8_into_1x10xi32(%lhs: tensor<1x10xi8>, %rhs: tensor<10x10xi8>) -> tensor<1x10xi32> { + %init_acc = tensor.empty() : tensor<1x10xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<1x10xi32>) -> tensor<1x10xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<1x10xi8>, tensor<10x10xi8>) outs(%acc: tensor<1x10xi32>) -> tensor<1x10xi32> + return %result: tensor<1x10xi32> +} + +func.func @matmul_accumulate_10x1xi8_times_1x10xi8_into_10x10xi32(%lhs: tensor<10x1xi8>, %rhs: tensor<1x10xi8>, %acc: tensor<10x10xi32>) -> tensor<10x10xi32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x1xi8>, tensor<1x10xi8>) outs(%acc: tensor<10x10xi32>) -> tensor<10x10xi32> + return %result: tensor<10x10xi32> +} + +func.func @matmul_accumulate_10x10xi8_times_10x1xi8_into_10x1xi32(%lhs: tensor<10x10xi8>, %rhs: tensor<10x1xi8>, %acc: tensor<10x1xi32>) -> tensor<10x1xi32> { + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x10xi8>, tensor<10x1xi8>) 
outs(%acc: tensor<10x1xi32>) -> tensor<10x1xi32> + return %result: tensor<10x1xi32> +} + +func.func @matmul_10x10xi8_times_10x1xi8_into_10x1xi32(%lhs: tensor<10x10xi8>, %rhs: tensor<10x1xi8>) -> tensor<10x1xi32> { + %init_acc = tensor.empty() : tensor<10x1xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<10x1xi32>) -> tensor<10x1xi32> + %result = linalg.matmul ins(%lhs, %rhs: tensor<10x10xi8>, tensor<10x1xi8>) outs(%acc: tensor<10x1xi32>) -> tensor<10x1xi32> + return %result: tensor<10x1xi32> +} + diff --git a/linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_small_calls.mlir b/linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_small_calls.mlir new file mode 100644 index 0000000..3b93cbe --- /dev/null +++ b/linalg_ops/matmul/generated/i8_into_i32/matmul_i8_into_i32_small_calls.mlir @@ -0,0 +1,906 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_2x2xi8_times_2x2xi8_into_2x2xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_4x4xi8_times_4x4xi8_into_4x4xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_8x8xi8_times_8x8xi8_into_8x8xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_9x9xi8_times_9x9xi8_into_9x9xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_6x13xi8_times_13x3xi8_into_6x3xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_15x37xi8_times_37x7xi8_into_15x7xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_81x19xi8_times_19x41xi8_into_81x41xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x10xi8_times_10x10xi8_into_1x10xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x10xi8_times_10x10xi8_into_1x10xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x1xi8_times_1x10xi8_into_10x10xi32(%lhs: !hal.buffer_view, %rhs: 
!hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x10xi8_times_10x1xi8_into_10x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_10x10xi8_times_10x1xi8_into_10x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1_1_1_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1xi8_times_1x1xi8_into_1x1xi32_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = 
hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1_1_1_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xi8_times_1x1xi8_into_1x1xi32_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call 
@module.matmul_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xi8_times_2x2xi8_into_2x2xi32_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xi8_times_2x2xi8_into_2x2xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xi8_times_4x4xi8_into_4x4xi32_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul 
shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_4x4xi8_times_4x4xi8_into_4x4xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_8_8_8_acc_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, 
%acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xi8_times_8x8xi8_into_8x8xi32_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xi8_times_8x8xi8_into_8x8xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call 
@matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_9x9xi8_times_9x9xi8_into_9x9xi32_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xi8_times_9x9xi8_into_9x9xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xi8_times_13x3xi8_into_6x3xi32_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 13 : i64 + %rhs_dim1 = arith.constant 3 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = 
arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_6x13xi8_times_13x3xi8_into_6x3xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xi8_times_37x7xi8_into_15x7xi32_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 37 : i64 + %rhs_dim1 = arith.constant 7 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_15x37xi8_times_37x7xi8_into_15x7xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : 
(!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 48 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xi8_times_19x41xi8_into_81x41xi32_81_19_41_acc_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 19 : i64 + %rhs_dim1 = arith.constant 41 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> 
!hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xi8_times_19x41xi8_into_81x41xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1_10_10_acc_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x10xi8_times_10x10xi8_into_1x10xi32_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = 
call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x10xi8_times_10x10xi8_into_1x10xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1_10_10_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xi8_times_10x10xi8_into_1x10xi32_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = 
hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xi8_times_10x10xi8_into_1x10xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x1xi8_times_1x10xi8_into_10x10xi32_10_1_10_acc_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + 
%lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x1xi8_times_1x10xi8_into_10x10xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_10_10_1_acc_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 70 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 70 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call 
@module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_10x10xi8_times_10x1xi8_into_10x1xi32_10_10_1_acc_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 71 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 72 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 73 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 10 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 73 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_10x10xi8_times_10x1xi8_into_10x1xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 10 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 0 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_10_10_1_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 74 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 75 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, 
%rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+ %acc = util.null : !hal.buffer_view
+ %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+ %m = arith.constant 10 : i64
+ %k = arith.constant 10 : i64
+ %n = arith.constant 1 : i64
+ %transpose_rhs = arith.constant 0 : i32
+ call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+ return
+}
+
+func.func @matmul_10x10xi8_times_10x1xi8_into_10x1xi32_10_10_1_27() attributes {
+ iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"}
+} {
+ %device_index = arith.constant 0 : index
+ %device = hal.devices.get %device_index : !hal.device
+ %lhs_dim0 = arith.constant 10 : i64
+ %lhs_dim1 = arith.constant 10 : i64
+ %lhs_element_type = hal.element_type<i8> : i32
+ %lhs_seed = arith.constant 76 : i32
+ %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+ %rhs_dim0 = arith.constant 10 : i64
+ %rhs_dim1 = arith.constant 1 : i64
+ %rhs_element_type = hal.element_type<i8> : i32
+ %rhs_seed = arith.constant 77 : i32
+ %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+ %acc = util.null : !hal.buffer_view
+ %result = call @module.matmul_10x10xi8_times_10x1xi8_into_10x1xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+ %m = arith.constant 10 : i64
+ %k = arith.constant 10 : i64
+ %n = arith.constant 1 : i64
+ %transpose_rhs = arith.constant 0 : i32
+ call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+ return
+}
+
+
+}
diff --git a/linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_large.mlir b/linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_large.mlir
new file mode 100644
index 0000000..1879a13
--- /dev/null
+++ b/linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_large.mlir
@@ -0,0 +1,136 @@
+func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: tensor<?x?xi8>, %rhs: tensor<?x?xi8>, %acc: tensor<?x?xi32>) -> tensor<?x?xi32> {
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xi8>, tensor<?x?xi8>) outs(%acc: tensor<?x?xi32>) -> tensor<?x?xi32>
+ return %result: tensor<?x?xi32>
+}
+
+func.func @matmul_accumulate_512x128xi8_times_512x128xi8_into_512x512xi32(%lhs: tensor<512x128xi8>, %rhs: tensor<512x128xi8>, %acc: tensor<512x512xi32>) -> tensor<512x512xi32> {
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xi8>, tensor<512x128xi8>) outs(%acc: tensor<512x512xi32>) -> tensor<512x512xi32>
+ return %result: tensor<512x512xi32>
+}
+
+func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: tensor<?x?xi8>, %rhs: tensor<?x?xi8>) -> tensor<?x?xi32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xi8>
+ %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xi8>
+ %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xi32>
+ %c0_acc_type = arith.constant 0: i32
+ %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<?x?xi32>) -> tensor<?x?xi32>
+ %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xi8>, tensor<?x?xi8>)
outs(%acc: tensor<?x?xi32>) -> tensor<?x?xi32> + return %result: tensor<?x?xi32> +} + +func.func @matmul_512x128xi8_times_512x128xi8_into_512x512xi32(%lhs: tensor<512x128xi8>, %rhs: tensor<512x128xi8>) -> tensor<512x512xi32> { + %init_acc = tensor.empty() : tensor<512x512xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<512x512xi32>) -> tensor<512x512xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xi8>, tensor<512x128xi8>) outs(%acc: tensor<512x512xi32>) -> tensor<512x512xi32> + return %result: tensor<512x512xi32> +} + +func.func @matmul_1000x4xi8_times_512x4xi8_into_1000x512xi32(%lhs: tensor<1000x4xi8>, %rhs: tensor<512x4xi8>) -> tensor<1000x512xi32> { + %init_acc = tensor.empty() : tensor<1000x512xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<1000x512xi32>) -> tensor<1000x512xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x4xi8>, tensor<512x4xi8>) outs(%acc: tensor<1000x512xi32>) -> tensor<1000x512xi32> + return %result: tensor<1000x512xi32> +} + +func.func @matmul_4x1000xi8_times_512x1000xi8_into_4x512xi32(%lhs: tensor<4x1000xi8>, %rhs: tensor<512x1000xi8>) -> tensor<4x512xi32> { + %init_acc = tensor.empty() : tensor<4x512xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<4x512xi32>) -> tensor<4x512xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<4x1000xi8>, tensor<512x1000xi8>) outs(%acc: tensor<4x512xi32>) -> tensor<4x512xi32> + return %result: tensor<4x512xi32> +} + +func.func @matmul_512x1000xi8_times_4x1000xi8_into_512x4xi32(%lhs: tensor<512x1000xi8>, %rhs: tensor<4x1000xi8>) -> tensor<512x4xi32> { + %init_acc = tensor.empty() : tensor<512x4xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<512x4xi32>) -> tensor<512x4xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x1000xi8>, tensor<4x1000xi8>) outs(%acc: tensor<512x4xi32>) -> tensor<512x4xi32> + return %result: tensor<512x4xi32> +} + +func.func @matmul_512x128xi8_times_500x128xi8_into_512x500xi32(%lhs: tensor<512x128xi8>, %rhs: tensor<500x128xi8>) -> tensor<512x500xi32> { + %init_acc = tensor.empty() : tensor<512x500xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<512x500xi32>) -> tensor<512x500xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x128xi8>, tensor<500x128xi8>) outs(%acc: tensor<512x500xi32>) -> tensor<512x500xi32> + return %result: tensor<512x500xi32> +} + +func.func @matmul_457x330xi8_times_512x330xi8_into_457x512xi32(%lhs: tensor<457x330xi8>, %rhs: tensor<512x330xi8>) -> tensor<457x512xi32> { + %init_acc = tensor.empty() : tensor<457x512xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<457x512xi32>) -> tensor<457x512xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x330xi8>, tensor<512x330xi8>) outs(%acc: tensor<457x512xi32>) -> tensor<457x512xi32> + return %result: tensor<457x512xi32> +} + +func.func @matmul_457x330xi8_times_514x330xi8_into_457x514xi32(%lhs: tensor<457x330xi8>, %rhs: tensor<514x330xi8>) -> tensor<457x514xi32> { + %init_acc = tensor.empty() : tensor<457x514xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<457x514xi32>) -> tensor<457x514xi32> + %result = 
linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x330xi8>, tensor<514x330xi8>) outs(%acc: tensor<457x514xi32>) -> tensor<457x514xi32> + return %result: tensor<457x514xi32> +} + +func.func @matmul_438x330xi8_times_514x330xi8_into_438x514xi32(%lhs: tensor<438x330xi8>, %rhs: tensor<514x330xi8>) -> tensor<438x514xi32> { + %init_acc = tensor.empty() : tensor<438x514xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<438x514xi32>) -> tensor<438x514xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<438x330xi8>, tensor<514x330xi8>) outs(%acc: tensor<438x514xi32>) -> tensor<438x514xi32> + return %result: tensor<438x514xi32> +} + +func.func @matmul_540x332xi8_times_516x332xi8_into_540x516xi32(%lhs: tensor<540x332xi8>, %rhs: tensor<516x332xi8>) -> tensor<540x516xi32> { + %init_acc = tensor.empty() : tensor<540x516xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<540x516xi32>) -> tensor<540x516xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<540x332xi8>, tensor<516x332xi8>) outs(%acc: tensor<540x516xi32>) -> tensor<540x516xi32> + return %result: tensor<540x516xi32> +} + +func.func @matmul_654x321xi8_times_234x321xi8_into_654x234xi32(%lhs: tensor<654x321xi8>, %rhs: tensor<234x321xi8>) -> tensor<654x234xi32> { + %init_acc = tensor.empty() : tensor<654x234xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<654x234xi32>) -> tensor<654x234xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<654x321xi8>, tensor<234x321xi8>) outs(%acc: tensor<654x234xi32>) -> tensor<654x234xi32> + return %result: tensor<654x234xi32> +} + +func.func @matmul_457x160xi8_times_512x160xi8_into_457x512xi32(%lhs: tensor<457x160xi8>, %rhs: tensor<512x160xi8>) -> tensor<457x512xi32> { + %init_acc = tensor.empty() : tensor<457x512xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<457x512xi32>) -> tensor<457x512xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<457x160xi8>, tensor<512x160xi8>) outs(%acc: tensor<457x512xi32>) -> tensor<457x512xi32> + return %result: tensor<457x512xi32> +} + +func.func @matmul_512x330xi8_times_512x330xi8_into_512x512xi32(%lhs: tensor<512x330xi8>, %rhs: tensor<512x330xi8>) -> tensor<512x512xi32> { + %init_acc = tensor.empty() : tensor<512x512xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<512x512xi32>) -> tensor<512x512xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<512x330xi8>, tensor<512x330xi8>) outs(%acc: tensor<512x512xi32>) -> tensor<512x512xi32> + return %result: tensor<512x512xi32> +} + +func.func @matmul_accumulate_1x1000xi8_times_1000x1000xi8_into_1x1000xi32(%lhs: tensor<1x1000xi8>, %rhs: tensor<1000x1000xi8>, %acc: tensor<1x1000xi32>) -> tensor<1x1000xi32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1000xi8>, tensor<1000x1000xi8>) outs(%acc: tensor<1x1000xi32>) -> tensor<1x1000xi32> + return %result: tensor<1x1000xi32> +} + +func.func @matmul_accumulate_1000x1000xi8_times_1x1000xi8_into_1000x1xi32(%lhs: tensor<1000x1000xi8>, %rhs: tensor<1x1000xi8>, %acc: tensor<1000x1xi32>) -> tensor<1000x1xi32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x1000xi8>, tensor<1x1000xi8>) outs(%acc: tensor<1000x1xi32>) -> tensor<1000x1xi32> + return %result: 
tensor<1000x1xi32> +} + +func.func @matmul_1000x1000xi8_times_1x1000xi8_into_1000x1xi32(%lhs: tensor<1000x1000xi8>, %rhs: tensor<1x1000xi8>) -> tensor<1000x1xi32> { + %init_acc = tensor.empty() : tensor<1000x1xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<1000x1xi32>) -> tensor<1000x1xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1000x1000xi8>, tensor<1x1000xi8>) outs(%acc: tensor<1000x1xi32>) -> tensor<1000x1xi32> + return %result: tensor<1000x1xi32> +} + diff --git a/linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_large_calls.mlir b/linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_large_calls.mlir new file mode 100644 index 0000000..20ae545 --- /dev/null +++ b/linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_large_calls.mlir @@ -0,0 +1,882 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_512x128xi8_times_512x128xi8_into_512x512xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xi8_times_512x128xi8_into_512x512xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x4xi8_times_512x4xi8_into_1000x512xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_4x1000xi8_times_512x1000xi8_into_4x512xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x1000xi8_times_4x1000xi8_into_512x4xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x128xi8_times_500x128xi8_into_512x500xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xi8_times_512x330xi8_into_457x512xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x330xi8_times_514x330xi8_into_457x514xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_438x330xi8_times_514x330xi8_into_438x514xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_540x332xi8_times_516x332xi8_into_540x516xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_654x321xi8_times_234x321xi8_into_654x234xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_457x160xi8_times_512x160xi8_into_457x512xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_512x330xi8_times_512x330xi8_into_512x512xi32(%lhs: 
!hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1000xi8_times_1000x1000xi8_into_1x1000xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1000x1000xi8_times_1x1000xi8_into_1000x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1000x1000xi8_times_1x1000xi8_into_1000x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_512_128_512_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_512x128xi8_times_512x128xi8_into_512x512xi32_512_128_512_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = 
call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 512 : i64 + %acc_dim1 = arith.constant 512 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 512 : i64 + %acc_copy_dim1 = arith.constant 512 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_512x128xi8_times_512x128xi8_into_512x512xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_512_128_512_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xi8_times_512x128xi8_into_512x512xi32_512_128_512_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = 
arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xi8_times_512x128xi8_into_512x512xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1000_4_512_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x4xi8_times_512x4xi8_into_1000x512xi32_1000_4_512_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x4x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 14 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 15 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x4xi8_times_512x4xi8_into_1000x512xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call 
@matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_4_1000_512_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 16 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 17 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_4x1000xi8_times_512x1000xi8_into_4x512xi32_4_1000_512_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x1000x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_4x1000xi8_times_512x1000xi8_into_4x512xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_512_1000_4_8() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = 
hal.element_type : i32 + %lhs_seed = arith.constant 20 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 21 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x1000xi8_times_4x1000xi8_into_512x4xi32_512_1000_4_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x1000x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 22 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 23 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x1000xi8_times_4x1000xi8_into_512x4xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_512_128_500_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + 
%result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_512x128xi8_times_500x128xi8_into_512x500xi32_512_128_500_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x128x500"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 128 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 26 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 500 : i64 + %rhs_dim1 = arith.constant 128 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 27 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x128xi8_times_500x128xi8_into_512x500xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 128 : i64 + %n = arith.constant 500 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_457_330_512_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 28 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 29 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@matmul_457x330xi8_times_512x330xi8_into_457x512xi32_457_330_512_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xi8_times_512x330xi8_into_457x512xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_457_330_514_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 32 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 33 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x330xi8_times_514x330xi8_into_457x514xi32_457_330_514_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 34 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + 
%rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 35 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x330xi8_times_514x330xi8_into_457x514xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_438_330_514_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_438x330xi8_times_514x330xi8_into_438x514xi32_438_330_514_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 438x330x514"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 438 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 38 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 514 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 39 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_438x330xi8_times_514x330xi8_into_438x514xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 438 : i64 + %k = arith.constant 330 : i64 + %n = 
arith.constant 514 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_540_332_516_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 40 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 41 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_540x332xi8_times_516x332xi8_into_540x516xi32_540_332_516_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 540x332x516"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 540 : i64 + %lhs_dim1 = arith.constant 332 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 516 : i64 + %rhs_dim1 = arith.constant 332 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_540x332xi8_times_516x332xi8_into_540x516xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 540 : i64 + %k = arith.constant 332 : i64 + %n = arith.constant 516 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_654_321_234_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = 
arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_654x321xi8_times_234x321xi8_into_654x234xi32_654_321_234_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 654x321x234"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 654 : i64 + %lhs_dim1 = arith.constant 321 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 234 : i64 + %rhs_dim1 = arith.constant 321 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_654x321xi8_times_234x321xi8_into_654x234xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 654 : i64 + %k = arith.constant 321 : i64 + %n = arith.constant 234 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_457_160_512_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 48 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 49 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : 
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_457x160xi8_times_512x160xi8_into_457x512xi32_457_160_512_23() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 457x160x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 457 : i64 + %lhs_dim1 = arith.constant 160 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 50 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 160 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 51 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_457x160xi8_times_512x160xi8_into_457x512xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 457 : i64 + %k = arith.constant 160 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_512_330_512_24() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, 
!hal.buffer_view) -> () + return +} + +func.func @matmul_512x330xi8_times_512x330xi8_into_512x512xi32_512_330_512_25() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 512x330x512"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 512 : i64 + %lhs_dim1 = arith.constant 330 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 54 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 512 : i64 + %rhs_dim1 = arith.constant 330 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 55 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_512x330xi8_times_512x330xi8_into_512x512xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 512 : i64 + %k = arith.constant 330 : i64 + %n = arith.constant 512 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1_1000_1000_acc_26() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 56 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 57 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 58 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 58 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, 
%transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1000xi8_times_1000x1000xi8_into_1x1000xi32_1_1000_1000_acc_27() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1000x1000"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 59 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1000 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 60 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1000 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 61 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1000 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 61 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1000xi8_times_1000x1000xi8_into_1x1000xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1000 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1000_1000_1_acc_28() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 64 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, 
%acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 64 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1000x1000xi8_times_1x1000xi8_into_1000x1xi32_1000_1000_1_acc_29() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 65 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 66 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1000 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 67 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1000 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 67 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1000x1000xi8_times_1x1000xi8_into_1000x1xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1000_1000_1_30() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = 
arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 68 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 69 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1000x1000xi8_times_1x1000xi8_into_1000x1xi32_1000_1000_1_31() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1000x1000x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1000 : i64 + %lhs_dim1 = arith.constant 1000 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 70 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1000 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 71 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1000x1000xi8_times_1x1000xi8_into_1000x1xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1000 : i64 + %k = arith.constant 1000 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_small.mlir b/linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_small.mlir new file mode 100644 index 0000000..2a0da4f --- /dev/null +++ b/linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_small.mlir @@ -0,0 +1,99 @@ +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: tensor<?x?xi8>, %rhs: tensor<?x?xi8>, %acc: tensor<?x?xi32>) -> tensor<?x?xi32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xi8>, tensor<?x?xi8>) outs(%acc: tensor<?x?xi32>) -> tensor<?x?xi32> + return %result: tensor<?x?xi32> +} + +func.func @matmul_accumulate_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs: tensor<1x1xi8>, %rhs: tensor<1x1xi8>, %acc: tensor<1x1xi32>) -> tensor<1x1xi32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1xi8>, tensor<1x1xi8>) outs(%acc: tensor<1x1xi32>) -> tensor<1x1xi32> + return %result:
tensor<1x1xi32> +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: tensor<?x?xi8>, %rhs: tensor<?x?xi8>) -> tensor<?x?xi32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %acc_dim0 = tensor.dim %lhs, %c0 : tensor<?x?xi8> + %acc_dim1 = tensor.dim %rhs, %c1 : tensor<?x?xi8> + %init_acc = tensor.empty(%acc_dim0, %acc_dim1) : tensor<?x?xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<?x?xi32>) -> tensor<?x?xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<?x?xi8>, tensor<?x?xi8>) outs(%acc: tensor<?x?xi32>) -> tensor<?x?xi32> + return %result: tensor<?x?xi32> +} + +func.func @matmul_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs: tensor<1x1xi8>, %rhs: tensor<1x1xi8>) -> tensor<1x1xi32> { + %init_acc = tensor.empty() : tensor<1x1xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<1x1xi32>) -> tensor<1x1xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x1xi8>, tensor<1x1xi8>) outs(%acc: tensor<1x1xi32>) -> tensor<1x1xi32> + return %result: tensor<1x1xi32> +} + +func.func @matmul_accumulate_2x2xi8_times_2x2xi8_into_2x2xi32(%lhs: tensor<2x2xi8>, %rhs: tensor<2x2xi8>, %acc: tensor<2x2xi32>) -> tensor<2x2xi32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<2x2xi8>, tensor<2x2xi8>) outs(%acc: tensor<2x2xi32>) -> tensor<2x2xi32> + return %result: tensor<2x2xi32> +} + +func.func @matmul_accumulate_4x4xi8_times_4x4xi8_into_4x4xi32(%lhs: tensor<4x4xi8>, %rhs: tensor<4x4xi8>, %acc: tensor<4x4xi32>) -> tensor<4x4xi32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<4x4xi8>, tensor<4x4xi8>) outs(%acc: tensor<4x4xi32>) -> tensor<4x4xi32> + return %result: tensor<4x4xi32> +} + +func.func @matmul_accumulate_8x8xi8_times_8x8xi8_into_8x8xi32(%lhs: tensor<8x8xi8>, %rhs: tensor<8x8xi8>, %acc: tensor<8x8xi32>) -> tensor<8x8xi32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<8x8xi8>, tensor<8x8xi8>) outs(%acc: tensor<8x8xi32>) -> tensor<8x8xi32> + return %result: tensor<8x8xi32> +} + +func.func @matmul_accumulate_9x9xi8_times_9x9xi8_into_9x9xi32(%lhs: tensor<9x9xi8>, %rhs: tensor<9x9xi8>, %acc: tensor<9x9xi32>) -> tensor<9x9xi32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<9x9xi8>, tensor<9x9xi8>) outs(%acc: tensor<9x9xi32>) -> tensor<9x9xi32> + return %result: tensor<9x9xi32> +} + +func.func @matmul_accumulate_6x13xi8_times_3x13xi8_into_6x3xi32(%lhs: tensor<6x13xi8>, %rhs: tensor<3x13xi8>, %acc: tensor<6x3xi32>) -> tensor<6x3xi32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<6x13xi8>, tensor<3x13xi8>) outs(%acc: tensor<6x3xi32>) -> tensor<6x3xi32> + return %result: tensor<6x3xi32> +} + +func.func @matmul_15x37xi8_times_7x37xi8_into_15x7xi32(%lhs: tensor<15x37xi8>, %rhs: tensor<7x37xi8>) -> tensor<15x7xi32> { + %init_acc = tensor.empty() : tensor<15x7xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<15x7xi32>) -> tensor<15x7xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<15x37xi8>, tensor<7x37xi8>) outs(%acc: tensor<15x7xi32>) -> tensor<15x7xi32> + return %result: tensor<15x7xi32> +} + +func.func @matmul_accumulate_81x19xi8_times_41x19xi8_into_81x41xi32(%lhs: tensor<81x19xi8>, %rhs: tensor<41x19xi8>, %acc: tensor<81x41xi32>) -> tensor<81x41xi32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<81x19xi8>, tensor<41x19xi8>) outs(%acc: tensor<81x41xi32>) -> tensor<81x41xi32> + return %result: tensor<81x41xi32> +} + +func.func
@matmul_accumulate_1x10xi8_times_10x10xi8_into_1x10xi32(%lhs: tensor<1x10xi8>, %rhs: tensor<10x10xi8>, %acc: tensor<1x10xi32>) -> tensor<1x10xi32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x10xi8>, tensor<10x10xi8>) outs(%acc: tensor<1x10xi32>) -> tensor<1x10xi32> + return %result: tensor<1x10xi32> +} + +func.func @matmul_1x10xi8_times_10x10xi8_into_1x10xi32(%lhs: tensor<1x10xi8>, %rhs: tensor<10x10xi8>) -> tensor<1x10xi32> { + %init_acc = tensor.empty() : tensor<1x10xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<1x10xi32>) -> tensor<1x10xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<1x10xi8>, tensor<10x10xi8>) outs(%acc: tensor<1x10xi32>) -> tensor<1x10xi32> + return %result: tensor<1x10xi32> +} + +func.func @matmul_accumulate_10x1xi8_times_10x1xi8_into_10x10xi32(%lhs: tensor<10x1xi8>, %rhs: tensor<10x1xi8>, %acc: tensor<10x10xi32>) -> tensor<10x10xi32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x1xi8>, tensor<10x1xi8>) outs(%acc: tensor<10x10xi32>) -> tensor<10x10xi32> + return %result: tensor<10x10xi32> +} + +func.func @matmul_accumulate_10x10xi8_times_1x10xi8_into_10x1xi32(%lhs: tensor<10x10xi8>, %rhs: tensor<1x10xi8>, %acc: tensor<10x1xi32>) -> tensor<10x1xi32> { + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x10xi8>, tensor<1x10xi8>) outs(%acc: tensor<10x1xi32>) -> tensor<10x1xi32> + return %result: tensor<10x1xi32> +} + +func.func @matmul_10x10xi8_times_1x10xi8_into_10x1xi32(%lhs: tensor<10x10xi8>, %rhs: tensor<1x10xi8>) -> tensor<10x1xi32> { + %init_acc = tensor.empty() : tensor<10x1xi32> + %c0_acc_type = arith.constant 0: i32 + %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<10x1xi32>) -> tensor<10x1xi32> + %result = linalg.matmul_transpose_b ins(%lhs, %rhs: tensor<10x10xi8>, tensor<1x10xi8>) outs(%acc: tensor<10x1xi32>) -> tensor<10x1xi32> + return %result: tensor<10x1xi32> +} + diff --git a/linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_small_calls.mlir b/linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_small_calls.mlir new file mode 100644 index 0000000..e3407bd --- /dev/null +++ b/linalg_ops/matmul/generated/i8_into_i32/matmul_transpose_b_i8_into_i32_small_calls.mlir @@ -0,0 +1,906 @@ +builtin.module @calls attributes { + +} { + +func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_2x2xi8_times_2x2xi8_into_2x2xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: 
!hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_4x4xi8_times_4x4xi8_into_4x4xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_8x8xi8_times_8x8xi8_into_8x8xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_9x9xi8_times_9x9xi8_into_9x9xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_6x13xi8_times_3x13xi8_into_6x3xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_15x37xi8_times_7x37xi8_into_15x7xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_81x19xi8_times_41x19xi8_into_81x41xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_1x10xi8_times_10x10xi8_into_1x10xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_1x10xi8_times_10x10xi8_into_1x10xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x1xi8_times_10x1xi8_into_10x10xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_accumulate_10x10xi8_times_1x10xi8_into_10x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.matmul_10x10xi8_times_1x10xi8_into_10x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1_1_1_acc_0() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 2 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 3 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, 
%acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x1xi8_times_1x1xi8_into_1x1xi32_1_1_1_acc_1() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 5 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 6 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1_1_1_2() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 8 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 9 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call 
@module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x1xi8_times_1x1xi8_into_1x1xi32_1_1_1_3() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 10 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 1 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 11 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 1 : i64 + %n = arith.constant 1 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_2_2_2_acc_4() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 12 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 13 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 14 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 14 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = 
call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_2x2xi8_times_2x2xi8_into_2x2xi32_2_2_2_acc_5() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 2x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 2 : i64 + %lhs_dim1 = arith.constant 2 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 15 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 2 : i64 + %rhs_dim1 = arith.constant 2 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 16 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 17 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 17 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_2x2xi8_times_2x2xi8_into_2x2xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 2 : i64 + %k = arith.constant 2 : i64 + %n = arith.constant 2 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_4_4_4_acc_6() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 18 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 19 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : 
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 20 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 20 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_4x4xi8_times_4x4xi8_into_4x4xi32_4_4_4_acc_7() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 4x4x4"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 4 : i64 + %lhs_dim1 = arith.constant 4 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 21 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 4 : i64 + %rhs_dim1 = arith.constant 4 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 22 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 4 : i64 + %acc_dim1 = arith.constant 4 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 23 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 4 : i64 + %acc_copy_dim1 = arith.constant 4 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 23 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_4x4xi8_times_4x4xi8_into_4x4xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 4 : i64 + %k = arith.constant 4 : i64 + %n = arith.constant 4 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_8_8_8_acc_8() attributes { + 
iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 24 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 25 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 26 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 26 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_8x8xi8_times_8x8xi8_into_8x8xi32_8_8_8_acc_9() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 8x8x8"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 8 : i64 + %lhs_dim1 = arith.constant 8 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 27 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 8 : i64 + %rhs_dim1 = arith.constant 8 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 28 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 8 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 29 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 8 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 29 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, 
%acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_8x8xi8_times_8x8xi8_into_8x8xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 8 : i64 + %k = arith.constant 8 : i64 + %n = arith.constant 8 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_9_9_9_acc_10() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 30 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 31 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 32 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 32 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_9x9xi8_times_9x9xi8_into_9x9xi32_9_9_9_acc_11() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 9x9x9"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 9 : i64 + %lhs_dim1 = arith.constant 9 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 33 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 9 : i64 + %rhs_dim1 = arith.constant 9 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 
34 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 9 : i64 + %acc_dim1 = arith.constant 9 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 35 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 9 : i64 + %acc_copy_dim1 = arith.constant 9 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 35 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_9x9xi8_times_9x9xi8_into_9x9xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 9 : i64 + %k = arith.constant 9 : i64 + %n = arith.constant 9 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_6_13_3_acc_12() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 36 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 37 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 38 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 38 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, 
!hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_6x13xi8_times_3x13xi8_into_6x3xi32_6_13_3_acc_13() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 6x13x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 6 : i64 + %lhs_dim1 = arith.constant 13 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 39 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 3 : i64 + %rhs_dim1 = arith.constant 13 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 40 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 6 : i64 + %acc_dim1 = arith.constant 3 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 41 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 6 : i64 + %acc_copy_dim1 = arith.constant 3 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 41 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_6x13xi8_times_3x13xi8_into_6x3xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 6 : i64 + %k = arith.constant 13 : i64 + %n = arith.constant 3 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_15_37_7_14() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 42 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 43 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, 
i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_15x37xi8_times_7x37xi8_into_15x7xi32_15_37_7_15() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 15x37x7"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 15 : i64 + %lhs_dim1 = arith.constant 37 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 44 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 7 : i64 + %rhs_dim1 = arith.constant 37 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 45 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_15x37xi8_times_7x37xi8_into_15x7xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 15 : i64 + %k = arith.constant 37 : i64 + %n = arith.constant 7 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_81_19_41_acc_16() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 46 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 47 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 48 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 48 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, 
%m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_81x19xi8_times_41x19xi8_into_81x41xi32_81_19_41_acc_17() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 81x19x41"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 81 : i64 + %lhs_dim1 = arith.constant 19 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 49 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 41 : i64 + %rhs_dim1 = arith.constant 19 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 50 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 81 : i64 + %acc_dim1 = arith.constant 41 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 51 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 81 : i64 + %acc_copy_dim1 = arith.constant 41 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 51 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_81x19xi8_times_41x19xi8_into_81x41xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 81 : i64 + %k = arith.constant 19 : i64 + %n = arith.constant 41 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1_10_10_acc_18() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 52 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 53 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 54 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : 
(!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 54 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_1x10xi8_times_10x10xi8_into_1x10xi32_1_10_10_acc_19() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 55 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 56 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 57 : i32 + %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 10 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 57 : i32 + %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.matmul_accumulate_1x10xi8_times_10x10xi8_into_1x10xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_1_10_10_20() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 58 
: i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 59 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_1x10xi8_times_10x10xi8_into_1x10xi32_1_10_10_21() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 1x10x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 1 : i64 + %lhs_dim1 = arith.constant 10 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 60 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 10 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 61 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc = util.null : !hal.buffer_view + %result = call @module.matmul_1x10xi8_times_10x10xi8_into_1x10xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %m = arith.constant 1 : i64 + %k = arith.constant 10 : i64 + %n = arith.constant 10 : i64 + %transpose_rhs = arith.constant 1 : i32 + call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_10_1_10_acc_22() attributes { + iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %lhs_dim0 = arith.constant 10 : i64 + %lhs_dim1 = arith.constant 1 : i64 + %lhs_element_type = hal.element_type : i32 + %lhs_seed = arith.constant 62 : i32 + %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %rhs_dim0 = arith.constant 10 : i64 + %rhs_dim1 = arith.constant 1 : i64 + %rhs_element_type = hal.element_type : i32 + %rhs_seed = arith.constant 63 : i32 + %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 10 : i64 + %acc_dim1 = arith.constant 10 : i64 + %acc_element_type = hal.element_type : i32 + 
%acc_seed = arith.constant 64 : i32
+  %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 10 : i64
+  %acc_copy_dim1 = arith.constant 10 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 64 : i32
+  %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 10 : i64
+  %k = arith.constant 1 : i64
+  %n = arith.constant 10 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @matmul_accumulate_10x1xi8_times_10x1xi8_into_10x10xi32_10_1_10_acc_23() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 10x1x10"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 10 : i64
+  %lhs_dim1 = arith.constant 1 : i64
+  %lhs_element_type = hal.element_type<i8> : i32
+  %lhs_seed = arith.constant 65 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 10 : i64
+  %rhs_dim1 = arith.constant 1 : i64
+  %rhs_element_type = hal.element_type<i8> : i32
+  %rhs_seed = arith.constant 66 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 10 : i64
+  %acc_dim1 = arith.constant 10 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 67 : i32
+  %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 10 : i64
+  %acc_copy_dim1 = arith.constant 10 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 67 : i32
+  %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.matmul_accumulate_10x1xi8_times_10x1xi8_into_10x10xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 10 : i64
+  %k = arith.constant 1 : i64
+  %n = arith.constant 10 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_10_10_1_acc_24() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 10 : i64
+  %lhs_dim1 = arith.constant 10 : i64
+  %lhs_element_type = hal.element_type<i8> : i32
+  %lhs_seed = arith.constant 68 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 10 : i64
+  %rhs_element_type = hal.element_type<i8> : i32
+  %rhs_seed = arith.constant 69 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 10 : i64
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 70 : i32
+  %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 10 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 70 : i32
+  %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 10 : i64
+  %k = arith.constant 10 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @matmul_accumulate_10x10xi8_times_1x10xi8_into_10x1xi32_10_10_1_acc_25() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 10 : i64
+  %lhs_dim1 = arith.constant 10 : i64
+  %lhs_element_type = hal.element_type<i8> : i32
+  %lhs_seed = arith.constant 71 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 10 : i64
+  %rhs_element_type = hal.element_type<i8> : i32
+  %rhs_seed = arith.constant 72 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 10 : i64
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 73 : i32
+  %acc = call @matmul_test.generate_random_matrix(%device, %acc_dim0, %acc_dim1, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 10 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 73 : i32
+  %acc_copy = call @matmul_test.generate_random_matrix(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.matmul_accumulate_10x10xi8_times_1x10xi8_into_10x1xi32(%lhs, %rhs, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 10 : i64
+  %k = arith.constant 10 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32_10_10_1_26() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 10 : i64
+  %lhs_dim1 = arith.constant 10 : i64
+  %lhs_element_type = hal.element_type<i8> : i32
+  %lhs_seed = arith.constant 74 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 10 : i64
+  %rhs_element_type = hal.element_type<i8> : i32
+  %rhs_seed = arith.constant 75 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 10 : i64
+  %k = arith.constant 10 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @matmul_10x10xi8_times_1x10xi8_into_10x1xi32_10_10_1_27() attributes {
+  iree.reflection = {description = "Matmul shape (MxKxN): 10x10x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %lhs_dim0 = arith.constant 10 : i64
+  %lhs_dim1 = arith.constant 10 : i64
+  %lhs_element_type = hal.element_type<i8> : i32
+  %lhs_seed = arith.constant 76 : i32
+  %lhs = call @matmul_test.generate_random_matrix(%device, %lhs_dim0, %lhs_dim1, %lhs_element_type, %lhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %rhs_dim0 = arith.constant 1 : i64
+  %rhs_dim1 = arith.constant 10 : i64
+  %rhs_element_type = hal.element_type<i8> : i32
+  %rhs_seed = arith.constant 77 : i32
+  %rhs = call @matmul_test.generate_random_matrix(%device, %rhs_dim0, %rhs_dim1, %rhs_element_type, %rhs_seed) : (!hal.device, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc = util.null : !hal.buffer_view
+  %result = call @module.matmul_10x10xi8_times_1x10xi8_into_10x1xi32(%lhs, %rhs) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %m = arith.constant 10 : i64
+  %k = arith.constant 10 : i64
+  %n = arith.constant 1 : i64
+  %transpose_rhs = arith.constant 1 : i32
+  call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, %lhs, %rhs, %acc, %result) : (!hal.device, i64, i64, i64, i32, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}