gemm fp8 e4m3 #185

Status: Open. Wants to merge 31 commits into base: main. Changes shown from 11 commits.

Commits:
bb89933  gemm fp8 e4m3 (AndreSlavescu, Aug 31, 2024)
60f7ffd  update to benchmark (AndreSlavescu, Aug 31, 2024)
e11c22b  faster fwd performance with tl.multiple_of (AndreSlavescu, Aug 31, 2024)
e68f7f1  add stricter check for compute capability + exception handling (AndreSlavescu, Aug 31, 2024)
fafdfbe  Merge branch 'main' into matmulfp8 (AndreSlavescu, Aug 31, 2024)
91bf3dd  perf improvement (AndreSlavescu, Sep 3, 2024)
9d467bf  remove discrete functional api (AndreSlavescu, Sep 3, 2024)
8b45800  make compute capability check a decorator (AndreSlavescu, Sep 3, 2024)
2319fc7  format (AndreSlavescu, Sep 3, 2024)
7418433  implement backward kernel as well (AndreSlavescu, Sep 3, 2024)
c39a7aa  add more benchmarks + diff utils (AndreSlavescu, Sep 3, 2024)
f8829e5  Merge branch 'main' of https://github.com/AndreSlavescu/Liger-Kernel … (AndreSlavescu, Sep 4, 2024)
032c4d9  update utils to include mma_v3 for H100 (AndreSlavescu, Sep 5, 2024)
464cdd2  Merge branch 'main' into matmulfp8 (lancerts, Sep 6, 2024)
c8dba40  update test. (AndreSlavescu, Sep 7, 2024)
ce2aee5  Merge branch 'matmulfp8' of https://github.com/AndreSlavescu/Liger-Ke… (AndreSlavescu, Sep 7, 2024)
cedf3de  Merge branch 'main' into matmulfp8 (AndreSlavescu, Sep 7, 2024)
bb2f725  format (AndreSlavescu, Sep 7, 2024)
b3195da  Merge branch 'main' into matmulfp8 (lancerts, Sep 8, 2024)
744642b  Merge branch 'main' into matmulfp8 (lancerts, Sep 8, 2024)
7a8043a  Merge branch 'main' into matmulfp8 (AndreSlavescu, Sep 10, 2024)
9e60b0a  compute types (AndreSlavescu, Sep 12, 2024)
98b7abf  modify benchmark to be up to date (AndreSlavescu, Sep 12, 2024)
a709616  format (AndreSlavescu, Sep 12, 2024)
c9cbc3a  Merge branch 'main' into matmulfp8 (lancerts, Sep 12, 2024)
569b4eb  fix mem bounds (AndreSlavescu, Sep 12, 2024)
acc228b  Merge branch 'matmulfp8' of https://github.com/AndreSlavescu/Liger-Ke… (AndreSlavescu, Sep 12, 2024)
d31244a  docstring for fp8 gemm design (AndreSlavescu, Sep 12, 2024)
0f36098  format (AndreSlavescu, Sep 12, 2024)
edc4ebc  remove old benchmark format (AndreSlavescu, Sep 12, 2024)
618c858  Merge branch 'main' into matmulfp8 (AndreSlavescu, Sep 15, 2024)
184 changes: 184 additions & 0 deletions benchmark/benchmark_gemm_split_k_fp8_e4m3.py
@@ -0,0 +1,184 @@
import os

import torch
import triton
from utils import _print_speed_banner, _test_memory, get_current_file_directory

from liger_kernel.ops.experimental.gemm_split_k_fp8_e4m3 import (
    LigerFP8GemmSplitKFunction,
)

# enable TensorFloat32 tensor cores for better performance in benchmark
torch.set_float32_matmul_precision("high")


@triton.testing.perf_report(
    [
        triton.testing.Benchmark(
            x_names=["m", "k", "n"],
            x_vals=[
                (64, 64, 64),
                (256, 256, 256),
                (512, 512, 512),
                (1024, 1024, 1024),
                (64, 128, 64),
                (256, 512, 256),
                (512, 1024, 512),
                (1024, 2048, 1024),
            ],
            xlabel="Matrix Size (m x k x n)",
            line_arg="provider",
            line_vals=["liger", "torch", "torch_compile"],
            line_names=["Liger", "PyTorch", "Torch Compile"],
            styles=[("blue", "solid"), ("orange", "solid"), ("green", "solid")],
            ylabel="time (ms)",
            plot_name="gemm-split-k-fp8-fwd-speed-benchmark",
            args={"mode": "forward", "dtype": torch.float32},
        ),
        triton.testing.Benchmark(
            x_names=["m", "k", "n"],
            x_vals=[
                (64, 64, 64),
                (256, 256, 256),
                (512, 512, 512),
                (1024, 1024, 1024),
                (64, 128, 64),
                (256, 512, 256),
                (512, 1024, 512),
                (1024, 2048, 1024),
            ],
            xlabel="Matrix Size (m x k x n)",
            line_arg="provider",
            line_vals=["liger", "torch", "torch_compile"],
            line_names=["Liger", "PyTorch", "Torch Compile"],
            styles=[("blue", "solid"), ("orange", "solid"), ("green", "solid")],
            ylabel="time (ms)",
            plot_name="gemm-split-k-fp8-full-speed-benchmark",
            args={"mode": "full", "dtype": torch.float32},
        ),
    ]
)
def bench_speed_gemm_split_k_fp8(m, k, n, provider, mode, dtype, device="cuda"):
    a_fp8 = torch.randn((m, k), device=device, dtype=dtype).to(torch.float8_e4m3fn)
    b_fp8 = torch.randn((k, n), device=device, dtype=dtype).to(torch.float8_e4m3fn)

    a_float = a_fp8.float().requires_grad_()
    b_float = b_fp8.float().requires_grad_()

    def fwd_liger():
        return LigerFP8GemmSplitKFunction.apply(a_fp8, b_fp8)

    def fwd_torch():
        return torch.matmul(a_float, b_float)
Review comment from @qingquansong (Collaborator), Sep 4, 2024:

Comparing the speed/memory of the fp8 kernel against torch matmul on fp32 is not quite a fair comparison. A better baseline would be torch._scaled_mm with an fp8 matmul, as in the example here: https://gist.github.com/malfet/7874d96b99670c3da83cbb779ab770c6
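
A minimal sketch of the baseline the reviewer suggests (not part of this PR). It assumes PyTorch >= 2.4, where torch._scaled_mm returns a single tensor (earlier releases return an (out, amax) tuple); the function name and the identity scales are illustrative only.

    def fwd_torch_scaled_mm():
        # per-tensor scales; 1.0 means "no rescaling" here, real code would
        # derive these from the tensors' amax as in the linked gist
        scale_a = torch.tensor(1.0, device=device)
        scale_b = torch.tensor(1.0, device=device)
        # _scaled_mm requires the second operand in column-major layout
        b_col_major = b_fp8.t().contiguous().t()
        return torch._scaled_mm(
            a_fp8,
            b_col_major,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
        )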


    fwd_torch_compiled = torch.compile(fwd_torch)

    if provider == "liger":
        fwd_fn = fwd_liger
    elif provider == "torch":
        fwd_fn = fwd_torch
    else:
        fwd_fn = fwd_torch_compiled

    quantiles = [0.5, 0.2, 0.8]

    if mode == "forward":
        ms, min_ms, max_ms = triton.testing.do_bench(fwd_fn, quantiles=quantiles)
    elif mode == "full":

        def full():
            y = fwd_fn()
            if provider == "liger":
                # compute gradients manually for Liger to avoid:
                # "ufunc_add_CUDA" not implemented for 'Float8_e4m3fn'
                # for y = a @ b: da = dy @ b.T and db = a.T @ dy
                dc = torch.ones_like(y, dtype=torch.float8_e4m3fn)
                LigerFP8GemmSplitKFunction.apply(dc, b_fp8.t())
                LigerFP8GemmSplitKFunction.apply(a_fp8.t(), dc)
            else:
                torch.sum(y).backward()

        ms, min_ms, max_ms = triton.testing.do_bench(full, quantiles=quantiles)

    return ms, min_ms, max_ms


def benchmark_speed_gemm_split_k_fp8_wrapper():
    _print_speed_banner()

    curr_dir = get_current_file_directory()
    dir_name = "gemm_split_k_fp8_speed"
    output_dir = os.path.join(curr_dir, dir_name)
    os.makedirs(output_dir, exist_ok=True)

    bench_speed_gemm_split_k_fp8.run(save_path=output_dir, print_data=True)


@triton.testing.perf_report(
    [
        triton.testing.Benchmark(
            x_names=["m", "k", "n"],
            x_vals=[
                (64, 64, 64),
                (256, 256, 256),
                (512, 512, 512),
                (1024, 1024, 1024),
                (64, 128, 64),
                (256, 512, 256),
                (512, 1024, 512),
                (1024, 2048, 1024),
            ],
            xlabel="Matrix Size (m x k x n)",
            line_arg="provider",
            line_vals=["liger", "torch", "torch_compile"],
            line_names=["Liger", "PyTorch", "Torch Compile"],
            styles=[("blue", "solid"), ("orange", "solid"), ("green", "solid")],
            ylabel="GPU memory usage (MB)",
            plot_name="gemm-split-k-fp8-memory-benchmark",
            args={"dtype": torch.float32},
        )
    ]
)
def bench_memory_gemm_split_k_fp8(m, k, n, provider, dtype, device="cuda"):
    a_fp8 = torch.randn((m, k), device=device, dtype=dtype).to(torch.float8_e4m3fn)
Review comment (Collaborator):

Ditto. Let's create bf16 inputs and compare the speed/memory of torch._scaled_mm vs. the fp8 kernel, and then compare the joint time of quant + dequant + matmul (with an fp8 scale factor). Thanks!

A sketch of this suggested path appears after the end of this file.

    b_fp8 = torch.randn((k, n), device=device, dtype=dtype).to(torch.float8_e4m3fn)

    a_float = a_fp8.float().requires_grad_()
    b_float = b_fp8.float().requires_grad_()

    def full_liger():
        y = LigerFP8GemmSplitKFunction.apply(a_fp8, b_fp8)
        # compute gradients manually for Liger to avoid:
        # "ufunc_add_CUDA" not implemented for 'Float8_e4m3fn'
        dc = torch.ones_like(y, dtype=torch.float8_e4m3fn)
        LigerFP8GemmSplitKFunction.apply(dc, b_fp8.t())
        LigerFP8GemmSplitKFunction.apply(a_fp8.t(), dc)

    def full_torch():
        y = torch.matmul(a_float, b_float)
        torch.sum(y).backward()

    full_torch_compiled = torch.compile(full_torch)

    if provider == "liger":
        full_fn = full_liger
    elif provider == "torch":
        full_fn = full_torch
    else:
        full_fn = full_torch_compiled

    mem = _test_memory(full_fn)
    return mem / 2**20  # convert bytes to MiB


def benchmark_memory_gemm_split_k_fp8_wrapper():
    _print_speed_banner()

    curr_dir = get_current_file_directory()
    dir_name = "gemm_split_k_fp8_memory"
    output_dir = os.path.join(curr_dir, dir_name)
    os.makedirs(output_dir, exist_ok=True)

    bench_memory_gemm_split_k_fp8.run(save_path=output_dir, print_data=True)


if __name__ == "__main__":
    benchmark_speed_gemm_split_k_fp8_wrapper()
    benchmark_memory_gemm_split_k_fp8_wrapper()
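
The second review suggestion above (bf16 inputs, fp8 quantize, scaled matmul, dequant) could look roughly like the sketch below. This is a hypothetical illustration, not code from this PR: the helper name quantize_fp8 is invented here, and the per-tensor amax scaling mirrors the linked gist. It assumes PyTorch >= 2.4 semantics for torch._scaled_mm.

    def quantize_fp8(x):
        # choose a per-tensor scale so the largest |x| maps to the fp8 e4m3 max (448)
        finfo = torch.finfo(torch.float8_e4m3fn)
        scale = finfo.max / x.abs().max().clamp(min=1e-12).float()
        x_fp8 = (x * scale).clamp(min=finfo.min, max=finfo.max).to(torch.float8_e4m3fn)
        # _scaled_mm expects the inverse scale to undo the quantization
        return x_fp8, scale.reciprocal()

    def full_scaled_mm(a_bf16, b_bf16):
        # joint time of quant + matmul + dequant, as the reviewer proposes
        a_fp8, inv_scale_a = quantize_fp8(a_bf16)
        b_fp8, inv_scale_b = quantize_fp8(b_bf16)
        b_col_major = b_fp8.t().contiguous().t()  # column-major as _scaled_mm requires
        return torch._scaled_mm(
            a_fp8,
            b_col_major,
            scale_a=inv_scale_a,
            scale_b=inv_scale_b,
            out_dtype=torch.bfloat16,
        )

triton.testing.do_bench could then time full_scaled_mm the same way the existing providers are timed.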
@@ -0,0 +1,9 @@
m,k,n,Liger,PyTorch,Torch Compile
64.000000,64.000000,64.000000,16.312500,16.380664,16.365039
256.000000,256.000000,256.000000,17.250000,18.325977,18.075977
512.000000,512.000000,512.000000,20.250000,24.550977,23.550977
1024.000000,1024.000000,1024.000000,32.250000,49.450977,45.450977
64.000000,128.000000,64.000000,16.363281,16.479102,16.463477
256.000000,512.000000,256.000000,18.062500,19.900977,19.650977
512.000000,1024.000000,512.000000,23.500000,30.850977,29.850977
1024.000000,2048.000000,1024.000000,45.250000,74.650977,70.650977
3 changes: 3 additions & 0 deletions benchmark/gemm_split_k_fp8_memory/results.html
@@ -0,0 +1,3 @@
<html><body>
<image src="gemm-split-k-fp8-memory-benchmark.png"/>
</body></html>
@@ -0,0 +1,9 @@
m,k,n,Liger,PyTorch,Torch Compile
64.000000,64.000000,64.000000,0.131072,0.150432,0.824320
256.000000,256.000000,256.000000,0.376832,0.429056,0.659328
512.000000,512.000000,512.000000,0.214016,0.405504,0.675872
1024.000000,1024.000000,1024.000000,0.302592,0.425984,0.985088
64.000000,128.000000,64.000000,0.368640,0.342528,0.586736
256.000000,512.000000,256.000000,0.241696,0.360512,0.943616
512.000000,1024.000000,512.000000,0.400384,0.513024,0.991744
1024.000000,2048.000000,1024.000000,0.272384,0.315392,0.983568
Review comment (Collaborator):

Why are there multiple lines for each configuration? Should it be one line per case?

@@ -0,0 +1,9 @@
m,k,n,Liger,PyTorch,Torch Compile
64.000000,64.000000,64.000000,0.011264,0.008192,0.007168
256.000000,256.000000,256.000000,0.013312,0.009216,0.009216
512.000000,512.000000,512.000000,0.016384,0.016384,0.016384
1024.000000,1024.000000,1024.000000,0.035840,0.038912,0.038912
64.000000,128.000000,64.000000,0.011264,0.007168,0.007168
256.000000,512.000000,256.000000,0.016384,0.013280,0.012288
512.000000,1024.000000,512.000000,0.022528,0.018944,0.019360
1024.000000,2048.000000,1024.000000,0.048128,0.062464,0.062464
4 changes: 4 additions & 0 deletions benchmark/gemm_split_k_fp8_speed/results.html
@@ -0,0 +1,4 @@
<html><body>
<image src="gemm-split-k-fp8-fwd-speed-benchmark.png"/>
<image src="gemm-split-k-fp8-full-speed-benchmark.png"/>
</body></html>