Support for sdxl pipeline (benchmarking) (#155)
This commit adds V0 support for benchmarking all SDXL submodels. It is
not a long-term solution for benchmarking, as Scott and I discussed in
#152; it is the result of a request from our team to get SDXL
benchmarking in ASAP. Because this is high priority for the SDXL testing
while our team lands patches in IREE, this change simply gets the
implementation in and working. Scott and I discussed more thorough,
better-structured ways to add benchmarking, which either of us may
implement in the future.

Also, this PR depends on #152 landing first (hence the CI failure).

Notes for the future, in case we decide we need a stronger implementation:

1. Maybe something like iree-org/iree#16965, which
will feed into https://perf.iree.dev/.
2. These are the benchmarking frameworks we already have:
https://github.com/openxla/iree-comparative-benchmark and
https://github.com/openxla/iree/tree/main/build_tools/benchmarks
3. Some questions Scott raised to keep in mind for a future
implementation:
* What metrics/artifacts do we want from benchmarking?
  * Each model in isolation (see the sketch after these notes)? Full pipeline latency? Just dispatch time?
* What do we want done with benchmark results / artifacts?
  * The in-tree benchmarks in IREE submit results to a dashboard (which should use a queryable database...), upload Tracy files to cloud storage, and comment on pending pull requests with result summaries.
* Where do we want benchmarks to run?
  * Right after tests, on presubmit to IREE?
  * In a separate job, on separate runners?
If we decide benchmarking needs changes, we will address all of these
questions and come up with a more structured, methodical implementation
that either creates a new benchmarking flow here or plugs into the IREE
benchmarking setup.
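
As a rough illustration of the "each model in isolation" option above, a
single submodel could be benchmarked on its own with the same
iree-benchmark-module flags the script below uses for the full pipeline.
This is only a sketch: the function name and input shape come from the
compiled_vae entry point in the pipeline MLIR, and the exact exported
name in the .vmfb may differ.

iree-benchmark-module \
  --device=local-task \
  --module=sdxl-vae-decode-tank/model_cpu_llvm_task_real_weights.vmfb \
  --parameters=model=sdxl-vae-decode-tank/real_weights.irpa \
  --function=main \
  --input=1x4x128x128xf16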
saienduri authored Apr 8, 2024
1 parent 9256979 commit 6bf5cc7
Showing 3 changed files with 68 additions and 1 deletion.
44 changes: 44 additions & 0 deletions iree_tests/benchmarks/benchmark_sdxl_cpu.sh
@@ -0,0 +1,44 @@
#!/bin/bash
#
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

set -xeuo pipefail

THIS_DIR="$(cd "$(dirname "$0")" && pwd)"
IREE_ROOT="$(cd "${THIS_DIR?}/.." && pwd)"
VAE_DECODE_DIR="${IREE_ROOT?}/pytorch/models/sdxl-vae-decode-tank"
SCHEDULED_UNET_DIR="${IREE_ROOT?}/pytorch/models/sdxl-scheduled-unet-3-tank"
PROMPT_ENCODER_DIR="${IREE_ROOT?}/pytorch/models/sdxl-prompt-encoder-tank"

echo "Echo compiling full sdxl pipeline"

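# The pipeline MLIR declares the submodel entry points (prompt encoder,
# scheduled UNet, VAE) as private functions without bodies, so compiling it
# produces a module whose calls are resolved against the submodel modules
# at runtime.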
iree-compile "${THIS_DIR?}/sdxl_pipeline_bench_f16.mlir" \
--iree-hal-target-backends=llvm-cpu \
--iree-llvmcpu-target-cpu-features=host \
--iree-llvmcpu-distribution-size=32 \
-o "${THIS_DIR?}/sdxl_full_pipeline_fp16_.vmfb"

echo "Running sdxl benchmark"

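# Each --module flag loads one compiled submodel (with its weights supplied
# via --parameters); the pipeline module listed last resolves its private
# function declarations against the modules loaded before it. --function
# and --input select the pipeline entry point and its six arguments.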
iree-benchmark-module \
--device=local-task \
--module="${PROMPT_ENCODER_DIR?}/model_cpu_llvm_task_real_weights.vmfb" \
--parameters=model="${PROMPT_ENCODER_DIR?}/real_weights.irpa" \
--module="${SCHEDULED_UNET_DIR?}/model_cpu_llvm_task_real_weights.vmfb" \
--parameters=model="${SCHEDULED_UNET_DIR?}/real_weights.irpa" \
--module="${VAE_DECODE_DIR?}/model_cpu_llvm_task_real_weights.vmfb" \
--parameters=model="${VAE_DECODE_DIR?}/real_weights.irpa" \
--module="${THIS_DIR}/sdxl_full_pipeline_fp16_.vmfb" \
--function=tokens_to_image \
--input=1x4x128x128xf16 \
--input=1xf16 \
--input=1x64xi64 \
--input=1x64xi64 \
--input=1x64xi64 \
--input=1x64xi64

echo "Succesfully finished sdxl pipeline benchmark"
23 changes: 23 additions & 0 deletions iree_tests/benchmarks/sdxl_pipeline_bench_f16.mlir
@@ -0,0 +1,23 @@
module @sdxl_compiled_pipeline {
func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor<i64>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"}
func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"}
func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"}
func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"}

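// tokens_to_image stitches the submodels into one pipeline: encode the
// prompts with CLIP, initialize the scheduled UNet, run the denoising
// loop for the step count returned by run_initialize, then decode the
// final latents with the VAE.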
func.func @tokens_to_image(%sample: tensor<1x4x128x128xf16>, %guidance_scale: tensor<1xf16>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf16> {
%p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>)
%noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor<i64>)
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%steps_int = tensor.extract %steps[] : tensor<i64>
%n_steps = arith.index_cast %steps_int : i64 to index
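// Denoising loop: one run_forward call per scheduler step, threading the
// latents through iter_args.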
%res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) {
%step_64 = arith.index_cast %arg0 : index to i64
%this_step = tensor.from_elements %step_64 : tensor<1xi64>
%inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16>
scf.yield %inner : tensor<1x4x128x128xf16>
}
%image = func.call @compiled_vae.main(%res) : (tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16>
return %image : tensor<1x3x1024x1024xf16>
}
}
@@ -1,5 +1,5 @@
 {
-  "config_name": "sdxl_cpu_llvm_task",
+  "config_name": "cpu_llvm_task",
   "iree_compile_flags" : [
     "--iree-hal-target-backends=llvm-cpu",
     "--iree-llvmcpu-target-cpu-features=host",
