Support for sdxl pipeline (benchmarking) (#155)
This commit adds V0 support for benchmarking all SDXL submodels. It is
not a long-term solution for benchmarking, as Scott and I discussed in
#152; it is the result of a request from our team to get SDXL
benchmarking in ASAP. Because this is high priority for the SDXL testing
while our team lands patches in IREE, this change simply gets the
implementation in and working. Scott and I discussed more thorough,
better-structured ways to add benchmarking, which either of us may
implement in the future.

Also, this PR depends on #152 landing first (hence the CI failure).

Notes for the future, in case we decide we need a stronger implementation:

1. Maybe something like iree-org/iree#16965, which
will feed into https://perf.iree.dev/.
2. These are the benchmarking frameworks we already have:
https://github.com/openxla/iree-comparative-benchmark and
https://github.com/openxla/iree/tree/main/build_tools/benchmarks
3. Some questions Scott raised to keep in mind for a future
implementation:
* What metrics/artifacts do we want from benchmarking?
  * Each model in isolation (see the sketch after these notes)? Full pipeline latency? Just dispatch time?
* What do we want done with benchmark results / artifacts?
  * The in-tree benchmarks in IREE submit results to a dashboard (which should use a queryable database...), upload Tracy files to cloud storage, and comment on pending pull requests with result summaries.
* Where do we want benchmarks to run?
  * Right after tests, on presubmit to IREE?
  * In a separate job, on separate runners?
If we decide benchmarking needs changes, we will address all of these
questions and come up with a more structured, methodical implementation
that either creates a new benchmarking flow here or plugs into the IREE
benchmarking setup.
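
As a rough illustration of the "each model in isolation" option above, a
single submodel could be benchmarked on its own with the same
iree-benchmark-module flags the script below uses for the full pipeline.
This is only a sketch: the function name and input shape come from the
compiled_vae entry point in the pipeline MLIR, and the exact exported
name in the .vmfb may differ.

iree-benchmark-module \
  --device=local-task \
  --module=sdxl-vae-decode-tank/model_cpu_llvm_task_real_weights.vmfb \
  --parameters=model=sdxl-vae-decode-tank/real_weights.irpa \
  --function=main \
  --input=1x4x128x128xf16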
saienduri authored Apr 8, 2024
1 parent 9256979 commit 6bf5cc7
Showing 3 changed files with 68 additions and 1 deletion.
44 changes: 44 additions & 0 deletions iree_tests/benchmarks/benchmark_sdxl_cpu.sh
@@ -0,0 +1,44 @@
#!/bin/bash
#
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

set -xeuo pipefail

THIS_DIR="$(cd "$(dirname "$0")" && pwd)"
IREE_ROOT="$(cd "${THIS_DIR?}/.." && pwd)"
VAE_DECODE_DIR="${IREE_ROOT?}/pytorch/models/sdxl-vae-decode-tank"
SCHEDULED_UNET_DIR="${IREE_ROOT?}/pytorch/models/sdxl-scheduled-unet-3-tank"
PROMPT_ENCODER_DIR="${IREE_ROOT?}/pytorch/models/sdxl-prompt-encoder-tank"

echo "Echo compiling full sdxl pipeline"

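# The pipeline MLIR declares the submodel entry points (prompt encoder,
# scheduled UNet, VAE) as private functions without bodies, so compiling it
# produces a module whose calls are resolved against the submodel modules
# at runtime.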
iree-compile "${THIS_DIR?}/sdxl_pipeline_bench_f16.mlir" \
--iree-hal-target-backends=llvm-cpu \
--iree-llvmcpu-target-cpu-features=host \
--iree-llvmcpu-distribution-size=32 \
-o "${THIS_DIR?}/sdxl_full_pipeline_fp16_.vmfb"

echo "Running sdxl benchmark"

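# Each --module flag loads one compiled submodel (with its weights supplied
# via --parameters); the pipeline module listed last resolves its private
# function declarations against the modules loaded before it. --function
# and --input select the pipeline entry point and its six arguments.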
iree-benchmark-module \
--device=local-task \
--module="${PROMPT_ENCODER_DIR?}/model_cpu_llvm_task_real_weights.vmfb" \
--parameters=model="${PROMPT_ENCODER_DIR?}/real_weights.irpa" \
--module="${SCHEDULED_UNET_DIR?}/model_cpu_llvm_task_real_weights.vmfb" \
--parameters=model="${SCHEDULED_UNET_DIR?}/real_weights.irpa" \
--module="${VAE_DECODE_DIR?}/model_cpu_llvm_task_real_weights.vmfb" \
--parameters=model="${VAE_DECODE_DIR?}/real_weights.irpa" \
--module="${THIS_DIR}/sdxl_full_pipeline_fp16_.vmfb" \
--function=tokens_to_image \
--input=1x4x128x128xf16 \
--input=1xf16 \
--input=1x64xi64 \
--input=1x64xi64 \
--input=1x64xi64 \
--input=1x64xi64

echo "Succesfully finished sdxl pipeline benchmark"
23 changes: 23 additions & 0 deletions iree_tests/benchmarks/sdxl_pipeline_bench_f16.mlir
@@ -0,0 +1,23 @@
module @sdxl_compiled_pipeline {
func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor<i64>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"}
func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"}
func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"}
func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"}

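// tokens_to_image stitches the submodels into one pipeline: encode the
// prompts with CLIP, initialize the scheduled UNet, run the denoising
// loop for the step count returned by run_initialize, then decode the
// final latents with the VAE.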
func.func @tokens_to_image(%sample: tensor<1x4x128x128xf16>, %guidance_scale: tensor<1xf16>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf16> {
%p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>)
%noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor<i64>)
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%steps_int = tensor.extract %steps[] : tensor<i64>
%n_steps = arith.index_cast %steps_int : i64 to index
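// Denoising loop: one run_forward call per scheduler step, threading the
// latents through iter_args.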
%res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) {
%step_64 = arith.index_cast %arg0 : index to i64
%this_step = tensor.from_elements %step_64 : tensor<1xi64>
%inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16>
scf.yield %inner : tensor<1x4x128x128xf16>
}
%image = func.call @compiled_vae.main(%res) : (tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16>
return %image : tensor<1x3x1024x1024xf16>
}
}
@@ -1,5 +1,5 @@
 {
-  "config_name": "sdxl_cpu_llvm_task",
+  "config_name": "cpu_llvm_task",
   "iree_compile_flags" : [
     "--iree-hal-target-backends=llvm-cpu",
     "--iree-llvmcpu-target-cpu-features=host",
