Support for sdxl pipeline (testing) #152

Merged · 24 commits · Apr 4, 2024
Changes from 11 commits
33 changes: 22 additions & 11 deletions .github/workflows/test_iree.yml
@@ -22,8 +22,11 @@ concurrency:

 jobs:
   linux_x86_64:
+    strategy:
+      matrix:
+        os: [nodai-amdgpu-w7900-x86-64]
     name: Linux (x86_64)
-    runs-on: ubuntu-latest
+    runs-on: ${{matrix.os}}
     env:
       VENV_DIR: ${{ github.workspace }}/.venv
     steps:
@@ -57,22 +60,30 @@ jobs:
           source ${VENV_DIR}/bin/activate
           python3 -m pip install -r iree_tests/requirements.txt

-      - name: "Running simple tests"
-        run: |
-          source ${VENV_DIR}/bin/activate
-          pytest iree_tests/simple --durations=0
+      # - name: "Running simple tests"
+      #   run: |
+      #     source ${VENV_DIR}/bin/activate
+      #     pytest iree_tests/simple --durations=0

-      - name: "Running the generated ONNX test suite"
-        run: |
-          source ${VENV_DIR}/bin/activate
-          pytest iree_tests/onnx/node/generated -n auto -rpfE --timeout=30 --retries 2 --retry-delay 5 --durations=10
+      # - name: "Running the generated ONNX test suite"
+      #   run: |
+      #     source ${VENV_DIR}/bin/activate
+      #     pytest iree_tests/onnx/node/generated -n auto -rpfE --timeout=30 --retries 2 --retry-delay 5 --durations=10

       # TODO(scotttodd): add a local cache for these large files to a persistent runner
       - name: "Downloading remote files for real weight model tests"
         run: |
           source ${VENV_DIR}/bin/activate
-          python3 iree_tests/download_remote_files.py
+          python3 iree_tests/download_remote_files.py --root-dir pytorch/models/sdxl-scheduled-unet-3-tank
       - name: "Running real weight model tests"
+        env:
+          IREE_TEST_CONFIG_FILES: iree_tests/configs/config_sdxl_cpu_llvm_task.json
         run: |
           source ${VENV_DIR}/bin/activate
-          pytest iree_tests -n auto -k real_weights -rpfE --timeout=600 --retries 2 --retry-delay 5 --durations=0
+          pytest iree_tests/pytorch/models -s -n auto -k real_weights -rpfE --timeout=1200 --retries 2 --retry-delay 5 --durations=0
+      - name: "Running SDXL pipeline benchmark"
+        env:
+          IREE_TEST_CONFIG_FILES: iree_tests/configs/config_sdxl_cpu_llvm_task.json
+        run: |
+          source ${VENV_DIR}/bin/activate
+          pytest iree_tests/benchmarks -s -rpfE --timeout=6000 --durations=0
Binary file added iree_tests/benchmarks/sdxl-benchmark/model.mlirbc
1 change: 1 addition & 0 deletions iree_tests/benchmarks/sdxl-benchmark/test_data_flags.txt
@@ -0,0 +1 @@

16 changes: 16 additions & 0 deletions iree_tests/configs/config_gpu_rocm.json
@@ -0,0 +1,16 @@
{
"config_name": "gpu_rocm",
"iree_compile_flags" : [
"--iree-hal-target-backends=rocm",
"--iree-rocm-target-chip=gfx90a",
"--iree-rocm-link-bc=true",
"--verify=false"
],
"iree_run_module_flags": [
"--device=rocm"
],
"skip_compile_tests": ["llama-tank", "opt-125M", "resnet50", "sd-clip-tank", "sd-unet-tank", "sd-vae-decode-tank"],
"skip_run_tests": ["sdxl-scheduled-unet-tank", "llama-tank", "opt-125M", "resnet50", "sd-clip-tank", "sd-unet-tank", "sd-vae-decode-tank"],
"expected_compile_failures": [],
"expected_run_failures": []
}
17 changes: 17 additions & 0 deletions iree_tests/configs/config_sdxl_cpu_llvm_task.json
@@ -0,0 +1,17 @@
{
"config_name": "sdxl_cpu_llvm_task",
"iree_compile_flags" : [
"--iree-hal-target-backends=llvm-cpu",
"--iree-llvmcpu-target-triple=x86_64-linux-gnu",
"--iree-llvmcpu-target-cpu-features=host",
"--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false",
"--iree-llvmcpu-distribution-size=32"
],
"iree_run_module_flags": [
"--device=local-task"
],
"skip_compile_tests": ["opt-125M", "resnet50", "sdxl-vae-decode-tank", "sdxl-prompt-encoder-tank"],
"skip_run_tests": ["opt-125M", "resnet50", "sdxl-vae-decode-tank", "sdxl-prompt-encoder-tank"],
"expected_compile_failures": [],
"expected_run_failures": []
}
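
For context on how these config files are consumed: conftest.py (diffed below) expands the flag lists into iree-compile and iree-run-module invocations and checks test directory names against the skip/expected lists. A rough sketch of that flow, with illustrative file names rather than the actual conftest.py code:

# Rough sketch (not the actual conftest.py code) of how a config like
# config_sdxl_cpu_llvm_task.json drives test invocations. The file names
# and the substring matching for skip lists are illustrative assumptions.
import json

with open("iree_tests/configs/config_sdxl_cpu_llvm_task.json") as f:
    config = json.load(f)

def skipped(test_name: str, key: str) -> bool:
    # e.g. skipped("sdxl-vae-decode-tank", "skip_run_tests") -> True
    return any(entry in test_name for entry in config[key])

compile_args = [
    "iree-compile", "model.mlir",  # input name is illustrative
    *config["iree_compile_flags"],
    "-o", "model_sdxl_cpu_llvm_task.vmfb",
]
run_args = [
    "iree-run-module",
    *config["iree_run_module_flags"],
    "--module=model_sdxl_cpu_llvm_task.vmfb",
]
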
19 changes: 18 additions & 1 deletion iree_tests/conftest.py
@@ -282,6 +282,9 @@ def __init__(self, spec, **kwargs):

         # TODO(scotttodd): swap cwd for a temp path?
         self.test_cwd = self.spec.test_directory
+        vae_decode_path = os.path.dirname(os.path.dirname(self.test_cwd)) + "/pytorch/models/sdxl-vae-decode-tank"
+        scheduled_unet_path = os.path.dirname(os.path.dirname(self.test_cwd)) + "/pytorch/models/sdxl-scheduled-unet-3-tank"
+        prompt_encoder_path = os.path.dirname(os.path.dirname(self.test_cwd)) + "/pytorch/models/sdxl-prompt-encoder-tank"
         vmfb_name = f"{self.spec.input_mlir_stem}_{self.spec.test_name}.vmfb"

         self.compile_args = ["iree-compile", self.spec.input_mlir_name]
@@ -292,6 +295,8 @@
         self.run_args.extend(self.spec.iree_run_module_flags)
         self.run_args.append(f"--flagfile={self.spec.data_flagfile_name}")

+        self.benchmark_args = ["iree-benchmark-module", "--device=local-task", f"--module={prompt_encoder_path}/model_sdxl_cpu_llvm_task_real_weights.vmfb", f"--parameters=model={prompt_encoder_path}/real_weights.irpa", f"--module={scheduled_unet_path}/model_sdxl_cpu_llvm_task_real_weights.vmfb", f"--parameters=model={scheduled_unet_path}/real_weights.irpa", f"--module={vae_decode_path}/model_sdxl_cpu_llvm_task_real_weights.vmfb", f"--parameters=model={vae_decode_path}/real_weights.irpa", f"--module={vmfb_name}", "--function=tokens_to_image", "--input=1x4x128x128xf16", "--input=1xf16", "--input=1x64xi64", "--input=1x64xi64", "--input=1x64xi64", "--input=1x64xi64"]
+
     def runtest(self):
         if self.spec.skip_test:
             pytest.skip()
@@ -317,7 +322,12 @@ def runtest(self):
                     reason="Expected run to fail",
                 )
             )
-        self.test_run()
+        # self.test_run()
+        print("TEST DIR NAME: " + str(self.spec.test_directory.name))
+        if self.spec.test_directory.name == "sdxl-benchmark":
+            self.test_benchmark()
+        else:
+            self.test_run()

     def test_compile(self):
         proc = subprocess.run(self.compile_args, capture_output=True, cwd=self.test_cwd)
@@ -328,6 +338,13 @@ def test_run(self):
         proc = subprocess.run(self.run_args, capture_output=True, cwd=self.test_cwd)
         if proc.returncode != 0:
             raise IreeRunException(proc, self.test_cwd, self.compile_args)
+
+    def test_benchmark(self):
+        proc = subprocess.run(self.benchmark_args, capture_output=True, cwd=self.test_cwd)
+        if proc.returncode != 0:
+            raise IreeRunException(proc, self.test_cwd, self.compile_args)
+        outs = proc.stdout.decode("utf-8")
+        print(f"Stdout benchmark:\n{outs}\n")

Member:
Before this sort of change lands, let's think a bit about what we actually want coverage for. I'm skeptical about having benchmarks built into the same testing flow... though the suite that Stella set up has these:

@saienduri (Contributor, Author), Apr 3, 2024:
Just because we care so much about sdxl perf, I think it would be great to have it included in this flow. I didn't look into adding a whole separate flow for it or making it very scalable, because I doubt we will be adding benchmarks for anything else. That's why I also just went with hardcoded commands (the flags also have to live in conftest.py, because some flag values are path names relative to other directories that we resolve there). I was thinking we can evaluate and iterate if this becomes a bigger utility, but for now I went with the easiest, simplest way to add benchmarking. Here is an example log with everything running: https://github.com/nod-ai/SHARK-TestSuite/actions/runs/8543035287/job/23405926540

Member:
The value of continuous benchmarks is clear, but I want to be careful about how we integrate them. For this PR, can you leave the benchmarks off and focus on just adding the sdxl models? A follow-up PR can then add benchmarking.

I'd at least like to take some time to work through the specific requirements before jumping straight to an implementation. For example:

  • What metrics/artifacts do we want from benchmarking?
    • Each model in isolation? Full pipeline latency? Just dispatch time?
  • What do we want done with benchmark results / artifacts?
    • The in-tree benchmarks in IREE submit results to a dashboard (that should use a queryable database...), upload Tracy files to cloud storage, and comment on pending pull requests with results summaries
  • Where do we want benchmarks to run?
    • Right after tests, on presubmit to IREE?
    • In a separate job, on separate runners?

I'm also wondering if we want to use pytest as the benchmark runner (either with the existing conftest.py or a forked one), or if we would want to use another runner (we could start with a pile of scripts, just using the same test suite source files).
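
As a rough illustration of the pytest-as-runner option, here is a minimal sketch of a standalone benchmark test that shells out to iree-benchmark-module and leaves the numbers in the test log. Everything here (paths, case list, entry point) is an illustrative assumption, not part of this PR:

# Hypothetical sketch: a standalone pytest module that runs
# iree-benchmark-module and surfaces its output in the test log.
# The vmfb/irpa paths and the entry point are placeholders.
import subprocess

import pytest

BENCHMARK_CASES = [
    ("sdxl-prompt-encoder", ["--input=1x64xi64"] * 4),
]

@pytest.mark.parametrize("name,inputs", BENCHMARK_CASES)
def test_benchmark(name, inputs):
    args = [
        "iree-benchmark-module",
        "--device=local-task",
        f"--module={name}/model.vmfb",                   # assumed layout
        f"--parameters=model={name}/real_weights.irpa",  # assumed layout
        "--function=main",                               # assumed entry point
        *inputs,
    ]
    proc = subprocess.run(args, capture_output=True, text=True)
    assert proc.returncode == 0, proc.stderr
    # For now, developers dig the numbers out of the CI log.
    print(proc.stdout)
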

Member:
It might be reasonable to start with pytest -k benchmark or pytest iree_tests/benchmarks, just running iree-benchmark-module instead of iree-run-module, and then let developers dig through the GitHub Actions logs to see results. But I'm worried about going down the path of building an entirely new benchmark "framework" when we already have https://github.com/openxla/iree-comparative-benchmark and https://github.com/openxla/iree/tree/main/build_tools/benchmarks (building something new is likely going to make sense, at least in the short term, but this stuff gets complicated very quickly).

Member:
> this stuff gets complicated very quickly

For example: https://github.com/openxla/community/blob/main/rfcs/20230505-benchmarking-strategy.md

Member:
> (think only way to split by runner is having them on different jobs).

I believe that can be done with a matrix too: https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs#example-using-a-multi-dimension-matrix

Could then check other matrix parameters to choose which steps to run... maybe like this:

jobs:
  iree_tests:
    strategy:
      matrix:
        configuration:
          - runner: ubuntu-22.04
            test_command: "pytest iree_tests/simple --durations=0"
          - runner: ubuntu-22.04
            test_command: "pytest iree_tests/onnx/node/generated -n auto -rpfE --timeout=30 --retries 2 --retry-delay 5 --durations=10"
    runs-on: ${{ matrix.runner }}
    steps:
      ...

I'm also referencing https://github.com/openxla/iree/blob/573ff1ff02347266ed747dd316cefaeb4c710396/.github/workflows/ci.yml#L749-L784 (probably tons of other files to reference across GitHub, but that's what I know already...)

Member:
If we wanted to plug in to the in-tree benchmark infrastructure that IREE has, we'd want a PR like iree-org/iree#16965. That would feed into https://perf.iree.dev/ and PR comments, but it doesn't also test correctness out of the box, can be tricky to update (multiple Python files, coupled with GitHub Actions), and puts input files / parameters behind a few levels of abstraction that make it harder to run locally.

@saienduri (Contributor, Author), Apr 4, 2024:
Yeah, I also wonder how that would work, because we need to essentially compile multiple submodels and then use their vmfbs for the pipeline's vmfb. Not sure if it is set up for a pipeline structure.
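
For reference, that pipeline structure amounts to a single iree-benchmark-module invocation that registers each submodel's vmfb together with its weights and then calls the pipeline module's entry point. A condensed sketch of the benchmark_args that this PR's conftest.py builds (paths shortened; the pipeline vmfb name here is illustrative):

# Condensed sketch of the multi-module invocation from conftest.py:
# each submodel vmfb is registered alongside its weights, and the
# pipeline module's tokens_to_image entry point ties them together.
submodels = [
    "sdxl-prompt-encoder-tank",
    "sdxl-scheduled-unet-3-tank",
    "sdxl-vae-decode-tank",
]
benchmark_args = ["iree-benchmark-module", "--device=local-task"]
for name in submodels:
    root = f"pytorch/models/{name}"  # resolved relative to the test dir in conftest.py
    benchmark_args += [
        f"--module={root}/model_sdxl_cpu_llvm_task_real_weights.vmfb",
        f"--parameters=model={root}/real_weights.irpa",
    ]
benchmark_args += [
    "--module=sdxl_pipeline.vmfb",  # illustrative; conftest.py derives the name from the test spec
    "--function=tokens_to_image",
    "--input=1x4x128x128xf16",
    "--input=1xf16",
    "--input=1x64xi64",
    "--input=1x64xi64",
    "--input=1x64xi64",
    "--input=1x64xi64",
]
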

@saienduri (Contributor, Author):
Also, I've placed the onnx and model tests in different jobs. I think that's best for this suite. Because they don't depend on each other and run independently on different machines, I don't think we need the sequential steps. This way we have parallel execution, which can help with scalability in the future. Once we get more machines, splitting on models would also be great :)

@saienduri (Contributor, Author):
Here is the PR for benchmarking that should land after this one: #155. Feel free to add notes there for future reference.


     def repr_failure(self, excinfo):
         """Called when self.runtest() raises an exception."""
8 changes: 8 additions & 0 deletions iree_tests/pytorch/models/sdxl-prompt-encoder-tank/real_weights_data_flags.txt
@@ -0,0 +1,8 @@
--parameters=model=real_weights.irpa
--input=1x64xi64=@inference_input.0.bin
--input=1x64xi64=@inference_input.1.bin
--input=1x64xi64=@inference_input.2.bin
--input=1x64xi64=@inference_input.3.bin
--expected_output=2x64x2048xf16=@inference_output.0.bin
--expected_output=2x1280xf16=@inference_output.1.bin
--expected_f16_threshold=1.0f
5 changes: 5 additions & 0 deletions iree_tests/pytorch/models/sdxl-prompt-encoder-tank/splat_data_flags.txt
@@ -0,0 +1,5 @@
--input="1x64xi64"
--input="1x64xi64"
--input="1x64xi64"
--input="1x64xi64"
--parameters=splats.irpa
29 changes: 29 additions & 0 deletions iree_tests/pytorch/models/sdxl-prompt-encoder-tank/test_cases.json
@@ -0,0 +1,29 @@
{
"test_cases": [
{
"name": "splats",
"runtime_flagfile": "splat_data_flags.txt",
"remote_file_groups": []
},
{
"name": "real_weights",
"runtime_flagfile": "real_weights_data_flags.txt",
"remote_file_groups": [
{
"azure_account_url": "https://sharkpublic.blob.core.windows.net",
"azure_container_name": "sharkpublic",
"azure_base_blob_name": "sai/sdxl-prompt-encoder/",
"files": [
"inference_input.0.bin",
"inference_input.1.bin",
"inference_input.2.bin",
"inference_input.3.bin",
"inference_output.0.bin",
"inference_output.1.bin",
"real_weights.irpa"
]
}
]
}
]
}

Member, commenting on lines +24 to +29:
Seeing more of these files, I'm still thinking about how to keep them easy to update. Might refactor to separate JSON files, like:

test_case_splats.json:

{
  "name": "splats",
  "runtime_flagfile": "splat_data_flags.txt",
  "remote_files": []
}

test_case_real_weights.json:

{
  "name": "real_weights",
  "runtime_flagfile": "real_weights_data_flags.txt",
  "remote_files": [
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_input.0.bin",
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_input.1.bin",
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_input.2.bin",
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_input.3.bin",
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_output.0.bin",
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_output.1.bin",
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/real_weights.irpa"
  ]
}

@saienduri (Contributor, Author):
Hmm yeah, this is probably easier to decode/update.
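
For what it's worth, a sketch of what the consumer side of that flatter schema could look like; this is a hypothetical helper, not part of this PR or of download_remote_files.py today:

# Hypothetical sketch for the proposed flat "remote_files" schema:
# download each URL into the test case directory, skipping files that
# already exist locally.
import json
import urllib.request
from pathlib import Path

def download_remote_files(test_dir: Path, case_file: str) -> None:
    case = json.loads((test_dir / case_file).read_text())
    for url in case.get("remote_files", []):
        dest = test_dir / url.rsplit("/", 1)[-1]
        if not dest.exists():  # naive cache; no checksum validation
            urllib.request.urlretrieve(url, dest)
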

8 changes: 8 additions & 0 deletions iree_tests/pytorch/models/sdxl-scheduled-unet-3-tank/real_weights_data_flags.txt
@@ -0,0 +1,8 @@
--parameters=model=real_weights.irpa
--module=sdxl_scheduled_unet_pipeline_fp16_.vmfb
--input=1x4x128x128xf16=@inference_input.0.bin
--input=2x64x2048xf16=@inference_input.1.bin
--input=2x1280xf16=@inference_input.2.bin
--input=1xf16=@inference_input.3.bin
--expected_output=1x4x128x128xf16=@inference_output.0.bin
--expected_f16_threshold=1.5f
5 changes: 5 additions & 0 deletions iree_tests/pytorch/models/sdxl-scheduled-unet-3-tank/splat_data_flags.txt
@@ -0,0 +1,5 @@
--input="1x4x128x128xf16"
--input="2x64x2048xf16"
--input="2x1280xf16"
--input="1xf16"
--parameters=splats.irpa
28 changes: 28 additions & 0 deletions iree_tests/pytorch/models/sdxl-scheduled-unet-3-tank/test_cases.json
@@ -0,0 +1,28 @@
{
"test_cases": [
{
"name": "splats",
"runtime_flagfile": "splat_data_flags.txt",
"remote_file_groups": []
},
{
"name": "real_weights",
"runtime_flagfile": "real_weights_data_flags.txt",
"remote_file_groups": [
{
"azure_account_url": "https://sharkpublic.blob.core.windows.net",
"azure_container_name": "sharkpublic",
"azure_base_blob_name": "sai/sdxl-scheduled-unet/",
"files": [
"inference_input.0.bin",
"inference_input.1.bin",
"inference_input.2.bin",
"inference_input.3.bin",
"inference_output.0.bin",
"real_weights.irpa"
]
}
]
}
]
}
4 changes: 4 additions & 0 deletions iree_tests/pytorch/models/sdxl-vae-decode-tank/real_weights_data_flags.txt
@@ -0,0 +1,4 @@
--parameters=model=real_weights.irpa
--input=1x4x128x128xf16=@inference_input.0.bin
--expected_output=1x3x1024x1024xf16=@inference_output.0.bin
--expected_f16_threshold=0.02f
2 changes: 2 additions & 0 deletions iree_tests/pytorch/models/sdxl-vae-decode-tank/splat_data_flags.txt
@@ -0,0 +1,2 @@
--input="1x4x128x128xf16"
--parameters=splats.irpa
25 changes: 25 additions & 0 deletions iree_tests/pytorch/models/sdxl-vae-decode-tank/test_cases.json
@@ -0,0 +1,25 @@
{
"test_cases": [
{
"name": "splats",
"runtime_flagfile": "splat_data_flags.txt",
"remote_file_groups": []
},
{
"name": "real_weights",
"runtime_flagfile": "real_weights_data_flags.txt",
"remote_file_groups": [
{
"azure_account_url": "https://sharkpublic.blob.core.windows.net",
"azure_container_name": "sharkpublic",
"azure_base_blob_name": "sai/sdxl-vae-decode/",
"files": [
"inference_input.0.bin",
"inference_output.0.bin",
"real_weights.irpa"
]
}
]
}
]
}