From 889e5cbc0943958fe111f4ec373a2301832d4dd1 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 25 Jul 2024 15:52:54 -0700 Subject: [PATCH 01/75] Enable SPIR-V compiler optimization (#4402) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4402 Call the SPIR-V compiler with the `-O` flag, which enables optimizations when compiling GLSL to SPIR-V. The `-Os` flag (which tries to minimize SPIR-V size) was tested as well, but resulted in (very) slightly worse performance. Reviewed By: jorgep31415 Differential Revision: D60193514 fbshipit-source-id: 2dfb999fb1951a63a990773ab563a1a3a3c304b0 --- backends/vulkan/runtime/gen_vulkan_spv.py | 44 +++++++++++++++-------- backends/vulkan/targets.bzl | 29 +++++++++------ 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index c9e3aaa31e..c734ed395e 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -482,6 +482,7 @@ def __init__( src_dir_paths: Union[str, List[str]], env: Dict[Any, Any], glslc_path: Optional[str], + glslc_flags: str = "", ) -> None: if isinstance(src_dir_paths, str): self.src_dir_paths = [src_dir_paths] @@ -490,6 +491,7 @@ def __init__( self.env = env self.glslc_path = glslc_path + self.glslc_flags = glslc_flags self.glsl_src_files: Dict[str, str] = {} self.template_yaml_files: List[str] = [] @@ -668,19 +670,23 @@ def process_shader(shader_paths_pair): if self.glslc_path is not None: spv_out_path = os.path.join(output_dir, f"{shader_name}.spv") - cmd = [ - self.glslc_path, - "-fshader-stage=compute", - glsl_out_path, - "-o", - spv_out_path, - "--target-env=vulkan1.1", - "-Werror", - ] + [ - arg - for src_dir_path in self.src_dir_paths - for arg in ["-I", src_dir_path] - ] + cmd = ( + [ + self.glslc_path, + "-fshader-stage=compute", + glsl_out_path, + "-o", + spv_out_path, + "--target-env=vulkan1.1", + "-Werror", + ] + + [ + arg + for src_dir_path in self.src_dir_paths + for arg in ["-I", src_dir_path] + ] + + self.glslc_flags.split() + ) subprocess.check_call(cmd) @@ -966,6 +972,8 @@ def main(argv: List[str]) -> int: parser.add_argument("-c", "--glslc-path", required=True, help="") parser.add_argument("-t", "--tmp-dir-path", required=True, help="/tmp") parser.add_argument("-o", "--output-path", required=True, help="") + parser.add_argument("--optimize_size", action="store_true", help="") + parser.add_argument("--optimize", action="store_true", help="") parser.add_argument( "--env", metavar="KEY=VALUE", nargs="*", help="Set a number of key-value pairs" ) @@ -984,7 +992,15 @@ def main(argv: List[str]) -> int: if not os.path.exists(options.tmp_dir_path): os.makedirs(options.tmp_dir_path) - shader_generator = SPVGenerator(options.glsl_paths, env, options.glslc_path) + glslc_flags = "" + if options.optimize_size: + glslc_flags += "-Os" + elif options.optimize: + glslc_flags += "-O" + + shader_generator = SPVGenerator( + options.glsl_paths, env, options.glslc_path, glslc_flags + ) output_spv_files = shader_generator.generateSPV(options.tmp_dir_path) genCppFiles( diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl index 981552f17a..e8b232098b 100644 --- a/backends/vulkan/targets.bzl +++ b/backends/vulkan/targets.bzl @@ -1,12 +1,15 @@ +load("@fbcode_macros//build_defs:native_rules.bzl", "buck_genrule") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def get_vulkan_compiler_flags(): return ["-Wno-missing-prototypes", 
"-Wno-global-constructors"] def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False): - gen_vulkan_spv_target = "//executorch/backends/vulkan:gen_vulkan_spv_bin" - glslc_path = "//caffe2/fb/vulkan/dotslash:glslc" + gen_vulkan_spv_target = "//xplat/executorch/backends/vulkan:gen_vulkan_spv_bin" + glslc_path = "//xplat/caffe2/fb/vulkan/dotslash:glslc" + if is_fbcode: + gen_vulkan_spv_target = "//executorch/backends/vulkan:gen_vulkan_spv_bin" glslc_path = "//caffe2/fb/vulkan/tools:glslc" glsl_paths = [] @@ -15,21 +18,25 @@ def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False): for target, subpath in spv_filegroups.items(): glsl_paths.append("$(location {})/{}".format(target, subpath)) - genrule_cmd = [ - "$(exe {})".format(gen_vulkan_spv_target), - "--glsl-paths {}".format(" ".join(glsl_paths)), - "--output-path $OUT", - "--glslc-path=$(exe {})".format(glslc_path), - "--tmp-dir-path=$OUT", - ] + genrule_cmd = ( + "$(exe {}) ".format(gen_vulkan_spv_target) + + "--glsl-paths {} ".format(" ".join(glsl_paths)) + + "--output-path $OUT " + + "--glslc-path=$(exe {}) ".format(glslc_path) + + "--tmp-dir-path=$OUT " + + select({ + "DEFAULT": "", + "ovr_config//os:android": "--optimize", + }) + ) genrule_name = "gen_{}_cpp".format(name) - runtime.genrule( + buck_genrule( name = genrule_name, outs = { "{}.cpp".format(name): ["spv.cpp"], }, - cmd = " ".join(genrule_cmd), + cmd = genrule_cmd, default_outs = ["."], labels = ["uses_dotslash"], ) From faeeca8ec9040ae2db23973139c1b5f71ea51d4c Mon Sep 17 00:00:00 2001 From: Wei Lu Date: Thu, 25 Jul 2024 21:23:45 -0700 Subject: [PATCH 02/75] remove unused tensors from VK model's graph (#4427) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4427 We implemented [operators fusion](https://github.com/pytorch/executorch/pull/3769?fbclid=IwZXh0bgNhZW0CMTEAAR3kYya0wRrkupmV86OpPZZ9_QhqLYEmNrKcJk5Jj_4VSO_WqvFsbWNigTs_aem_gQeSu2zvazf_hpy3RsIXhg) (`conv+bn`) which fused `conv` and `bn`'s weights and biases, but the old parameters are not deleted. Hence we saw that VK model's size is nearly twice large as CPU's. As regards mobilenet_v2, before this diff CPU vs VK: 14M vs 22M. After this diff, both of them have 14M. 
Reviewed By: SS-JIA Differential Revision: D60257047 fbshipit-source-id: ca9e0f38d53187edff9dba45fdeffa619fde51a7 --- backends/vulkan/serialization/vulkan_graph_builder.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py index 477e54a2d7..da40f0a720 100644 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ b/backends/vulkan/serialization/vulkan_graph_builder.py @@ -262,6 +262,9 @@ def get_or_create_value_for(self, arg: _Argument): raise RuntimeError(f"Cannot create value for arg of type {type(arg)}") def process_placeholder_node(self, node: Node) -> None: + # ignores any tensors that don't get used in any ops + if len(node.users) == 0: + return None ids = self.create_node_value(node) if not self.is_param_node(node): if isinstance(ids, int): From 11407f05edf6e5304dc3199c9b4bad345387d5a2 Mon Sep 17 00:00:00 2001 From: Avik Chaudhuri Date: Fri, 26 Jul 2024 01:39:05 -0700 Subject: [PATCH 03/75] immutable accessors in graph signature (#4428) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4428 X-link: https://github.com/pytorch/pytorch/pull/131807 bypass-github-export-checks Test failures either unrelated or due to cross-dependencies between repos bypass-github-executorch-ci-checks bypass-github-pytorch-ci-checks Reviewed By: ydwu4 Differential Revision: D60253955 fbshipit-source-id: eb6eb65bf17fd7e20287881a297d9eac2cbee691 --- exir/passes/constant_prop_pass.py | 6 +++--- exir/program/_program.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py index 354a1b071f..6ab3abbd7b 100644 --- a/exir/passes/constant_prop_pass.py +++ b/exir/passes/constant_prop_pass.py @@ -212,11 +212,11 @@ def erase_constant_node( ) -> None: # Remove corresponding tensor from param/constants dict. 
signature = exported_program.graph_signature - if name := signature.inputs_to_parameters.pop(node.name, None): + if name := signature.inputs_to_parameters.get(node.name, None): exported_program.state_dict.pop(name, None) - elif name := signature.inputs_to_lifted_tensor_constants.pop(node.name, None): + elif name := signature.inputs_to_lifted_tensor_constants.get(node.name, None): exported_program.constants.pop(name, None) - elif name := signature.inputs_to_buffers.pop(node.name, None): + elif name := signature.inputs_to_buffers.get(node.name, None): exported_program.constants.pop(name, None) exported_program.state_dict.pop(name, None) diff --git a/exir/program/_program.py b/exir/program/_program.py index 6fb9eca46f..fd6253a8aa 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -225,7 +225,7 @@ def lift_constant_tensor_pass(ep): return ep graph_signature = ep.graph_signature - buffers = graph_signature.buffers + buffers = list(graph_signature.buffers) fake_mode = list(ep.graph.nodes)[0].meta["val"].fake_mode first_user_input = None From 5d3ec1323183aa1bcbba8026986d2aca3fab88d3 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 26 Jul 2024 10:11:33 -0700 Subject: [PATCH 04/75] Revert D60253955: immutable accessors in graph signature Differential Revision: D60253955 Original commit changeset: eb6eb65bf17f Original Phabricator Diff: D60253955 fbshipit-source-id: f203ef791da6f7efa40bf51a6e905eba65cb6b47 --- exir/passes/constant_prop_pass.py | 6 +++--- exir/program/_program.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py index 6ab3abbd7b..354a1b071f 100644 --- a/exir/passes/constant_prop_pass.py +++ b/exir/passes/constant_prop_pass.py @@ -212,11 +212,11 @@ def erase_constant_node( ) -> None: # Remove corresponding tensor from param/constants dict. 
signature = exported_program.graph_signature - if name := signature.inputs_to_parameters.get(node.name, None): + if name := signature.inputs_to_parameters.pop(node.name, None): exported_program.state_dict.pop(name, None) - elif name := signature.inputs_to_lifted_tensor_constants.get(node.name, None): + elif name := signature.inputs_to_lifted_tensor_constants.pop(node.name, None): exported_program.constants.pop(name, None) - elif name := signature.inputs_to_buffers.get(node.name, None): + elif name := signature.inputs_to_buffers.pop(node.name, None): exported_program.constants.pop(name, None) exported_program.state_dict.pop(name, None) diff --git a/exir/program/_program.py b/exir/program/_program.py index fd6253a8aa..6fb9eca46f 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -225,7 +225,7 @@ def lift_constant_tensor_pass(ep): return ep graph_signature = ep.graph_signature - buffers = list(graph_signature.buffers) + buffers = graph_signature.buffers fake_mode = list(ep.graph.nodes)[0].meta["val"].fake_mode first_user_input = None From 91298923a0076c1b41059efb6dad2876426e4b03 Mon Sep 17 00:00:00 2001 From: Avik Chaudhuri Date: Fri, 26 Jul 2024 12:45:15 -0700 Subject: [PATCH 05/75] immutable accessors in graph signature (#4433) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4433 splitting ET part of D60253955 Reviewed By: guangy10, zhxchen17 Differential Revision: D60295940 fbshipit-source-id: 4ad9a661a50db9b9e9bccbc13b232416d7264a49 --- exir/passes/constant_prop_pass.py | 6 +++--- exir/program/_program.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py index 354a1b071f..6ab3abbd7b 100644 --- a/exir/passes/constant_prop_pass.py +++ b/exir/passes/constant_prop_pass.py @@ -212,11 +212,11 @@ def erase_constant_node( ) -> None: # Remove corresponding tensor from param/constants dict. signature = exported_program.graph_signature - if name := signature.inputs_to_parameters.pop(node.name, None): + if name := signature.inputs_to_parameters.get(node.name, None): exported_program.state_dict.pop(name, None) - elif name := signature.inputs_to_lifted_tensor_constants.pop(node.name, None): + elif name := signature.inputs_to_lifted_tensor_constants.get(node.name, None): exported_program.constants.pop(name, None) - elif name := signature.inputs_to_buffers.pop(node.name, None): + elif name := signature.inputs_to_buffers.get(node.name, None): exported_program.constants.pop(name, None) exported_program.state_dict.pop(name, None) diff --git a/exir/program/_program.py b/exir/program/_program.py index 6fb9eca46f..fd6253a8aa 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -225,7 +225,7 @@ def lift_constant_tensor_pass(ep): return ep graph_signature = ep.graph_signature - buffers = graph_signature.buffers + buffers = list(graph_signature.buffers) fake_mode = list(ep.graph.nodes)[0].meta["val"].fake_mode first_user_input = None From 5a20a49517c5c05a71692d6d6885735a2cd30bb1 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Fri, 26 Jul 2024 13:52:34 -0700 Subject: [PATCH 06/75] Fix numpy and pandas versions. 
 (#4430)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4430

Numpy has to match the coremltools requirements, and Pandas depends on Numpy: https://github.com/apple/coremltools/blob/main/reqs/build.pip

Reviewed By: kirklandsign

Differential Revision: D60265982

fbshipit-source-id: c84dd319c19fb48dc6d4ad3ffc8accd1fdc9b840
---
 .ci/docker/requirements-ci.txt | 6 +++++-
 pyproject.toml                 | 7 +++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 3a0cd57ddb..c33cc533c0 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -1,5 +1,7 @@
 mpmath==1.3.0
-numpy==1.25.2
+numpy==1.21.3; python_version == '3.10'
+numpy==1.23.2; python_version == '3.11'
+numpy; python_version >= '3.12'
 PyYAML==6.0.1
 ruamel.yaml==0.17.32
 sympy==1.12
@@ -8,6 +10,8 @@ tomli==2.0.1
 torchsr==1.0.4
 transformers==4.38.0
 zstd==1.5.5.1
+pandas==2.0.3; python_version == '3.10'
+pandas; python_version >= '3.11'
 pytest==7.2.0
 pytest-cov==4.1.0
 expecttest==0.1.6
diff --git a/pyproject.toml b/pyproject.toml
index b23091cc5f..e83fe2bc2a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,9 +55,12 @@ dependencies=[
   "flatbuffers",
   "hypothesis",
   "mpmath==1.3.0",
-  "numpy>=1.25.2",
+  "numpy==1.21.3; python_version == '3.10'",
+  "numpy==1.23.2; python_version == '3.11'",
+  "numpy; python_version >= '3.12'",
   "packaging",
-  "pandas",
+  "pandas==2.0.3; python_version == '3.10'",
+  "pandas; python_version >= '3.11'",
   "parameterized",
   "pytest",
   "pytest-xdist",

From 1e4603d2e8264d61a006a1a27258214c15d465ce Mon Sep 17 00:00:00 2001
From: Gyanendra Sinha
Date: Mon, 29 Jul 2024 00:24:58 -0700
Subject: [PATCH 07/75] FileDataLoader fails to read the file when size >
 INT32_MAX (#4435)

Summary:
On macOS, the `read` function will fail with an `EINVAL` error if the size parameter exceeds `INT32_MAX`. This update addresses the issue by adding a check to ensure that the read size does not surpass `INT32_MAX`.

On Linux, the maximum permissible read size is 2,147,479,552 bytes (< `INT32_MAX`), so attempting to read beyond this limit is inconsequential.

Pull Request resolved: https://github.com/pytorch/executorch/pull/4435

Test Plan:
Exporting llama3 with `python -m examples.models.llama2.export_llama --checkpoint examples/models/llama-2-7B/consolidated.00.pth --params examples/models/llama-2-7B/params.json --coreml --disable_dynamic_shape -kv`

Without fix: fails with `invalid argument` error.

With fix: succeeds.

Reviewed By: kirklandsign

Differential Revision: D60321719

Pulled By: shoumikhin

fbshipit-source-id: fca265c6c1edc628b38a5044693ec7bbe0c0b43a
---
 extension/data_loader/file_data_loader.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/extension/data_loader/file_data_loader.cpp b/extension/data_loader/file_data_loader.cpp
index 651bc713db..7b041fef00 100644
--- a/extension/data_loader/file_data_loader.cpp
+++ b/extension/data_loader/file_data_loader.cpp
@@ -8,9 +8,11 @@

 #include <executorch/extension/data_loader/file_data_loader.h>

+#include <algorithm>
 #include <cerrno>
 #include <cstddef>
 #include <cstring>
+#include <limits>

 #include <fcntl.h>
 #include <sys/stat.h>
@@ -189,7 +191,12 @@ Result<FreeableBuffer> FileDataLoader::load(
   size_t needed = size;
   uint8_t* buf = reinterpret_cast<uint8_t*>(aligned_buffer);
   while (needed > 0) {
-    ssize_t nread = ::read(fd_, buf, needed);
+    // Reads on macos will fail with EINVAL if size > INT32_MAX.
+    ssize_t nread = ::read(
+        fd_,
+        buf,
+        std::min(
+            needed, static_cast<size_t>(std::numeric_limits<int32_t>::max())));
     if (nread < 0 && errno == EINTR) {
       // Interrupted by a signal; zero bytes read.
      continue;

From dd88708719b488db6fd89c9b9846a207ea22f001 Mon Sep 17 00:00:00 2001
From: Scott Wolchok
Date: Mon, 29 Jul 2024 12:15:06 -0700
Subject: [PATCH 08/75] Hoist numel out of loop condition in op_embedding
 (#4146)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4146

This seems to save a single instruction in the inner loop (on x86, but I expect other architectures to be similar).
ghstack-source-id: 235302150

Reviewed By: tarun292

Differential Revision: D59335729

fbshipit-source-id: cf22669ffd8b127e60d863e4bc7858f994d8b1ce
---
 kernels/portable/cpu/op_embedding.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernels/portable/cpu/op_embedding.cpp b/kernels/portable/cpu/op_embedding.cpp
index 19e915ebc0..ffa43da739 100644
--- a/kernels/portable/cpu/op_embedding.cpp
+++ b/kernels/portable/cpu/op_embedding.cpp
@@ -37,7 +37,8 @@ void embedding_kernel(
   char* out_data = out.mutable_data_ptr<char>();
   const CTYPE* indices_ptr = indices.const_data_ptr<CTYPE>();
   ssize_t weight_height = weight.size(0);
-  for (int i = 0; i < indices.numel(); i++) {
+  const auto indices_numel = indices.numel();
+  for (int i = 0; i < indices_numel; i++) {
     // Ensure index is larger than 0 and smaller than weight.size(0)
     ET_KERNEL_CHECK_MSG(
         ctx,

From e087ac83fcfa5e051b6ab812ed969c3768b63c4b Mon Sep 17 00:00:00 2001
From: winskuo-quic
Date: Mon, 29 Jul 2024 15:03:31 -0700
Subject: [PATCH 09/75] Qualcomm AI Engine Direct - Fix UT example script hang
 when an exception happens (#4355)

Summary:
- Fix UT example script hang when an exception happens during execution: while the main process is waiting for the child process to return a message, the child process exits without the exception being properly caught.
- Remove the RemoveRedundancy pass from the quantizer to resolve memory format issues while quantizing.
- Prevent constants from being dequantized twice in the AnnotateQuantAttrs pass

Pull Request resolved: https://github.com/pytorch/executorch/pull/4355

Reviewed By: kirklandsign

Differential Revision: D60177584

Pulled By: cccclai

fbshipit-source-id: fc4d277b8eef05bd42c4eae2b9aa67236f53cc32
---
 .../qualcomm/passes/annotate_quant_attrs.py   |   7 +-
 .../passes/recompose_pixel_unshuffle.py       |   8 +-
 backends/qualcomm/quantizer/quantizer.py      |   2 -
 backends/qualcomm/tests/test_qnn_delegate.py  | 116 +++++++++++++-----
 examples/qualcomm/llama2/llama.py             |   9 +-
 examples/qualcomm/oss_scripts/dino_v2.py      |  60 +++++----
 examples/qualcomm/oss_scripts/esrgan.py       |  98 ++++++++-------
 examples/qualcomm/oss_scripts/fbnet.py        |  58 +++++----
 .../oss_scripts/gMLP_image_classification.py  |  59 +++++----
 examples/qualcomm/oss_scripts/squeezenet.py   |  62 ++++++----
 examples/qualcomm/oss_scripts/ssd300_vgg16.py |  98 ++++++++-------
 examples/qualcomm/scripts/deeplab_v3.py       |  52 ++++----
 examples/qualcomm/scripts/edsr.py             |  84 +++++++------
 examples/qualcomm/scripts/inception_v3.py     |  62 ++++++----
 examples/qualcomm/scripts/inception_v4.py     |  62 ++++++----
 .../qualcomm/scripts/mobilebert_fine_tune.py  |  82 +++++++------
 examples/qualcomm/scripts/mobilenet_v2.py     |  62 ++++++----
 examples/qualcomm/scripts/mobilenet_v3.py     |  62 ++++++----
 examples/qualcomm/scripts/torchvision_vit.py  |  56 +++++----
 examples/qualcomm/scripts/utils.py            |   2 +-
 20 files changed, 652 insertions(+), 449 deletions(-)

diff --git a/backends/qualcomm/passes/annotate_quant_attrs.py b/backends/qualcomm/passes/annotate_quant_attrs.py
index 199d26b026..0dc39d2a4d 100644
--- a/backends/qualcomm/passes/annotate_quant_attrs.py
+++ b/backends/qualcomm/passes/annotate_quant_attrs.py
@@ -94,9 +94,11 @@ def _dequant_fold_params(self, n, quant_attrs, param):
     def _annotate_quant_attrs(
         self, graph_module: torch.fx.GraphModule
     ) -> torch.fx.GraphModule:
+        # Keep track of const params that has been dequant, so it does not get
+        # dequant multiple times if the const param has more than 1 user
+        visited_const_param = set()
         for n in graph_module.graph.nodes:
             self._annotate_requant(n)
-
         # With fold_quant enabled, check if the input of dq op is quantized param.
param = None if n.target in dq_ops: @@ -106,7 +108,8 @@ def _annotate_quant_attrs( quant_attrs = get_quant_attrs(self.edge_program, n) self._annotate_source_nodes(n, quant_attrs) - if param is not None: + if param is not None and n.args[0] not in visited_const_param: + visited_const_param.add(n.args[0]) self._dequant_fold_params(n, quant_attrs, param) return graph_module diff --git a/backends/qualcomm/passes/recompose_pixel_unshuffle.py b/backends/qualcomm/passes/recompose_pixel_unshuffle.py index cadc310bbb..a47f3d119a 100644 --- a/backends/qualcomm/passes/recompose_pixel_unshuffle.py +++ b/backends/qualcomm/passes/recompose_pixel_unshuffle.py @@ -35,7 +35,13 @@ def call(self, graph_module: torch.fx.GraphModule): for node in graph.nodes: if node.op == "call_function" and node.target == self.reshape_target: with graph.inserting_after(node): - premute_node = node.args[0] + + # Clone op still exists between permute and reshape_target during quantization, + # so we need to check for args[0].args[0] to get permute node + if self.quantization_capture: + premute_node = node.args[0].args[0] + else: + premute_node = node.args[0] if any( [ len(node.args[1]) != 4, diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index 91e31b62e4..d51e016473 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -12,7 +12,6 @@ RecomposePixelUnshuffle, ) from executorch.backends.qualcomm.passes.reduce_dynamic_range import ReduceDynamicRange -from executorch.backends.qualcomm.passes.remove_redundancy import RemoveRedundancy from executorch.backends.qualcomm.passes.replace_inf_buffer import ReplaceInfBuffer from executorch.backends.transforms.decompose_sdpa import ( DecomposeScaledDotProductAttention, @@ -182,7 +181,6 @@ def set_per_channel_linear_quant(self, enable: bool) -> None: self._update_per_channel_weight_quant_ops(linear_ops, enable) def transform_for_annotation(self, model: GraphModule) -> GraphModule: - model = RemoveRedundancy()(model).graph_module model = ReduceDynamicRange()(model).graph_module model = RecomposePixelUnshuffle(quantization_capture=True)(model).graph_module model = DecomposeScaledDotProductAttention()(model).graph_module diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 508a027da6..f9d05131bb 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -1581,8 +1581,11 @@ def test_fbnet(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 90) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 90) def test_gMLP(self): if not self.required_envs([self.image_dataset]): @@ -1614,8 +1617,11 @@ def test_gMLP(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 90) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 90) def test_ssd300_vgg16(self): if not self.required_envs([self.pretrained_weight, self.oss_repo]): @@ -1649,7 +1655,10 @@ def test_ssd300_vgg16(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["mAP"], 0.70) + if "Error" in msg: + 
self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["mAP"], 0.70) def test_dino_v2(self): if not self.required_envs([self.image_dataset]): @@ -1680,8 +1689,11 @@ def test_dino_v2(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 70) - self.assertGreaterEqual(msg["top_5"], 85) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 70) + self.assertGreaterEqual(msg["top_5"], 85) def test_esrgan(self): if not self.required_envs(): @@ -1714,8 +1726,11 @@ def test_esrgan(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["PSNR"], 24) - self.assertGreaterEqual(msg["SSIM"], 0.8) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["PSNR"], 24) + self.assertGreaterEqual(msg["SSIM"], 0.8) def test_squeezenet(self): if not self.required_envs([self.image_dataset]): @@ -1747,8 +1762,11 @@ def test_squeezenet(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 40) - self.assertGreaterEqual(msg["top_5"], 70) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 40) + self.assertGreaterEqual(msg["top_5"], 70) class TestExampleScript(TestQNN): @@ -1794,8 +1812,11 @@ def test_mobilenet_v2(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 80) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 80) def test_mobilenet_v3(self): if not self.required_envs([self.image_dataset]): @@ -1829,8 +1850,11 @@ def test_mobilenet_v3(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 80) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 80) def test_inception_v3(self): if not self.required_envs([self.image_dataset]): @@ -1864,8 +1888,11 @@ def test_inception_v3(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 80) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 80) def test_inception_v4(self): if not self.required_envs([self.image_dataset]): @@ -1899,8 +1926,11 @@ def test_inception_v4(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 80) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 80) def test_vit(self): if not self.required_envs([self.image_dataset]): @@ -1934,8 +1964,11 @@ def test_vit(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 70) - self.assertGreaterEqual(msg["top_5"], 90) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 70) + self.assertGreaterEqual(msg["top_5"], 90) def test_edsr(self): if not self.required_envs(): @@ -1968,8 +2001,11 @@ def test_edsr(self): conn = listener.accept() 
p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["PSNR"], 25) - self.assertGreaterEqual(msg["SSIM"], 0.8) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["PSNR"], 25) + self.assertGreaterEqual(msg["SSIM"], 0.8) def test_deeplab_v3(self): if not self.required_envs(): @@ -2002,9 +2038,12 @@ def test_deeplab_v3(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["PA"], 0.85) - self.assertGreaterEqual(msg["MPA"], 0.70) - self.assertGreaterEqual(msg["MIoU"], 0.55) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["PA"], 0.85) + self.assertGreaterEqual(msg["MPA"], 0.70) + self.assertGreaterEqual(msg["MIoU"], 0.55) def test_stories_single_llama(self): if not self.required_envs(): @@ -2049,8 +2088,11 @@ def test_stories_single_llama(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - model_out = msg["result"][0] - self.assertTrue(model_out.startswith(golden_start_with)) + if "Error" in msg: + self.fail(msg["Error"]) + else: + model_out = msg["result"][0] + self.assertTrue(model_out.startswith(golden_start_with)) def test_mobilebert(self): if not self.required_envs([self.pretrained_weight]): @@ -2085,9 +2127,12 @@ def test_mobilebert(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - cpu, htp = msg["CPU"], msg["HTP"] - for k, v in cpu.items(): - self.assertLessEqual(abs(v[0] - htp[k][0]), 2) + if "Error" in msg: + self.fail(msg["Error"]) + else: + cpu, htp = msg["CPU"], msg["HTP"] + for k, v in cpu.items(): + self.assertLessEqual(abs(v[0] - htp[k][0]), 2) @unittest.skip("will be enabled after TODOs got resolved") def test_ptq_mobilebert(self): @@ -2127,9 +2172,12 @@ def test_ptq_mobilebert(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - cpu, htp = msg["CPU"], msg["HTP"] - for k, v in cpu.items(): - self.assertLessEqual(abs(v[0] - htp[k][0]), 5) + if "Error" in msg: + self.fail(msg["Error"]) + else: + cpu, htp = msg["CPU"], msg["HTP"] + for k, v in cpu.items(): + self.assertLessEqual(abs(v[0] - htp[k][0]), 5) def test_export_example(self): if not self.required_envs([self.model_name]): diff --git a/examples/qualcomm/llama2/llama.py b/examples/qualcomm/llama2/llama.py index 79cf5606d6..6e0f3f4399 100644 --- a/examples/qualcomm/llama2/llama.py +++ b/examples/qualcomm/llama2/llama.py @@ -586,4 +586,11 @@ def post_process(): if args.compile_only: exit(f"Finish compile_only and save to {args.artifact}") - inference(args) + try: + inference(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/dino_v2.py b/examples/qualcomm/oss_scripts/dino_v2.py index e4d4c6af25..03249b63d8 100644 --- a/examples/qualcomm/oss_scripts/dino_v2.py +++ b/examples/qualcomm/oss_scripts/dino_v2.py @@ -67,31 +67,7 @@ def get_instance(): return model.eval() -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="Path for storing generated artifacts by this example. Default ./dino_v2", - default="./dino_v2", - type=str, - ) - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. 
--dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -170,3 +146,37 @@ def get_instance(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="Path for storing generated artifacts by this example. Default ./dino_v2", + default="./dino_v2", + type=str, + ) + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/esrgan.py b/examples/qualcomm/oss_scripts/esrgan.py index 50dc59cf0c..e4e609e152 100644 --- a/examples/qualcomm/oss_scripts/esrgan.py +++ b/examples/qualcomm/oss_scripts/esrgan.py @@ -40,50 +40,7 @@ def get_instance(repo: str): return model.model.eval() -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. Default ./esrgan", - default="./esrgan", - type=str, - ) - - parser.add_argument( - "-r", - "--hr_ref_dir", - help="Path to the high resolution images", - default="", - type=str, - ) - - parser.add_argument( - "-l", - "--lr_dir", - help="Path to the low resolution image inputs", - default="", - type=str, - ) - - parser.add_argument( - "-d", - "--default_dataset", - help="If specified, download and use B100 dataset by torchSR API", - action="store_true", - default=False, - ) - - parser.add_argument( - "--oss_repo", - help="Path to cloned https://github.com/ai-forever/Real-ESRGAN", - type=str, - required=True, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -174,3 +131,56 @@ def post_process(): else: print(f"Average of PSNR is: {avg_PSNR}") print(f"Average of SSIM is: {avg_SSIM}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. 
Default ./esrgan", + default="./esrgan", + type=str, + ) + + parser.add_argument( + "-r", + "--hr_ref_dir", + help="Path to the high resolution images", + default="", + type=str, + ) + + parser.add_argument( + "-l", + "--lr_dir", + help="Path to the low resolution image inputs", + default="", + type=str, + ) + + parser.add_argument( + "-d", + "--default_dataset", + help="If specified, download and use B100 dataset by torchSR API", + action="store_true", + default=False, + ) + + parser.add_argument( + "--oss_repo", + help="Path to cloned https://github.com/ai-forever/Real-ESRGAN", + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/fbnet.py b/examples/qualcomm/oss_scripts/fbnet.py index d62c4a78b1..fe07ab83d2 100755 --- a/examples/qualcomm/oss_scripts/fbnet.py +++ b/examples/qualcomm/oss_scripts/fbnet.py @@ -23,30 +23,7 @@ ) -if __name__ == "__main__": - parser = setup_common_args_and_variables() - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. Default ./fbnet", - default="./fbnet", - type=str, - ) - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. --dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - args = parser.parse_args() - +def main(args): if not args.compile_only and args.device is None: raise RuntimeError( "device serial is required if not compile only. " @@ -126,3 +103,36 @@ def post_process(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./fbnet", + default="./fbnet", + type=str, + ) + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/gMLP_image_classification.py b/examples/qualcomm/oss_scripts/gMLP_image_classification.py index 3d98f55a7d..e9b9b91507 100644 --- a/examples/qualcomm/oss_scripts/gMLP_image_classification.py +++ b/examples/qualcomm/oss_scripts/gMLP_image_classification.py @@ -59,30 +59,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. Default ./gMLP_image_classification", - default="./gMLP_image_classification", - type=str, - ) - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. 
--dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - args = parser.parse_args() +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -161,3 +138,37 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./gMLP_image_classification", + default="./gMLP_image_classification", + type=str, + ) + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/squeezenet.py b/examples/qualcomm/oss_scripts/squeezenet.py index 53edb98b91..bc000c6938 100644 --- a/examples/qualcomm/oss_scripts/squeezenet.py +++ b/examples/qualcomm/oss_scripts/squeezenet.py @@ -57,32 +57,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. --dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. " - "Default ./squeezenet", - default="./squeezenet", - type=str, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -158,3 +133,38 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./squeezenet", + default="./squeezenet", + type=str, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/ssd300_vgg16.py b/examples/qualcomm/oss_scripts/ssd300_vgg16.py index cd4eb8764f..8fdb896e09 100644 --- a/examples/qualcomm/oss_scripts/ssd300_vgg16.py +++ b/examples/qualcomm/oss_scripts/ssd300_vgg16.py @@ -119,50 +119,7 @@ def SSD300VGG16(pretrained_weight_model): return model.eval() -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. 
Default ./ssd300_vgg16", - default="./ssd300_vgg16", - type=str, - ) - - parser.add_argument( - "-d", - "--download", - help="If specified, download VOCSegmentation dataset by torchvision API", - action="store_true", - default=False, - ) - - parser.add_argument( - "--oss_repo", - help=( - "Repository that contains model backbone and score calculation." - "e.g., --M ./a-PyTorch-Tutorial-to-Object-Detection" - "Please clone the repository from https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection" - ), - type=str, - required=True, - ) - - parser.add_argument( - "-p", - "--pretrained_weight", - help=( - "Location of model pretrained weight." - "e.g., -p ./checkpoint_ssd300.pth.tar" - "Pretrained model can be found in the link https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection, under the Training Section" - ), - type=str, - required=True, - ) - - args = parser.parse_args() - +def main(args): sys.path.insert(0, args.oss_repo) skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) @@ -279,3 +236,56 @@ def post_process(): pp.pprint(APs) adb.pull(output_path=args.artifact, callback=post_process) + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./ssd300_vgg16", + default="./ssd300_vgg16", + type=str, + ) + + parser.add_argument( + "-d", + "--download", + help="If specified, download VOCSegmentation dataset by torchvision API", + action="store_true", + default=False, + ) + + parser.add_argument( + "--oss_repo", + help=( + "Repository that contains model backbone and score calculation." + "e.g., --M ./a-PyTorch-Tutorial-to-Object-Detection" + "Please clone the repository from https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-p", + "--pretrained_weight", + help=( + "Location of model pretrained weight." + "e.g., -p ./checkpoint_ssd300.pth.tar" + "Pretrained model can be found in the link https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection, under the Training Section" + ), + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/deeplab_v3.py b/examples/qualcomm/scripts/deeplab_v3.py index ff1f53c180..d870380e35 100755 --- a/examples/qualcomm/scripts/deeplab_v3.py +++ b/examples/qualcomm/scripts/deeplab_v3.py @@ -61,27 +61,7 @@ def get_dataset(data_size, dataset_dir, download): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. Default ./deeplab_v3", - default="./deeplab_v3", - type=str, - ) - - parser.add_argument( - "-d", - "--download", - help="If specified, download VOCSegmentation dataset by torchvision API", - action="store_true", - default=False, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. 
@@ -196,3 +176,33 @@ def post_process(): print(f"MPA : {mpa}%") print(f"MIoU : {miou}%") print(f"CIoU : \n{json.dumps(cls_iou, indent=2)}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./deeplab_v3", + default="./deeplab_v3", + type=str, + ) + + parser.add_argument( + "-d", + "--download", + help="If specified, download VOCSegmentation dataset by torchvision API", + action="store_true", + default=False, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/edsr.py b/examples/qualcomm/scripts/edsr.py index 54cc8bff19..f602ecc1af 100755 --- a/examples/qualcomm/scripts/edsr.py +++ b/examples/qualcomm/scripts/edsr.py @@ -91,43 +91,7 @@ def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str return SrDataset(hr_dir, lr_dir) -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. Default ./edsr", - default="./edsr", - type=str, - ) - - parser.add_argument( - "-r", - "--hr_ref_dir", - help="Path to the high resolution images", - default="", - type=str, - ) - - parser.add_argument( - "-l", - "--lr_dir", - help="Path to the low resolution image inputs", - default="", - type=str, - ) - - parser.add_argument( - "-d", - "--default_dataset", - help="If specified, download and use B100 dataset by torchSR API", - action="store_true", - default=False, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -223,3 +187,49 @@ def post_process(): else: print(f"Average of PNSR is: {avg_PSNR}") print(f"Average of SSIM is: {avg_SSIM}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./edsr", + default="./edsr", + type=str, + ) + + parser.add_argument( + "-r", + "--hr_ref_dir", + help="Path to the high resolution images", + default="", + type=str, + ) + + parser.add_argument( + "-l", + "--lr_dir", + help="Path to the low resolution image inputs", + default="", + type=str, + ) + + parser.add_argument( + "-d", + "--default_dataset", + help="If specified, download and use B100 dataset by torchSR API", + action="store_true", + default=False, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/inception_v3.py b/examples/qualcomm/scripts/inception_v3.py index 94aa618c72..90eb8cf206 100755 --- a/examples/qualcomm/scripts/inception_v3.py +++ b/examples/qualcomm/scripts/inception_v3.py @@ -58,32 +58,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. 
--dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. " - "Default ./inception_v3", - default="./inception_v3", - type=str, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -159,3 +134,38 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./inception_v3", + default="./inception_v3", + type=str, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/inception_v4.py b/examples/qualcomm/scripts/inception_v4.py index e457fef0f7..84b20e6e20 100755 --- a/examples/qualcomm/scripts/inception_v4.py +++ b/examples/qualcomm/scripts/inception_v4.py @@ -57,32 +57,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. --dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. " - "Default ./inception_v4", - default="./inception_v4", - type=str, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -158,3 +133,38 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. 
" + "Default ./inception_v4", + default="./inception_v4", + type=str, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index 85aafe7cae..8972ca202f 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -220,42 +220,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): return model.eval(), dataloader_val, labels -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. Default ./mobilebert_fine_tune", - default="./mobilebert_fine_tune", - type=str, - ) - - parser.add_argument( - "-p", - "--pretrained_weight", - help="Location of pretrained weight", - default=None, - type=str, - ) - - parser.add_argument( - "-F", - "--use_fp16", - help="If specified, will run in fp16 precision and discard ptq setting", - action="store_true", - default=False, - ) - - parser.add_argument( - "-P", - "--ptq", - help="If specified, will do PTQ quantization. default is 8bits activation and 8bits weight. Support 8a8w, 16a16w and 16a4w.", - default="8a8w", - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -353,3 +318,48 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): print(f"\n[{target[0]}]") for k, v in target[1].items(): print(f"{k}: {v[0]}/{v[1]}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./mobilebert_fine_tune", + default="./mobilebert_fine_tune", + type=str, + ) + + parser.add_argument( + "-p", + "--pretrained_weight", + help="Location of pretrained weight", + default=None, + type=str, + ) + + parser.add_argument( + "-F", + "--use_fp16", + help="If specified, will run in fp16 precision and discard ptq setting", + action="store_true", + default=False, + ) + + parser.add_argument( + "-P", + "--ptq", + help="If specified, will do PTQ quantization. default is 8bits activation and 8bits weight. Support 8a8w, 16a16w and 16a4w.", + default="8a8w", + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/mobilenet_v2.py b/examples/qualcomm/scripts/mobilenet_v2.py index f642e0172c..3ebdcd5d05 100755 --- a/examples/qualcomm/scripts/mobilenet_v2.py +++ b/examples/qualcomm/scripts/mobilenet_v2.py @@ -58,32 +58,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. --dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. 
" - "Default ./mobilenet_v2", - default="./mobilenet_v2", - type=str, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -159,3 +134,38 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./mobilenet_v2", + default="./mobilenet_v2", + type=str, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/mobilenet_v3.py b/examples/qualcomm/scripts/mobilenet_v3.py index d15827160a..18fd7c849a 100644 --- a/examples/qualcomm/scripts/mobilenet_v3.py +++ b/examples/qualcomm/scripts/mobilenet_v3.py @@ -57,32 +57,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. --dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. " - "Default ./mobilenet_v3", - default="./mobilenet_v3", - type=str, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -157,3 +132,38 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./mobilenet_v3", + default="./mobilenet_v3", + type=str, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/torchvision_vit.py b/examples/qualcomm/scripts/torchvision_vit.py index cd5463c8a2..cfdbe5d075 100755 --- a/examples/qualcomm/scripts/torchvision_vit.py +++ b/examples/qualcomm/scripts/torchvision_vit.py @@ -56,29 +56,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. 
--dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. " "Default ./vit", - default="./vit", - type=str, - ) - - args = parser.parse_args() - +def main(args): # ensure the working directory exist. os.makedirs(args.artifact, exist_ok=True) @@ -140,3 +118,35 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " "Default ./vit", + default="./vit", + type=str, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/utils.py b/examples/qualcomm/scripts/utils.py index d803932179..1e4b1c6968 100755 --- a/examples/qualcomm/scripts/utils.py +++ b/examples/qualcomm/scripts/utils.py @@ -187,6 +187,7 @@ def build_executorch_binary( quantizer = QnnQuantizer() quantizer.add_custom_quant_annotations(custom_annotations) quantizer.set_per_channel_linear_quant(per_channel_linear) + quantizer.set_per_channel_conv_quant(True) if quant_dtype == QuantDtype.use_8a8w: pass # default setting @@ -214,7 +215,6 @@ def build_executorch_binary( for data in dataset: annotated_model(*data) quantized_model = convert_pt2e(annotated_model) - edge_prog = capture_program(quantized_model, inputs) else: edge_prog = capture_program(model, inputs) From f695f8e8de71ce028e0414f7ed8fde416a64b822 Mon Sep 17 00:00:00 2001 From: Matthias Cremon Date: Mon, 29 Jul 2024 15:05:34 -0700 Subject: [PATCH 10/75] Support qmatmul with different dims tensors (#4438) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4438 MobileBERT exposes an issue in our kernel, where tensors have compatible (for PyTorch) but different batch dimensions. This diff changes the meta kernel to support that (the kernel can already do it). Reviewed By: dulinriley Differential Revision: D60314979 fbshipit-source-id: a0cde9d328098992787c353611ece64223d6c739 --- backends/cadence/aot/ops_registrations.py | 46 ++++++++++++----------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index c877a7149d..adcf086873 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
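# Editorial shape sketch for the relaxed batch handling in the hunk below,
# using hypothetical sizes: X of shape (6, 4, 5) and Y of shape (2, 3, 5, 7)
# have batch dims (6,) vs. (2, 3). Their products match (6 == 2 * 3), so the
# matmul is accepted and the output takes the longer batch shape: (2, 3, 4, 7).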
+from math import prod
 from typing import Optional, Tuple
 
 import torch
@@ -186,28 +187,29 @@ def quantized_matmul_meta(
     X_size = list(X.size())
     Y_size = list(Y.size())
 
-    assert len(X_size) == len(
-        Y_size
-    ), "quantized matmul not supported for tensors of different dimensions"
-
-    if len(X_size) == 3:
-        assert (
-            X_size[0] == Y_size[0]
-        ), "quantized matmul only supported for batch dimension of same size"
-        if transposed:
-            assert X_size[2] == Y_size[2], "matrices cannot be multiplied"
-            out_size = X_size[:2] + [Y_size[1]]
-        else:
-            assert X_size[2] == Y_size[1], "matrices cannot be multiplied"
-            out_size = X_size[:2] + [Y_size[2]]
-    elif len(X_size) == 2:
-        if transposed:
-            assert X_size[1] == Y_size[1], "matrices cannot be multiplied"
-            out_size = [X_size[0], Y_size[0]]
-        else:
-            assert X_size[1] == Y_size[0], "matrices cannot be multiplied"
-            out_size = [X_size[0], Y_size[1]]
+    # Get the batch dimensions for both tensors
+    X_batch_dims = X_size[:-2]
+    Y_batch_dims = Y_size[:-2]
+
+    # If they don't match, check that they're compatible
+    if X_batch_dims != Y_batch_dims:
+        assert prod(X_batch_dims) == prod(
+            Y_batch_dims
+        ), f"Batch dimensions of X and Y do not match: {X_batch_dims} vs {Y_batch_dims}"
+
+    # Get the matmul output size
+    if transposed:
+        assert X_size[-1] == Y_size[-1], "matrices cannot be multiplied"
+        mat_size = [X_size[-2], Y_size[-2]]
     else:
-        raise AssertionError("quantized matmul only supported for 2D or 3D tensors")
+        assert X_size[-1] == Y_size[-2], "matrices cannot be multiplied"
+        mat_size = [X_size[-2], Y_size[-1]]
+
+    # Combine the larger batch dimensions with the matmul output size
+    out_size = (
+        X_batch_dims + mat_size
+        if len(X_batch_dims) > len(Y_batch_dims)
+        else Y_batch_dims + mat_size
+    )
 
     return X.new_empty(out_size, dtype=X.dtype)
 

From e6684f7662fe067673e0a3c36066f1245127d95d Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Mon, 29 Jul 2024 16:26:30 -0700
Subject: [PATCH 11/75] Use linux.24xlarge for llava test (#4446)

Summary:
Attempt to fix the OOM error after https://github.com/pytorch/executorch/pull/4430

Pull Request resolved: https://github.com/pytorch/executorch/pull/4446

Reviewed By: shoumikhin

Differential Revision: D60418070

Pulled By: huydhn

fbshipit-source-id: 7e119ce52645bfd452064b674e5b8896df3642a0
---
 .github/workflows/pull.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 36099ca651..bbbb976385 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -193,7 +193,7 @@ jobs:
     strategy:
       fail-fast: false
     with:
-      runner: linux.12xlarge
+      runner: linux.24xlarge
       docker-image: executorch-ubuntu-22.04-clang12
       submodules: 'true'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}

From 711ecec40feafea8b74383d46dae342b39936e65 Mon Sep 17 00:00:00 2001
From: Yidi Wu
Date: Mon, 29 Jul 2024 17:05:44 -0700
Subject: [PATCH 12/75] fix zero arg export in training_ir and constant tensor handling (#4382)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4382

Prepare for the re-land of D60006710.

Previously, some buffers were not correctly identified; D60006710 fixes that but causes test failures, so this PR patches the affected test to avoid them.
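A minimal sketch of the behavior being tested (the module and attribute names
here are hypothetical, not taken from this diff):

    import torch
    from torch.export import export
    from torch._export.utils import is_lifted_tensor_constant

    class AddConst(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # A plain tensor attribute (neither a Parameter nor a registered
            # buffer) is lifted as a constant placeholder at export time.
            self.const = torch.ones(2, 2)

        def forward(self, x):
            return x + self.const

    ep = export(AddConst(), (torch.randn(2, 2),))
    lifted = [
        n.name
        for n in ep.graph.nodes
        if n.op == "placeholder" and is_lifted_tensor_constant(ep, n)
    ]
    print(lifted)  # one lifted constant placeholder for self.const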
Reviewed By: BoyuanFeng Differential Revision: D60137883 fbshipit-source-id: d919e25525347a86afc6a895d2b0eb94d161b5ad --- exir/backend/test/test_partitioner.py | 28 ++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/exir/backend/test/test_partitioner.py b/exir/backend/test/test_partitioner.py index d492c291f3..3ee6202ae8 100644 --- a/exir/backend/test/test_partitioner.py +++ b/exir/backend/test/test_partitioner.py @@ -36,7 +36,7 @@ ) from executorch.extension.pytree import tree_flatten from torch._export import capture_pre_autograd_graph -from torch._export.utils import is_buffer, is_param +from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param from torch.export import export from torch.fx.passes.operator_support import any_chain @@ -235,7 +235,11 @@ def partition( self.assertEqual( len(owning_program.state_dict) + len(owning_program.constants), 3 ) - self.assertEqual(len(owning_program.graph_signature.buffers), 2) + self.assertEqual( + len(owning_program.graph_signature.buffers) + + len(owning_program.graph_signature.lifted_tensor_constants), + 2, + ) self.assertEqual(len(owning_program.graph_signature.parameters), 1) # Check Lowered Module Exported Program does not have any constant data @@ -290,6 +294,7 @@ def partition( if node.op == "placeholder" and ( is_param(edge_exported_program, node) or is_buffer(edge_exported_program, node) + or is_lifted_tensor_constant(edge_exported_program, node) ): delegation_tag = "tag0" node.meta["delegation_tag"] = delegation_tag @@ -324,7 +329,11 @@ def partition( ) delegated_ep = lower_module.original_module self.assertEqual(len(delegated_ep.state_dict) + len(delegated_ep.constants), 3) - self.assertEqual(len(delegated_ep.graph_signature.buffers), 2) + self.assertEqual( + len(delegated_ep.graph_signature.buffers) + + len(delegated_ep.graph_signature.lifted_tensor_constants), + 2, + ) self.assertEqual(len(delegated_ep.graph_signature.parameters), 1) # check exported program is still runnable @@ -380,7 +389,11 @@ def partition( self.assertEqual( len(owning_program.state_dict) + len(owning_program.constants), 2 ) - self.assertEqual(len(owning_program.graph_signature.buffers), 2) + self.assertEqual( + len(owning_program.graph_signature.buffers) + + len(owning_program.graph_signature.lifted_tensor_constants), + 2, + ) self.assertEqual(len(owning_program.graph_signature.parameters), 0) # Check Lowered Module Exported Program does not own any buffers @@ -503,6 +516,7 @@ def partition( if node.op == "placeholder" and ( is_param(edge_exported_program, node) or is_buffer(edge_exported_program, node) + or is_lifted_tensor_constant(edge_exported_program, node) ): delegation_tag = "tag0" node.meta["delegation_tag"] = delegation_tag @@ -519,9 +533,9 @@ def partition( with self.assertRaises(RuntimeError) as error: _ = edge.to_backend(PartitionerTagData()) - self.assertEqual( - "constant data node (b_const) is tagged with (tag0) but has user (aten_sub_tensor) which has tag (None)", - str(error.exception), + self.assertTrue( + "is tagged with (tag0) but has user (aten_sub_tensor) which has tag (None)" + in str(error.exception), ) def test_not_delegate_mutable_buffers(self) -> None: From 7f6a3416c4e183a57f023969004afec74a4ea480 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Mon, 29 Jul 2024 18:15:47 -0700 Subject: [PATCH 13/75] Remove redundant generate_*_compile_spec funcs (#3869) Summary: tidying up a redundant wrapper function. 
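Call sites now build the spec directly. A representative sketch, with the
target configuration copied from the aot_arm_compiler diff below:

    from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder

    compile_spec = (
        ArmCompileSpecBuilder()
        .ethosu_compile_spec("ethos-u55-128")
        .set_permute_memory_format(True)
        .set_quantize_io(True)
        .build()
    )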
Pull Request resolved: https://github.com/pytorch/executorch/pull/3869

Reviewed By: mergennachin

Differential Revision: D58301772

Pulled By: digantdesai

fbshipit-source-id: cd906e2aa307408e7e2f877ace6544e91cd972fc
---
 backends/arm/arm_backend.py      | 37 --------------------------------
 examples/arm/aot_arm_compiler.py | 14 ++++++------
 2 files changed, 8 insertions(+), 43 deletions(-)

diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index 8ef5a79d3f..f187191fee 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -166,43 +166,6 @@ def get_intermediate_path(compile_spec: List[CompileSpec]) -> str:
     return None
 
 
-def generate_ethosu_compile_spec(
-    config: str,
-    permute_memory_to_nhwc: Optional[bool] = None,
-    quantize_io: Optional[bool] = None,
-    system_config: Optional[str] = None,
-    memory_mode: Optional[str] = None,
-    extra_flags: Optional[str] = None,
-    config_ini: Optional[str] = "Arm/vela.ini",
-) -> List[CompileSpec]:
-    return (
-        ArmCompileSpecBuilder()
-        .ethosu_compile_spec(
-            config,
-            system_config=system_config,
-            memory_mode=memory_mode,
-            extra_flags=extra_flags,
-            config_ini=config_ini,
-        )
-        .set_permute_memory_format(permute_memory_to_nhwc)
-        .set_quantize_io(quantize_io)
-        .build()
-    )
-
-
-def generate_tosa_compile_spec(
-    permute_memory_to_nhwc: Optional[bool] = None,
-    output_path: Optional[str] = None,
-) -> List[CompileSpec]:
-    return (
-        ArmCompileSpecBuilder()
-        .tosa_compile_spec()
-        .set_permute_memory_format(permute_memory_to_nhwc)
-        .dump_intermediate_artifacts_to(output_path)
-        .build()
-    )
-
-
 @final
 class ArmBackend(BackendDetails):
     @staticmethod
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 916a766f7c..f854a081fa 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -11,7 +11,8 @@ import logging
 
 import torch
-from executorch.backends.arm.arm_backend import generate_ethosu_compile_spec
+
+from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder
 from executorch.backends.arm.arm_partitioner import ArmPartitioner
 from executorch.backends.arm.quantizer.arm_quantizer import (
     ArmQuantizer,
@@ -212,12 +213,13 @@ def forward(self, x):
     if args.delegate is True:
         edge = edge.to_backend(
             ArmPartitioner(
-                generate_ethosu_compile_spec(
-                    "ethos-u55-128",
-                    permute_memory_to_nhwc=args.model_name
-                    in MODEL_NAME_TO_MODEL.keys(),
-                    quantize_io=True,
+                ArmCompileSpecBuilder()
+                .ethosu_compile_spec("ethos-u55-128")
+                .set_permute_memory_format(
+                    args.model_name in MODEL_NAME_TO_MODEL.keys()
                 )
+                .set_quantize_io(True)
+                .build()
             )
         )
     logging.debug(f"Lowered graph:\n{edge.exported_program().graph}")

From da24d185579911df48f031468a3842fee56f9cb2 Mon Sep 17 00:00:00 2001
From: Erik Lundell
Date: Mon, 29 Jul 2024 18:22:09 -0700
Subject: [PATCH 14/75] Add slice op to Arm backend (#4072)

Summary:
Implements the node visitor and tests.

Also implements an io_config in ArmQuantizer as a fallback: the io_config QuantizationConfig is applied to placeholders and outputs that are still missing annotation after all other annotation has been applied. The intended use is unit testing quantization of operations that have no quantization annotators.
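A minimal sketch of the fallback as used by the unit tests in this diff:

    from executorch.backends.arm.quantizer.arm_quantizer import (
        ArmQuantizer,
        get_symmetric_quantization_config,
    )

    # Quantize graph inputs and outputs that no operator annotator touched.
    quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())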
Change-Id: Iae7dc3f1dc2afe23776566f0e9904271cde0892a Pull Request resolved: https://github.com/pytorch/executorch/pull/4072 Reviewed By: manuelcandales Differential Revision: D59259968 Pulled By: digantdesai fbshipit-source-id: 253c4e9e6fd47bfe1fb18847edc33efa2a94f5d4 --- backends/arm/arm_partitioner.py | 1 + backends/arm/operators/__init__.py | 1 + backends/arm/operators/op_slice.py | 55 +++++++++ backends/arm/quantizer/arm_quantizer.py | 38 ++++++ backends/arm/quantizer/arm_quantizer_utils.py | 2 +- backends/arm/test/ops/test_clone.py | 22 ++-- backends/arm/test/ops/test_slice.py | 116 ++++++++++++++++++ backends/arm/test/ops/test_view.py | 22 ++-- 8 files changed, 234 insertions(+), 23 deletions(-) create mode 100644 backends/arm/operators/op_slice.py create mode 100644 backends/arm/test/ops/test_slice.py diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index 54cfafcc9b..56dac5d248 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -47,6 +47,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.avg_pool2d.default, exir_ops.edge.aten.sigmoid.default, exir_ops.edge.aten._softmax.default, + exir_ops.edge.aten.slice_copy.Tensor, exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.clone.default, diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index 79c507816d..e868b584cf 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -19,6 +19,7 @@ op_permute, op_quant, op_sigmoid, + op_slice, op_softmax, op_sub, op_view, diff --git a/backends/arm/operators/op_slice.py b/backends/arm/operators/op_slice.py new file mode 100644 index 0000000000..8d59835ff0 --- /dev/null +++ b/backends/arm/operators/op_slice.py @@ -0,0 +1,55 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class SliceVisitor(NodeVisitor): + target = "aten.slice_copy.Tensor" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + # aten.slice_copy supports slicing in 1d at a time. + # The arguments are dimension of slicing, start index and end index. + assert len(inputs) == 4 + input_node, dim, start, end = inputs + + # Translate and check parameters in Pytorch dim order. + shape = input_node.shape + dim = dim.number + end = (shape[dim] + end.number) % shape[dim] + size = end - start.number + assert size > 0 + assert size <= shape[dim] + + # Convert aten args to Tosa's start and size attributes and in TOSA dim order. 
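+        # Worked example with hypothetical shapes: x[1:3] on dim 0 of a
+        # contiguous (10, 10) tensor gives start=1, end=(10 + 3) % 10 = 3,
+        # size=2, so the TOSA SLICE gets start_attr=[1, 0], size_attr=[2, 10].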
+ attr = ts.TosaSerializerAttribute() + start_attr = [start.number if i == dim else 0 for i in input_node.dim_order] + size_attr = [size if i == dim else shape[i] for i in input_node.dim_order] + attr.SliceAttribute(start_attr, size_attr) + + tosa_graph.addOperator( + TosaOp.Op().SLICE, [input_node.name], [output.name], attr + ) diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index 3e1aceefe1..397ba68565 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -17,8 +17,11 @@ import torch import torch.nn.functional as F + +from executorch.backends.arm.quantizer import arm_quantizer_utils from executorch.backends.arm.quantizer.arm_quantizer_utils import ( convert_scalars_to_attrs, + mark_nodes_as_annotated, propagate_annotation, ) from executorch.backends.arm.quantizer.quantization_annotation import ( @@ -41,6 +44,10 @@ ) from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor from torch.ao.quantization.quantizer import QuantizationSpec, Quantizer +from torch.ao.quantization.quantizer.utils import ( + _annotate_input_qspec_map, + _annotate_output_qspec, +) from torch.fx import GraphModule, Node __all__ = [ @@ -263,6 +270,7 @@ class ArmQuantizer(Quantizer): def __init__(self) -> None: super().__init__() self.global_config: Optional[QuantizationConfig] = None + self.io_config: Optional[QuantizationConfig] = None self.module_type_config: Dict[Callable, Optional[QuantizationConfig]] = {} self.module_name_config: Dict[str, Optional[QuantizationConfig]] = {} @@ -294,6 +302,11 @@ def set_module_name( self.module_name_config[module_name] = quantization_config return self + def set_io(self, quantization_config): + """Set quantization_config for input and output nodes.""" + self.io_config = quantization_config + return self + def transform_for_annotation(self, model: GraphModule) -> GraphModule: """An initial pass for transforming the graph to prepare it for annotation. Currently transforms scalar values to tensor attributes. 
@@ -358,8 +371,33 @@ def _annotate_for_static_quantization_config( self.global_config, _get_not_module_type_or_name_filter(tp_list, module_name_list), ) + + if self.io_config: + self._annotate_io(model, self.io_config) + return model + def _annotate_io( + self, + model: GraphModule, + quantization_config: QuantizationConfig, + ): + for node in model.graph.nodes: + if arm_quantizer_utils.is_annotated(node): + continue + if node.op == "placeholder": + _annotate_output_qspec( + node, + quantization_config.get_output_act_qspec(), + ) + mark_nodes_as_annotated([node]) + if node.op == "output": + parent = node.all_input_nodes[0] + _annotate_input_qspec_map( + node, parent, quantization_config.get_input_act_qspec() + ) + mark_nodes_as_annotated([node]) + def validate(self, model: GraphModule) -> None: pass diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py index ee2844e668..89703f89b0 100644 --- a/backends/arm/quantizer/arm_quantizer_utils.py +++ b/backends/arm/quantizer/arm_quantizer_utils.py @@ -140,7 +140,7 @@ def is_share_obs_or_fq_op(op: Callable) -> bool: torch.ops.aten.adaptive_avg_pool2d.default, torch.ops.aten.view_copy.default, torch.ops.aten.view.default, - torch.ops.aten.slice_copy.Tensor, + torch.ops.aten.slice.Tensor, torch.ops.aten.flatten.using_ints, torch.ops.aten.dropout.default, ] diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index edfaafbcc2..2fc9b338cf 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -8,16 +8,20 @@ # Tests the clone op which copies the data of the input tensor (possibly with new data format) # -import logging import unittest from typing import Tuple import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester -from parameterized import parameterized -logger = logging.getLogger(__name__) +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized class TestSimpleClone(unittest.TestCase): @@ -53,13 +57,14 @@ def _test_clone_tosa_MI_pipeline( def _test_clone_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec(), ) - .quantize() + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.clone.default": 1}) .to_edge() @@ -72,13 +77,14 @@ def _test_clone_tosa_BI_pipeline( def _test_clone_tosa_u55_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, compile_spec=common.get_u55_compile_spec(), ) - .quantize() + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.clone.default": 1}) .to_edge() @@ -91,16 +97,10 @@ def _test_clone_tosa_u55_pipeline( def test_clone_tosa_MI(self, test_tensor: torch.Tensor): self._test_clone_tosa_MI_pipeline(self.Clone(), (test_tensor,)) - # Expected to fail since ArmQuantizer cannot quantize a Clone layer - # TODO MLETROCH-125 @parameterized.expand(Clone.test_parameters) - @unittest.expectedFailure def test_clone_tosa_BI(self, 
test_tensor: torch.Tensor): self._test_clone_tosa_BI_pipeline(self.Clone(), (test_tensor,)) - # Expected to fail since ArmQuantizer cannot quantize a Clone layer - # TODO MLETROCH-125 @parameterized.expand(Clone.test_parameters) - @unittest.expectedFailure def test_clone_u55_BI(self, test_tensor: torch.Tensor): self._test_clone_tosa_u55_pipeline(self.Clone(), (test_tensor,)) diff --git a/backends/arm/test/ops/test_slice.py b/backends/arm/test/ops/test_slice.py new file mode 100644 index 0000000000..a1c1e29cbc --- /dev/null +++ b/backends/arm/test/ops/test_slice.py @@ -0,0 +1,116 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from typing import Tuple + +import torch +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized + + +class TestSimpleSlice(unittest.TestCase): + + class Slice(torch.nn.Module): + + sizes = [(10), (10, 10), (10, 10, 10), ((1, 12, 10, 10))] + test_tensors = [(torch.ones(n),) for n in sizes] + + def forward(self, x: torch.Tensor): + if x.dim() == 1: + return x[3:-3] + elif x.dim() == 2: + return x[1:3, 3:5] + elif x.dim() == 3: + return x[0:7, 0:1, 0:8] + elif x.dim() == 4: + return x[:, 2:5, 3:5, 4:5] + + def _test_slice_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: torch.Tensor + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.slice.Tensor"]) + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_slice_copy"]) + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_slice_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor], permute: bool + ): + + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec( + permute_memory_to_nhwc=permute + ), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check(["torch.ops.aten.slice.Tensor"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_slice_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check(["torch.ops.aten.slice.Tensor"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Slice.test_tensors) + def test_slice_tosa_MI(self, tensor): + self._test_slice_tosa_MI_pipeline(self.Slice(), (tensor,)) + + @parameterized.expand(Slice.test_tensors[:2]) + def test_slice_nchw_tosa_BI(self, test_tensor: torch.Tensor): 
+ self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,), False) + + @parameterized.expand(Slice.test_tensors[2:]) + def test_slice_nhwc_tosa_BI(self, test_tensor: torch.Tensor): + self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,), True) + + # Fails during Vela compilation when trying to use a Tuple as a Named tuple, + # Could be Vela Issue, wait until Regor. + @parameterized.expand(Slice.test_tensors) + @unittest.expectedFailure + def test_slice_u55_BI(self, test_tensor: torch.Tensor): + self._test_slice_u55_BI_pipeline(self.Slice(), (test_tensor,)) diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index 5dcd1fe73f..7eda0d9cc2 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -8,16 +8,20 @@ # Tests the view op which changes the size of a Tensor without changing the underlying data. # -import logging import unittest from typing import Tuple import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester -from parameterized import parameterized -logger = logging.getLogger(__name__) +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized class TestSimpleView(unittest.TestCase): @@ -50,13 +54,14 @@ def _test_view_tosa_MI_pipeline( def _test_view_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec(), ) - .quantize() + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.view.default": 1}) .to_edge() @@ -69,13 +74,14 @@ def _test_view_tosa_BI_pipeline( def _test_view_u55_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, compile_spec=common.get_u55_compile_spec(), ) - .quantize() + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.view.default": 1}) .to_edge() @@ -88,16 +94,10 @@ def _test_view_u55_BI_pipeline( def test_view_tosa_MI(self, test_tensor: torch.Tensor): self._test_view_tosa_MI_pipeline(self.View(), (test_tensor,)) - # Expected to fail since ArmQuantizer cannot quantize a View layer. - # TODO MLETROCH-125 @parameterized.expand(View.test_parameters) - @unittest.expectedFailure def test_view_tosa_BI(self, test_tensor: torch.Tensor): self._test_view_tosa_BI_pipeline(self.View(), (test_tensor,)) - # Expected to fail since ArmQuantizer cannot quantize a View layer. - # TODO MLETROCH-125 @parameterized.expand(View.test_parameters) - @unittest.expectedFailure def test_view_u55_BI(self, test_tensor: torch.Tensor): self._test_view_u55_BI_pipeline(self.View(), (test_tensor,)) From 318a178e365cca0900f3f8c06783b7934726749e Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Mon, 29 Jul 2024 18:34:14 -0700 Subject: [PATCH 15/75] Delete hooks.h (#4448) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4448 There are no more implementations of this, and users have switched to using the PAL. 
Reviewed By: tarun292 Differential Revision: D60408510 fbshipit-source-id: 97efbb64bed64e4d981cb33be355254e9a1eb47e --- runtime/platform/hooks.h | 25 ------------------------- runtime/platform/profiler.cpp | 1 - runtime/platform/targets.bzl | 1 - 3 files changed, 27 deletions(-) delete mode 100644 runtime/platform/hooks.h diff --git a/runtime/platform/hooks.h b/runtime/platform/hooks.h deleted file mode 100644 index 28518ff788..0000000000 --- a/runtime/platform/hooks.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace torch { -namespace executor { - -// The stubs defined in this file are expected to be implemented/provided on -// a per platform basis. e.g. we'll have one for Linux running on x86 and -// another one maybe for a system running a RTOS on an ARM SoC. - -// This is expected to return a 64 bit value that contains the most -// granular time representation available on the system. It could be -// ticks, cycle count or time in microseconds etc. -// TODO(T157580075): delete this file and merge functionality into Platform.hå -uint64_t get_curr_time(void); - -} // namespace executor -} // namespace torch diff --git a/runtime/platform/profiler.cpp b/runtime/platform/profiler.cpp index 92a00bc1c4..ac2e8e187b 100644 --- a/runtime/platform/profiler.cpp +++ b/runtime/platform/profiler.cpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/runtime/platform/targets.bzl b/runtime/platform/targets.bzl index 2dfad34a59..42bb851e2c 100644 --- a/runtime/platform/targets.bzl +++ b/runtime/platform/targets.bzl @@ -99,7 +99,6 @@ def define_common_targets(): "platform.h", "system.h", "types.h", - "hooks.h", ], exported_deps = [ ":compiler", From db1c4d838b021bce225ad382926c1942c3eae425 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 29 Jul 2024 18:45:01 -0700 Subject: [PATCH 16/75] Add an option to turn on/off sdpa_with_kv_cache (#4444) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4444 As titled, need to test both options in export_llava.py. Reviewed By: tarun292, iseeyuan Differential Revision: D60406655 fbshipit-source-id: a423c65c6d134515e7399a8ef14ea54b76b34154 --- .github/workflows/pull.yml | 14 +++++++++++--- examples/models/llava/export_llava.py | 27 +++++++++++++++++++++++++-- examples/models/llava/model.py | 21 ++++++++++++++------- 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index bbbb976385..591a0328b7 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -210,11 +210,19 @@ jobs: bash examples/models/llava/install_requirements.sh # run export_llava.sh - python examples/models/llava/export_llava.py + python examples/models/llava/export_llava.py --use-sdpa-with-kv-cache --pte-name llava_custom_sdpa.pte # verify file exists - if [ ! -f "llava_combined_xnnpack.pte" ]; then - echo "llava_combined_xnnpack.pte not found!" + if [ ! -f "llava_custom_sdpa.pte" ]; then + echo "llava_custom_sdpa.pte not found!" + exit 1 + fi + + python examples/models/llava/export_llava.py --no-use-sdpa-with-kv-cache --pte-name llava.pte + + # verify file exists + if [ ! -f "llava.pte" ]; then + echo "llava.pte not found!" 
exit 1 fi diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 8c19cb977e..f57823a90a 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -4,6 +4,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import logging +from argparse import ArgumentParser, BooleanOptionalAction + import torch from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( XnnpackDynamicallyQuantizedPartitioner, @@ -30,6 +33,9 @@ from torch.export import Dim from torch.nn.attention import SDPBackend +FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" +logging.basicConfig(level=logging.INFO, format=FORMAT) + class LlavaEdgeManager(LLMEdgeManager): def capture_pre_autograd_graph(self) -> "LlavaEdgeManager": @@ -155,7 +161,23 @@ def export_token_embedding(llava, prompt): def main(): - llava_model = LlavaModel() + parser = ArgumentParser() + parser.add_argument( + "--use-sdpa-with-kv-cache", + default=True, + action=BooleanOptionalAction, + help="Use sdpa_with_kv_cache custom op in LLava text model.", + ) + parser.add_argument( + "--pte-name", + default="llava_combined_xnnpack.pte", + help="Name of the exported ExecuTorch program.", + ) + args = parser.parse_args() + logging.info( + f"Exporting Llava model to ExecuTorch with sdpa_with_kv_cache: {args.use_sdpa_with_kv_cache}" + ) + llava_model = LlavaModel(use_sdpa_with_kv_cache_op=args.use_sdpa_with_kv_cache) llava = llava_model.get_eager_model() prompt_before_image, resized, prompt_after_image = ( @@ -193,8 +215,9 @@ def main(): } ).to_executorch() - with open("llava_combined_xnnpack.pte", "wb") as f: + with open(args.pte_name, "wb") as f: executorch_program.write_to_file(f) + logging.info(f"Exported ExecuTorch program to {args.pte_name}") if __name__ == "__main__": diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py index 31270b9042..35831192b4 100644 --- a/examples/models/llava/model.py +++ b/examples/models/llava/model.py @@ -62,8 +62,10 @@ def __init__( llava_model: LlavaMetaForCausalLM, image_processor: CLIPVisionTower, config: PreprocessConfig, + use_sdpa_with_kv_cache_op: bool = True, ): super().__init__() + self.use_sdpa_with_kv_cache_op = use_sdpa_with_kv_cache_op self.config = config self.model_ = llava_model self.text_model_args = ModelArgs( @@ -73,7 +75,7 @@ def __init__( max_batch_size=1, # doesn't work with default batch size 32 ffn_dim_multiplier=1, # TODO: a hack to make rotary embedding happy enable_dynamic_shape=True, # allow parallel prefill - use_sdpa_with_kv_cache_op=True, # use sdpa_with_kv_cache op + use_sdpa_with_kv_cache_op=use_sdpa_with_kv_cache_op, # use sdpa_with_kv_cache op use_hf_rope=True, ) self.embed_tokens = nn.Embedding( @@ -83,7 +85,8 @@ def __init__( ) self.text_model = Transformer(self.text_model_args) # use custom op for SDPA. 
- self.text_model = replace_sdpa_with_custom_op(self.text_model) + if use_sdpa_with_kv_cache_op: + self.text_model = replace_sdpa_with_custom_op(self.text_model) # load state dict self.text_model.load_state_dict( state_dict=self._translate_state_dict_for_text_model(), @@ -273,7 +276,8 @@ def get_conv_mode(model_name: str) -> str: class LlavaModel(EagerModelBase): - def __init__(self): + def __init__(self, use_sdpa_with_kv_cache_op=True): + self.use_sdpa_with_kv_cache_op = use_sdpa_with_kv_cache_op self.model_path = "liuhaotian/llava-v1.5-7b" self.tokenizer, self.model, self.image_processor, context_len = ( load_pretrained_model( @@ -316,7 +320,12 @@ def __init__(self): self.resized_image = None def get_eager_model(self): - model = Llava(self.model, self.image_processor, self.config) + model = Llava( + self.model, + self.image_processor, + self.config, + self.use_sdpa_with_kv_cache_op, + ) model.to(dtype=torch.float32) return model @@ -368,8 +377,6 @@ def _get_image_dynamic_shapes(self): return dynamic_shapes def _get_prompt_dynamic_shapes(self): - dim = torch.export.Dim( - "token_dim", min=1, max=self.model.config.max_position_embeddings - 1 - ) + dim = torch.export.Dim("token_dim", min=2, max=2048) text_model_dynamic_shapes = ({0: 1}, {1: dim}) return text_model_dynamic_shapes From 1e143339d463262694244c56f2fdb698ec28e3c5 Mon Sep 17 00:00:00 2001 From: Matthias Cremon Date: Mon, 29 Jul 2024 19:44:46 -0700 Subject: [PATCH 17/75] Add exportable baby llama example (#4345) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4345 Add a small LLaMa model, based on the babyllama paper. Note that this test case is only one layer by default, and the number of layers can be adjusted in the test. Removed some pyre changes that broke the OSS AoT export, and added some required passes and operators. 
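Scaling the example up only requires changing the model args. A sketch reusing
the helpers from the new script (the layer count here is an arbitrary
illustration, not a tested configuration):

    from executorch.backends.cadence.aot.ops_registrations import *  # noqa

    import torch

    from executorch.backends.cadence.aot.export_example import export_model
    from executorch.examples.models.llama2.llama_transformer import (
        ModelArgs,
        Transformer,
    )

    args = ModelArgs(
        dim=512, vocab_size=512, hidden_dim=1024, n_heads=8, n_layers=4
    )
    model = Transformer(args)
    example_inputs = (torch.randint(0, 10, [1, 64], dtype=torch.int64),)
    export_model(model, example_inputs)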
Reviewed By: dulinriley Differential Revision: D60073137 fbshipit-source-id: 8379296ad0aa4099b09d033b33479165d7c7c5c9 --- backends/cadence/aot/TARGETS | 4 +- backends/cadence/aot/compiler.py | 20 ++-- backends/cadence/aot/functions.yaml | 20 ++++ backends/cadence/aot/passes.py | 103 +++++++++++++++++- backends/cadence/aot/quantizer/TARGETS | 1 - backends/cadence/aot/quantizer/quantizer.py | 25 ++--- .../reference/operators/CMakeLists.txt | 6 +- .../operators/quantized_matmul_out.cpp | 42 +++---- examples/cadence/models/babyllama.py | 42 +++++++ 9 files changed, 212 insertions(+), 51 deletions(-) create mode 100644 examples/cadence/models/babyllama.py diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index bd4ec660a6..79646c1293 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -28,13 +28,13 @@ python_library( "compiler.py", ], deps = [ - "fbsource//third-party/pypi/pyre-extensions:pyre-extensions", ":passes", ":utils", "//caffe2:torch", "//executorch/backends/cadence/aot/quantizer:fusion_pass", "//executorch/backends/cadence/aot/quantizer:quantizer", "//executorch/backends/transforms:decompose_sdpa", + "//executorch/backends/transforms:remove_clone_ops", "//executorch/exir:lib", ], ) @@ -49,5 +49,7 @@ python_library( "//caffe2:torch", "//executorch/exir:pass_base", "//executorch/exir/dialects:lib", + "//executorch/exir/passes:lib", + "//executorch/exir/passes:spec_prop_pass", ], ) diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index 302252c42a..39511ae917 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -11,23 +11,23 @@ import torch from executorch.backends.cadence.aot.passes import ( + InitializePipeline, + RemoveNopExpandOpPass, RemoveZeroSizedCatArgsPass, + ReplaceLogicalNotBooleanWhereWithWherePass, ReplacePT2DequantWithCadenceDequantPass, ReplacePT2QuantWithCadenceQuantPass, ReplaceScalarTensorWithFullPass, ReplaceSqueezeAndUnsqueezeWithViewPass, ) from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion -from executorch.backends.cadence.aot.quantizer.quantizer import ( - CadenceAtenQuantizer, - CadenceQuantizer, -) +from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer from executorch.backends.cadence.aot.utils import model_is_quantized from executorch.backends.transforms.decompose_sdpa import ( DecomposeScaledDotProductAttention, ) +from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge -from pyre_extensions import assert_is_instance from torch._export import capture_pre_autograd_graph from torch.ao.quantization.pt2e.export_utils import model_is_exported from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -63,10 +63,8 @@ def quantize_pt2( converted_model = convert_pt2e(prepared_model) # Get patterns and apply fusion of dq -> op -> q to qop - patterns = [ - assert_is_instance(q, CadenceAtenQuantizer).pattern - for q in quantizer.quantizers - ] + # pyre-ignore[16]: no attribute + patterns = [q.pattern for q in quantizer.quantizers] QuantFusion(patterns)(converted_model) return converted_model @@ -148,8 +146,12 @@ def export_to_cadence( # Run a couple required passes for quant/dequant ops cadence_program_manager = edge_program_manager.transform( [ + InitializePipeline(), RemoveZeroSizedCatArgsPass(), + ReplaceLogicalNotBooleanWhereWithWherePass(), ReplaceScalarTensorWithFullPass(), + 
RemoveCloneOpsTransform(), + RemoveNopExpandOpPass(), ReplaceSqueezeAndUnsqueezeWithViewPass(), ReplacePT2QuantWithCadenceQuantPass(), ReplacePT2DequantWithCadenceDequantPass(), diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index f79d5f870d..dbfe1e3639 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -62,16 +62,31 @@ - arg_meta: null kernel_name: torch::executor::full_out +- op: mean.out + kernels: + - arg_meta: null + kernel_name: torch::executor::mean_dim_out + - op: mul.out kernels: - arg_meta: null kernel_name: torch::executor::mul_out +- op: mul.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::mul_scalar_out + - op: permute_copy.out kernels: - arg_meta: null kernel_name: torch::executor::permute_copy_out +- op: rsqrt.out + kernels: + - arg_meta: null + kernel_name: torch::executor::rsqrt_out + - op: sigmoid.out kernels: - arg_meta: null @@ -134,3 +149,8 @@ kernels: - arg_meta: null kernel_name: impl::reference::quantized_relu_out + +func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_matmul_out diff --git a/backends/cadence/aot/passes.py b/backends/cadence/aot/passes.py index ca8a44f00c..db419bfb5e 100644 --- a/backends/cadence/aot/passes.py +++ b/backends/cadence/aot/passes.py @@ -4,18 +4,19 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import Any, Dict, Tuple +# pyre-strict + +from typing import Any, cast, Dict, Sequence, Tuple import torch from executorch.backends.cadence.aot.utils import get_edge_overload_packet from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue +from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue +from executorch.exir.passes import dead_code_elimination_pass +from executorch.exir.passes.spec_prop_pass import SpecPropPass from torch._subclasses import FakeTensor from torch.utils._pytree import tree_map_only - -# pyre-strict - # Similar to what's done in executorch/exir/pass_base.py Argument = Any # pyre-ignore @@ -173,3 +174,95 @@ def call_operator( init_args[0] = new_args args = tuple(args) return super().call_operator(op, args, kwargs, meta) + + +class RemoveNopExpandOpPass(ExportPass): + """ + For an expand op, if the operator shape matches the expand shape, then the + expand is a nop. + """ + + def call_operator( + self, + op, # pyre-ignore + args: tuple[Argument, ...], + kwargs: dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if get_edge_overload_packet(op) not in { + exir_ops.edge.aten.expand_copy, + exir_ops.edge.aten.expand, + }: + return super().call_operator(op, args, kwargs, meta) + + # Parse the args, and check for nop condition + arg0 = cast(ProxyValue, args[0]) + arg1 = cast(Sequence[int], args[1]) + in_tensor = arg0.to_tensor() + if list(in_tensor.shape) == list(arg1): + return arg0 + + return super().call_operator(op, args, kwargs, meta) + + +class ReplaceLogicalNotBooleanWhereWithWherePass(ExportPass): + """ + A where op with a logical_not and a boolean tensor can be replaced + by a where op with flipped inputs and the initial boolean tensor. 
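+    E.g. where(logical_not(mask), x, y) becomes where(mask, y, x).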
+ """ + + def replace_logical_nop_where_with_where( + self, graph_module: torch.fx.GraphModule + ) -> None: + graph = graph_module.graph + for node in graph.nodes: + # We are only interested in where nodes + if node.target != exir_ops.edge.aten.where.self: + continue + + # If the third arg is not a logical_not, bail. + if node.args[0].target != exir_ops.edge.aten.logical_not.default: + continue + + # Get the third arg node and its input + logical_not_node = node.args[0] + logical_not_input_tensor = ( + logical_not_node.args[0].to_tensor() + if isinstance(logical_not_node.args[0], ProxyValue) + else logical_not_node.args[0] + ) + + # If the logical_not input is not a boolean tensor, bail. + if logical_not_input_tensor.meta["spec"].dtype != torch.bool: + continue + + # Replace the where op with another one, flipping the inputs and using the boolean + # tensor from logical_not. + with graph.inserting_before(node): + linear_node = graph.call_function( + exir_ops.edge.aten.where.self, + args=(logical_not_node.args[0], node.args[2], node.args[1]), + ) + # Replace all the uses + node.replace_all_uses_with(linear_node) + + graph_module.recompile() + graph_module.graph.eliminate_dead_code() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + self.replace_logical_nop_where_with_where(graph_module) + result = super().call(graph_module) + return result + + +class InitializePipeline(ExportPass): + """ + Initialize the Jarvis pipeline. This should invariably be the first pass to + run. + """ + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + dead_code_elimination_pass(graph_module) + result = SpecPropPass()(graph_module) + assert result is not None + return result diff --git a/backends/cadence/aot/quantizer/TARGETS b/backends/cadence/aot/quantizer/TARGETS index 8b3449cd85..6290626216 100644 --- a/backends/cadence/aot/quantizer/TARGETS +++ b/backends/cadence/aot/quantizer/TARGETS @@ -31,7 +31,6 @@ python_library( ], typing = True, deps = [ - "fbsource//third-party/pypi/pyre-extensions:pyre-extensions", ":patterns", ":utils", "//caffe2:torch", diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py index 4cd3c6bfb4..51bace9168 100644 --- a/backends/cadence/aot/quantizer/quantizer.py +++ b/backends/cadence/aot/quantizer/quantizer.py @@ -26,7 +26,6 @@ is_annotated, no_outside_users, ) -from pyre_extensions import assert_is_instance from torch import fx @@ -100,14 +99,11 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: continue for output, *custom_spec in anchors.output: - assert_is_instance(output, fx.Node).meta["quantization_annotation"] = ( - QuantizationAnnotation( - # pyre-ignore[6]: incompatible parameter type - output_qspec=( - custom_spec[0] if custom_spec else output_act_qspec - ), - _annotated=True, - ) + # pyre-ignore[16]: no attribute + output.meta["quantization_annotation"] = QuantizationAnnotation( + # pyre-ignore[6]: incompatible parameter type + output_qspec=(custom_spec[0] if custom_spec else output_act_qspec), + _annotated=True, ) def annotate_inputs( @@ -118,16 +114,17 @@ def annotate_inputs( spec: Optional[QuantizationSpec], ) -> None: for node, idx, *custom_spec in inputs: - _node = assert_is_instance(node, fx.Node) - annotation = _node.meta.get( + # pyre-ignore[16]: no attribute + annotation = node.meta.get( "quantization_annotation", QuantizationAnnotation(_annotated=True), ) - # pyre-ignore[6]: incompatible parameter type - annotation.input_qspec_map[_node.args[idx]] = ( 
+ # pyre-ignore[16]: no attribute + annotation.input_qspec_map[node.args[idx]] = ( custom_spec[0] if custom_spec else spec ) - _node.meta["quantization_annotation"] = annotation + # pyre-ignore[16]: no attribute + node.meta["quantization_annotation"] = annotation annotate_inputs(anchors.inputs, input_act_qspec) annotate_inputs(anchors.weights, weight_qspec) diff --git a/backends/cadence/reference/operators/CMakeLists.txt b/backends/cadence/reference/operators/CMakeLists.txt index c22dc0c997..c81e934850 100644 --- a/backends/cadence/reference/operators/CMakeLists.txt +++ b/backends/cadence/reference/operators/CMakeLists.txt @@ -32,12 +32,15 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mean.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_rsqrt.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp" @@ -60,7 +63,8 @@ target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/.. add_library( custom_ops "quantized_linear_out.cpp" "quantized_conv_out.cpp" "quantized_relu_out.cpp" "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp") + "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" + "quantized_matmul_out.cpp") target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} ${_common_include_directories}) diff --git a/backends/cadence/reference/operators/quantized_matmul_out.cpp b/backends/cadence/reference/operators/quantized_matmul_out.cpp index 95df35caba..49dd222a96 100644 --- a/backends/cadence/reference/operators/quantized_matmul_out.cpp +++ b/backends/cadence/reference/operators/quantized_matmul_out.cpp @@ -13,6 +13,9 @@ namespace impl { namespace reference { namespace native { +using Tensor = exec_aten::Tensor; +using RuntimeContext = torch::executor::RuntimeContext; + // The quantized matmul. The quantized matmul accumulates in a wider register, // whose type is TA. 
template < @@ -50,27 +53,32 @@ __attribute__((noinline)) void qmatmul( } } -template +template void inline _typed_quantized_matmul( const Tensor& X, int64_t X_zero_point, const Tensor& Y, int64_t Y_zero_point, - const c10::optional& bias, + const exec_aten::optional& bias, int64_t out_multiplier, int64_t out_shift, int64_t out_zero_point, bool transposed, Tensor& out) { - ctype* __restrict__ out_data = out.mutable_data_ptr(); - const ctype* __restrict__ X_data = X.const_data_ptr(); - const ctype* __restrict__ Y_data = Y.const_data_ptr(); + size_t batch_size = getLeadingDims(X, X.dim() - 2); + size_t leading_dim = X.size(X.dim() - 2); + size_t out_dim = Y.size(Y.dim() - 1 - transposed); + size_t in_dim = X.size(X.dim() - 1); + + T* __restrict__ out_data = out.mutable_data_ptr(); + const T* __restrict__ X_data = X.const_data_ptr(); + const T* __restrict__ Y_data = Y.const_data_ptr(); for (size_t i = 0; i < batch_size; ++i) { - const ctype* x = X_data + i * leading_dim * in_dim; - const ctype* y = Y_data + i * in_dim * out_dim; - ctype* z = out_data + i * leading_dim * out_dim; + const T* x = X_data + i * leading_dim * in_dim; + const T* y = Y_data + i * in_dim * out_dim; + T* z = out_data + i * leading_dim * out_dim; if (transposed) { - qmatmul( + qmatmul( z, static_cast(out_multiplier), static_cast(out_shift), @@ -83,7 +91,7 @@ void inline _typed_quantized_matmul( in_dim, out_dim); } else { - qmatmul( + qmatmul( z, static_cast(out_multiplier), static_cast(out_shift), @@ -101,24 +109,18 @@ void inline _typed_quantized_matmul( } void quantized_matmul_out( + RuntimeContext& ctx, const Tensor& X, int64_t X_zero_point, const Tensor& Y, int64_t Y_zero_point, - const c10::optional& bias, + const exec_aten::optional& bias, int64_t out_multiplier, int64_t out_shift, int64_t out_zero_point, bool transposed, Tensor& out) { - (void)bias; - - size_t batch_size = getLeadingDims(X, X.dim() - 2); - size_t leading_dim = X.size(X.dim() - 2); - size_t out_dim = Y.size(Y.dim() - 1 - transposed); - size_t in_dim = X.size(X.dim() - 1); - - if (out.ScalarType() == at::ScalarType::Byte) { + if (out.scalar_type() == at::ScalarType::Byte) { _typed_quantized_matmul( X, X_zero_point, @@ -130,7 +132,7 @@ void quantized_matmul_out( out_zero_point, transposed, out); - } else if (out.ScalarType() == at::ScalarType::Char) { + } else if (out.scalar_type() == at::ScalarType::Char) { _typed_quantized_matmul( X, X_zero_point, diff --git a/examples/cadence/models/babyllama.py b/examples/cadence/models/babyllama.py new file mode 100644 index 0000000000..603eb5f3d9 --- /dev/null +++ b/examples/cadence/models/babyllama.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Example script for exporting simple models to flatbuffer + +import logging + +from executorch.backends.cadence.aot.ops_registrations import * # noqa + +import torch + +from executorch.backends.cadence.aot.export_example import export_model + +from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer + + +FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" +logging.basicConfig(level=logging.INFO, format=FORMAT) + + +def main() -> None: + args = ModelArgs( + dim=512, + vocab_size=512, + hidden_dim=1024, + n_heads=8, + # use_kv_cache=True, + n_layers=1, + ) + seq = 64 + b = 1 + model = Transformer(args) + example_inputs = (torch.randint(0, 10, [b, seq], dtype=torch.int64),) + + export_model(model, example_inputs) + + +if __name__ == "__main__": + main() From 3d5a1491a75509856884f01ecc13f3710267e99c Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Mon, 29 Jul 2024 20:43:34 -0700 Subject: [PATCH 18/75] Add FVP tests for linear op (#4393) Summary: The tests ran without modifications. Change-Id: I6bdae84c17b5da47935035b0a46696881c085c44 Pull Request resolved: https://github.com/pytorch/executorch/pull/4393 Reviewed By: cccclai Differential Revision: D60403878 Pulled By: digantdesai fbshipit-source-id: 638e25c960fb94a9bfc5e95379b46a75740c0285 --- backends/arm/test/ops/test_linear.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 0e6747fe27..61117ad7fa 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -155,7 +155,7 @@ def _test_linear_tosa_BI_pipeline( def _test_linear_tosa_u55_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -169,8 +169,12 @@ def _test_linear_tosa_u55_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + @parameterized.expand(test_data_suite_rank1 + test_data_suite_rank4) def test_linear_tosa_MI( self, From c659b9c23ef790f62db96e38d982433e9d9358e9 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Mon, 29 Jul 2024 20:49:00 -0700 Subject: [PATCH 19/75] Add flakyness mark to conv BI test (#4390) Summary: Change-Id: I391003f8480283872fdc0566e489bb9bb3926c6f Pull Request resolved: https://github.com/pytorch/executorch/pull/4390 Reviewed By: cccclai Differential Revision: D60403912 Pulled By: digantdesai fbshipit-source-id: 04043b2c3f03aac03be19b467e70138d082481e6 --- backends/arm/test/ops/test_depthwise_conv.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 7eacbac432..8389b423e5 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -9,6 +9,8 @@ from typing import Tuple +import pytest + import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.ops.test_conv import Conv2d @@ -189,7 +191,9 @@ def _test_dw_conv2d_u55_BI_pipeline( def test_dw_conv2d_tosa_MI(self, test_name, model): self._test_dw_conv2d_tosa_MI_pipeline(model, model.get_inputs()) + # TODO: Investigate flakyness (MLTORCH-307) @parameterized.expand(testsuite) + @pytest.mark.flaky(reruns=3) def test_dw_conv2d_tosa_BI(self, test_name, model): 
self._test_dw_conv2d_tosa_BI_pipeline(model, model.get_inputs()) From 38724d072dcc0285776212a44113390ddd73d3c6 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Mon, 29 Jul 2024 20:51:52 -0700 Subject: [PATCH 20/75] Add test debug features (#4144) Summary: - Functions to get unbuilt default CompileSpec to change it before passing to ArmTester - dump_operator_distribution to print a list with all operators in the graph and the number of times they appear. - dump_dtype_distribution to print a list with the dtype of all placeholders and the number of times they appear. - Cast data in tensor to float and correct shape if tensor dtype is FP32 when dumping Partition artifact. Signed-off-by: Erik Lundell Change-Id: I7196527d060ba182b8ada8e48535d4bb7681ab68 Change-Id: I4678d19a40d5ee6ccab68798fdec7090db0eb8f8 Pull Request resolved: https://github.com/pytorch/executorch/pull/4144 Reviewed By: cccclai Differential Revision: D59568414 Pulled By: digantdesai fbshipit-source-id: 1c5928b6c6d1969ad497e0dd29e2d28fcb441cde --- backends/arm/test/common.py | 29 +++++- backends/arm/test/misc/test_debug_feats.py | 25 +++++ backends/arm/test/runner_utils.py | 16 +++ backends/arm/test/tester/arm_tester.py | 108 +++++++++++++++++++-- 4 files changed, 166 insertions(+), 12 deletions(-) diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index 906164aac3..f85fd1f2da 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -89,17 +89,26 @@ def get_tosa_compile_spec(permute_memory_to_nhwc=True, custom_path=None): """ Default compile spec for TOSA tests. """ + return get_tosa_compile_spec_unbuilt(permute_memory_to_nhwc, custom_path).build() + + +def get_tosa_compile_spec_unbuilt( + permute_memory_to_nhwc=False, custom_path=None +) -> ArmCompileSpecBuilder: + """Get the ArmCompileSpecBuilder for the default TOSA tests, to modify + the compile spec before calling .build() to finalize it. + """ intermediate_path = custom_path or tempfile.mkdtemp(prefix="arm_tosa_") if not os.path.exists(intermediate_path): os.makedirs(intermediate_path, exist_ok=True) - compile_spec = ( + compile_spec_builder = ( ArmCompileSpecBuilder() .tosa_compile_spec() .set_permute_memory_format(permute_memory_to_nhwc) .dump_intermediate_artifacts_to(intermediate_path) - .build() ) - return compile_spec + + return compile_spec_builder def get_u55_compile_spec( @@ -108,7 +117,20 @@ def get_u55_compile_spec( """ Default compile spec for Ethos-U55 tests. """ + return get_u55_compile_spec_unbuilt( + permute_memory_to_nhwc, quantize_io=quantize_io, custom_path=custom_path + ).build() + + +def get_u55_compile_spec_unbuilt( + permute_memory_to_nhwc=False, quantize_io=False, custom_path=None +) -> ArmCompileSpecBuilder: + """Get the ArmCompileSpecBuilder for the default TOSA tests, to modify + the compile spec before calling .build() to finalize it. 
+ """ artifact_path = custom_path or tempfile.mkdtemp(prefix="arm_u55_") + if not os.path.exists(artifact_path): + os.makedirs(artifact_path, exist_ok=True) compile_spec = ( ArmCompileSpecBuilder() .ethosu_compile_spec( @@ -120,6 +142,5 @@ def get_u55_compile_spec( .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) .set_permute_memory_format(permute_memory_to_nhwc) .dump_intermediate_artifacts_to(artifact_path) - .build() ) return compile_spec diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index 9a0702c900..bf2a3aebd2 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -120,3 +120,28 @@ def test_numerical_diff_prints(self): pass # Implicit pass test else: self.fail() + + +class TestDumpOperatorsAndDtypes(unittest.TestCase): + def test_dump_ops_and_dtypes(self): + model = Linear(20, 30) + ( + ArmTester( + model, + example_inputs=model.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .dump_dtype_distribution() + .dump_operator_distribution() + .export() + .dump_dtype_distribution() + .dump_operator_distribution() + .to_edge() + .dump_dtype_distribution() + .dump_operator_distribution() + .partition() + .dump_dtype_distribution() + .dump_operator_distribution() + ) + # Just test that there are no execeptions. diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 19d76e13b4..58c99a9201 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -510,4 +510,20 @@ def dbg_tosa_fb_to_json(tosa_fb: bytes) -> Dict: with open(os.path.join(tmp, "output.json"), "r") as f: json_out = json.load(f) + # Cast float tensors to proper dtype. + try: + for region in json_out["regions"]: + for block in region["blocks"]: + for tensor in block["tensors"]: + if "data" in tensor: + if tensor["type"] == "FP32": + data = np.array(tensor["data"]) + data = data.astype(np.int8) + data = np.frombuffer(data, dtype=np.float32) + data = data.reshape(tensor["shape"]) + tensor["data"] = data + except Exception: + # This is just nice-to-have if it works, don't care if it fails. + pass + return json_out diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 97ab67b3d1..be5ea7dd71 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -4,7 +4,10 @@ # LICENSE file in the root directory of this source tree. 
import logging -from typing import Any, List, Literal, Optional, Tuple + +from collections import Counter +from pprint import pformat +from typing import Any, List, Literal, Optional, Tuple, Union import executorch.backends.xnnpack.test.tester.tester as tester @@ -31,6 +34,7 @@ from executorch.backends.xnnpack.test.tester import Tester from executorch.exir import EdgeCompileConfig from executorch.exir.backend.compile_spec_schema import CompileSpec +from torch.fx import Graph logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -39,7 +43,6 @@ class Partition(tester.Partition): def dump_artifact(self, path_to_dump: Optional[str]): super().dump_artifact(path_to_dump) - from pprint import pformat to_print = None for spec in self.graph_module.lowered_module_0.compile_specs: @@ -55,12 +58,7 @@ def dump_artifact(self, path_to_dump: Optional[str]): to_print = f"\n Vela command stream: \n{to_print}" break assert to_print is not None, "No TOSA nor Vela compile spec found" - - if path_to_dump: - with open(path_to_dump, "a") as fp: - fp.write(to_print) - else: - print(to_print) + _dump_str(to_print, path_to_dump) class Serialize(tester.Serialize): @@ -272,6 +270,66 @@ def run_method_and_compare_outputs( return self + def get_graph(self, stage: str | None = None) -> Graph: + if stage is None: + stage = self.cur + artifact = self.get_artifact(stage) + if self.cur == self.stage_name(tester.ToEdge) or self.cur == self.stage_name( + Partition + ): + graph = artifact.exported_program().graph + elif self.cur == self.stage_name(tester.Export) or self.cur == self.stage_name( + tester.Quantize + ): + graph = artifact.graph + else: + raise RuntimeError( + "Can only get a graph from Quantize, ToEdge, Export, and Partition stages." + ) + + return graph + + def dump_operator_distribution( + self, path_to_dump: Optional[str] = None + ) -> ArmQuantizer: + """Dump a dictionary with {operator: operator count} for the operators in the + graph of the current stage. + + Returns self for daisy-chaining. + """ + graph = self.get_graph(self.cur) + op_dist = _get_operator_distribution(graph) + to_print = self.cur + " operators: " + _format_dict(op_dist) + "\n" + _dump_str(to_print, path_to_dump) + return self + + def dump_dtype_distribution( + self, path_to_dump: Optional[str] = None + ) -> ArmQuantizer: + """Dump a dictionary with {dtype: dtype count} for the dtypes of the nodes in the + graph of the current stage. + + Returns self for daisy-chaining. + """ + graph = self.get_graph(self.cur) + op_dist = _get_dtype_distribution(graph) + to_print = self.cur + " placeholder data types: " + _format_dict(op_dist) + "\n" + _dump_str(to_print, path_to_dump) + return self + + @staticmethod + def _calculate_reference_output( + module: Union[torch.fx.GraphModule, torch.nn.Module], inputs + ) -> torch.Tensor: + """ + Note: I'd prefer to use the base class method here, but since it use the + exported program, I can't. The partitioner stage clears the state_dict + of the exported program, which causes an issue when evaluating the + module. + """ + + return module.forward(*inputs) + def transpose_data_format( self, data: Tuple[torch.Tensor], to: Literal["NHWC", "NCHW"] ): @@ -331,3 +389,37 @@ def _compare_outputs( ) logger.error(f"{atol=}, {rtol=}, {qtol=}") raise e + + +def _get_dtype_distribution(graph: Graph) -> dict: + """Counts the occurences of placeholder data types in a graph. 
+ The result is a dict {'data type':'number of placeholders'} + """ + return Counter( + [ + node.meta["val"].dtype + for node in list(graph.nodes) + if node.op == "placeholder" + ] + ) + + +def _get_operator_distribution(graph: Graph) -> dict[str, int]: + """Counts the occurences of operator names in a graph. + The result is a dict {'operator name':'number of nodes'} + """ + return Counter( + [str(node.target) for node in list(graph.nodes) if node.op == "call_function"] + ) + + +def _dump_str(to_print: str, path_to_dump: Optional[str] = None): + if path_to_dump: + with open(path_to_dump, "a") as fp: + fp.write(to_print) + else: + print(to_print) + + +def _format_dict(to_print: dict) -> str: + return pformat(to_print, compact=True, indent=1) From 3c25aec9cae58de560e0a807efbb1939ab89f2fe Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Mon, 29 Jul 2024 20:57:02 -0700 Subject: [PATCH 21/75] Add docstrings to all unittest.TestCase:s (#4391) Summary: This avoids logging a long default docstring which makes the output of test collection a lot easier to read and thus debug. I also updated the mv2net weight parameter as the current way of calling it is deprecated. This also removes a warning from test collection. Change-Id: I05f000e9ef42ab9d63f234539da3309f69ccbe16 Pull Request resolved: https://github.com/pytorch/executorch/pull/4391 Reviewed By: cccclai Differential Revision: D60404067 Pulled By: digantdesai fbshipit-source-id: 4ad44eb35887278cacd7e2c0ac93114b125e72af --- backends/arm/test/misc/test_debug_feats.py | 4 ++++ backends/arm/test/models/test_mobilenet_v2_arm.py | 3 ++- backends/arm/test/ops/test_add.py | 2 ++ backends/arm/test/ops/test_avg_pool.py | 2 ++ backends/arm/test/ops/test_batch_norm.py | 2 ++ backends/arm/test/ops/test_clone.py | 2 ++ backends/arm/test/ops/test_conv.py | 2 ++ backends/arm/test/ops/test_conv_combos.py | 2 ++ backends/arm/test/ops/test_depthwise_conv.py | 3 +++ backends/arm/test/ops/test_div.py | 2 ++ backends/arm/test/ops/test_full.py | 2 ++ backends/arm/test/ops/test_linear.py | 1 + backends/arm/test/ops/test_mean_dim.py | 2 ++ backends/arm/test/ops/test_softmax.py | 2 ++ backends/arm/test/ops/test_view.py | 2 ++ backends/arm/test/passes/test_tag_io_quant_pass.py | 1 + 16 files changed, 33 insertions(+), 1 deletion(-) diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index bf2a3aebd2..aa9703f9eb 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -41,6 +41,8 @@ def forward(self, x): class TestDumpPartitionedArtifact(unittest.TestCase): + """Tests dumping the partition artifact in ArmTester. 
Both to file and to stdout.""" + def _tosa_MI_pipeline(self, module: torch.nn.Module, dump_file=None): ( ArmTester( @@ -96,6 +98,8 @@ def test_BI_artifact(self): class TestNumericalDiffPrints(unittest.TestCase): + """Tests trigging the exception printout from the ArmTester's run and compare function.""" + def test_numerical_diff_prints(self): model = Linear(20, 30) tester = ( diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index eae5d4358a..248153a518 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -22,8 +22,9 @@ class TestMobileNetV2(unittest.TestCase): + """Tests MobileNetV2.""" - mv2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights) + mv2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT) mv2 = mv2.eval() normalize = transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 622d811822..3bd2b2605c 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -17,6 +17,8 @@ class TestSimpleAdd(unittest.TestCase): + """Tests a single add op, x+x and x+y.""" + class Add(torch.nn.Module): test_parameters = [ (torch.FloatTensor([1, 2, 3, 5, 7]),), diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py index fb2609939f..32a0e5555a 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -28,6 +28,8 @@ class TestAvgPool2d(unittest.TestCase): + """Tests AvgPool2d.""" + class AvgPool2d(torch.nn.Module): def __init__( self, diff --git a/backends/arm/test/ops/test_batch_norm.py b/backends/arm/test/ops/test_batch_norm.py index 0d6f9dea2c..4935e910d6 100644 --- a/backends/arm/test/ops/test_batch_norm.py +++ b/backends/arm/test/ops/test_batch_norm.py @@ -497,6 +497,8 @@ class TestBatchNorm2d(unittest.TestCase): + """Tests BatchNorm2d.""" + class BatchNorm2d(torch.nn.Module): def __init__( self, diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 2fc9b338cf..8386283f24 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -25,6 +25,8 @@ class TestSimpleClone(unittest.TestCase): + """Tests clone.""" + class Clone(torch.nn.Module): sizes = [10, 15, 50, 100] test_parameters = [(torch.ones(n),) for n in sizes] diff --git a/backends/arm/test/ops/test_conv.py b/backends/arm/test/ops/test_conv.py index 614d056072..9ebfe77da2 100644 --- a/backends/arm/test/ops/test_conv.py +++ b/backends/arm/test/ops/test_conv.py @@ -244,6 +244,8 @@ def forward(self, x): class TestConv2D(unittest.TestCase): + """Tests Conv2D, both single ops and multiple Convolutions in series.""" + def _test_conv2d_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 41f76ccbb7..88006df1a0 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -154,6 +154,8 @@ def forward(self, x): class TestConvCombos(unittest.TestCase): + """Tests conv combined with other ops.""" + def _test_conv_combo_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 
8389b423e5..9b3f79e6a1 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -132,6 +132,9 @@ class TestDepthwiseConv2D(unittest.TestCase): + """Tests Conv2D where groups == in_channels and out_channels = K * in_channels. This + is a special case enables depthwise convolution.""" + def _test_dw_conv2d_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index b13581dca1..60a0b8a4cc 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -78,6 +78,8 @@ class TestDiv(unittest.TestCase): + """Tests division""" + class Div(torch.nn.Module): def __init__( self, diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py index 4f01b1c8f9..1be7f59ab8 100644 --- a/backends/arm/test/ops/test_full.py +++ b/backends/arm/test/ops/test_full.py @@ -19,6 +19,8 @@ class TestFull(unittest.TestCase): + """Tests the full op which creates a tensor of a given shape filled with a given value.""" + class Full(torch.nn.Module): # A single full op def forward(self): diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 61117ad7fa..33f62955ec 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -91,6 +91,7 @@ class TestLinear(unittest.TestCase): + """tests the linear operation y = Ax + b""" _edge_compile_config: EdgeCompileConfig = EdgeCompileConfig( _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index 433661e99e..e0db958f74 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -40,6 +40,8 @@ class TestMeanDim(unittest.TestCase): + """Tests MeanDim, called AdaptiveAvgPool2d in Pytorch.""" + class MeanDim(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index b2ef115dad..b3b6230daa 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -28,6 +28,8 @@ class TestSoftmax(unittest.TestCase): + """Tests softmax.""" + class Softmax(torch.nn.Module): def __init__(self, dim: int = -1): super().__init__() diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index 7eda0d9cc2..1f51261bf7 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -25,6 +25,8 @@ class TestSimpleView(unittest.TestCase): + """Tests the view operation.""" + class View(torch.nn.Module): sizes = [10, 15, 50, 100] diff --git a/backends/arm/test/passes/test_tag_io_quant_pass.py b/backends/arm/test/passes/test_tag_io_quant_pass.py index 8757cf99d8..9f292bb7ca 100644 --- a/backends/arm/test/passes/test_tag_io_quant_pass.py +++ b/backends/arm/test/passes/test_tag_io_quant_pass.py @@ -22,6 +22,7 @@ def forward(self, x): class TestTagIOQuantPass(unittest.TestCase): + """Tests the TagIOQuantPass which tags q/dq nodes on model inputs and outputs to not include them in our partitions.""" def _tosa_BI_u55_pipeline(self, module: torch.nn.Module): ( From 28cfabb58e01c81dc1587180597f974a870a6309 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 29 Jul 2024 21:29:25 -0700 Subject: [PATCH 22/75] Fix use_sdpa_with_kv_cache option (#4456) Summary: Pull Request resolved: 
https://github.com/pytorch/executorch/pull/4456

As titled. In `export_llava.py`, `export_text_model()` needs to respect the `use_sdpa_with_kv_cache_op` option.

Reviewed By: cccclai

Differential Revision: D60431561

fbshipit-source-id: 63d49f39339435fb16f0c1c62288fd31c86b3be8
---
 examples/models/llava/export_llava.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py
index f57823a90a..7cf14e07d1 100644
--- a/examples/models/llava/export_llava.py
+++ b/examples/models/llava/export_llava.py
@@ -83,11 +83,14 @@ def forward(self, input_pos, embeddings):
     )
     quant_transform = get_quant_weight_transform(args, dtype_override, False)
     pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
-
+    source_transforms = []
+    if llava.use_sdpa_with_kv_cache_op:
+        source_transforms.append(replace_sdpa_with_custom_op)
+    source_transforms.append(quant_transform)
     manager = (
         text_model_em.set_output_dir("./")
         .to_dtype(dtype_override)
-        .source_transform([replace_sdpa_with_custom_op, quant_transform])
+        .source_transform(source_transforms)
         .capture_pre_autograd_graph()
         .pt2e_quantize(quantizers)
     )

From b7c8378d57b0e18d30ff30197125a89744d16d70 Mon Sep 17 00:00:00 2001
From: Alexey Kozhevnikov
Date: Tue, 30 Jul 2024 09:38:13 -0700
Subject: [PATCH 23/75] nop validation during build (#4449)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4449

Adds validation logic to the "build targets" implementation. The actual validation is a no-op in this diff; the implementation is in the next diffs in the stack.

I need to use late bindings in order to keep the validation logic in a separate crate, because `buck2_build_api` depends on validation, while validation depends on materialization from `buck2_build_api`.

Reviewed By: stepancheg

Differential Revision: D60238806

fbshipit-source-id: e1484731ce099189555bd306c1f93bab91da7de8
---
 shim/third-party/rust/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shim/third-party/rust/Cargo.toml b/shim/third-party/rust/Cargo.toml
index 0b8fa9f08e..718d9ea5a6 100644
--- a/shim/third-party/rust/Cargo.toml
+++ b/shim/third-party/rust/Cargo.toml
@@ -169,7 +169,7 @@ rustyline = "11.0"
 scopeguard = "1.0.0"
 sequence_trie = "0.3.6"
 serde = { version = "1.0.173", features = ["derive", "rc"] }
-serde_json = "1.0.48"
+serde_json = { version = "1.0.48", features = ["raw_value"] }
 sha1 = "0.10"
 sha2 = "0.10"
 shlex = "1.3"

From da7ca6ff22804ae519b8d3f9dd085c885b94e5bc Mon Sep 17 00:00:00 2001
From: Stephen Jia
Date: Tue, 30 Jul 2024 12:10:32 -0700
Subject: [PATCH 24/75] Fix build error (#4464)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4464

Fix an internal build error with BUCK.
Reviewed By: jorgep31415 Differential Revision: D60458756 fbshipit-source-id: a13b3f2de6754dda86ac73eb0f8e24de60a0a98a --- backends/vulkan/tools/gpuinfo/TARGETS | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/vulkan/tools/gpuinfo/TARGETS b/backends/vulkan/tools/gpuinfo/TARGETS index e9dd22e92d..10e3acb4b8 100644 --- a/backends/vulkan/tools/gpuinfo/TARGETS +++ b/backends/vulkan/tools/gpuinfo/TARGETS @@ -23,6 +23,7 @@ buck_filegroup( vulkan_spv_shader_lib( name = "gpuinfo_shader_lib", + is_fbcode = True, spv_filegroups = { ":gpuinfo_shaders": "glsl", }, From ea0c017224bb16c0455e5b8deb700b5638103652 Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 13:31:12 -0700 Subject: [PATCH 25/75] Add 3D Texture Bandwidth metric (#4336) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4336 This diff introduces a profiler that obtains the maximum and minimum bandwidth for reading unique addresses from 3D textures in each of its dimensions, using the following shader, where A is a 3D texture and B is a writeonly buffer. The calculation of the texel position will depend on the dimension that is being benchmarked x : pos = ivec3(offset, 0, 0) y : pos = ivec3(0, offset, 0) z : pos = ivec3(0, 0, offset) void main() { vec4 sum = vec4(0); const uint workgroup_width = local_group_size * niter * ${NUNROLL}; uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; int i = 0; for (; i < niter; ++i) { sum *= texelFetch(A, pos, 0); offset = (offset + local_group_size) & addr_mask; ... ... sum *= texelFetch(A, pos, 0); offset = (offset + local_group_size) & addr_mask; } vec4 zero = vec4(i>>31); B[gl_LocalInvocationID[0]] = sum + zero; } The address mask allows us to control how many unique addresses we are accessing. If the number of unique vectors we want to read is 3, the offset will jump between three unique addresses throughout the iterations, giving us the bandwidth for that specific size of data. If the size of the unique data read is larger than the work group size, then each run will have its own block of data to read, defined by the initial offset calculation, where the offset is obtained through the workgroup ID and the local invocation ID. Finally, we make sure to use the `sum` and `i ` variables so that the compiler's optimizer does not flatten the loops. For a Samsung S22, the bandwidth behaves like this for each of the dimensions. {F1767497386} Comparing the bandwidth for the X dimension to OpenCL, which was obtained through [ArchProbe](https://github.com/microsoft/ArchProbe), we can observe that, although the behavior is the same, Vulkan has an increased bandwidth for most access sizes. {F1767497972} Comparing to the bandwidth for buffers, we can observe that the bandwidth is similar to regular buffers, but still much smaller than UBOs at small access sizes. 
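To make the working-set control concrete, here is a small host-side sketch (an illustration only, not part of this patch) of the mask-as-modulo trick the shader relies on. Note that x % 2^n == x & (2^n - 1) only holds when the number of unique vectors is a power of two, which is why access sizes are swept in powers of two. The names mirror the shader constants; the loop bounds are arbitrary.

#include <cstdint>
#include <cstdio>
#include <set>

int main() {
  const uint32_t nvec = 16;             // unique texels to touch (power of two)
  const uint32_t addr_mask = nvec - 1;  // the mask acts as a cheap modulo
  const uint32_t local_group_size = 64;
  const uint32_t niter = 10;

  std::set<uint32_t> touched;
  for (uint32_t tid = 0; tid < local_group_size; ++tid) {
    uint32_t offset = tid & addr_mask;  // per-thread starting address
    for (uint32_t i = 0; i < niter; ++i) {
      touched.insert(offset);
      offset = (offset + local_group_size) & addr_mask;  // wraps within nvec
    }
  }
  // Every access lands inside the nvec unique addresses, so the benchmark can
  // dial the amount of unique data up or down and observe the bandwidth change.
  std::printf("unique addresses: %zu (at most %u)\n", touched.size(), nvec);
  return 0;
}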
{F1767497707} Reviewed By: jorgep31415 Differential Revision: D59980139 fbshipit-source-id: acc696ef21e6d07cf6f12d3790084faa64377093 --- .../tools/gpuinfo/glsl/tex_bandwidth.glsl | 59 +++++++++ .../tools/gpuinfo/glsl/tex_bandwidth.yaml | 15 +++ backends/vulkan/tools/gpuinfo/src/app.cpp | 112 ++++++++++++++++++ 3 files changed, 186 insertions(+) create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl new file mode 100644 index 0000000000..d848fc0475 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl @@ -0,0 +1,59 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +${layout_declare_sampler(0, "r", "A", DTYPE)} +${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int niter = 1; +layout(constant_id = 4) const int nvec = 1; +layout(constant_id = 5) const int local_group_size = 1; + +void main() { + // The address mask works as a modulo because x % 2^n == x & (2^n - 1). + // This will help us limit address accessing to a specific set of unique + // addresses depending on the access size we want to measure. + const int addr_mask = nvec - 1; + vec4 sum = vec4(0); + + // This is to distribute the accesses to unique addresses across the workgroups, once the + // size of the access excedes the workgroup width. + const uint workgroup_width = local_group_size * niter * ${NUNROLL}; + uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; + + int i = 0; + for (; i < niter; ++i){ + VEC4_T in_texel; + $for j in range(int(NUNROLL)): + $if DIM == 0: + in_texel = texelFetch(A, ivec3(offset, 0, 0), 0); + $elif DIM == 1: + in_texel = texelFetch(A, ivec3(0, offset, 0), 0); + $elif DIM == 2: + in_texel = texelFetch(A, ivec3(0, 0, offset), 0); + + sum *= in_texel; + + // On each unroll, a new unique address will be accessed through the offset, + // limited by the address mask to a specific set of unique addresses + offset = (offset + local_group_size) & addr_mask; + } + + // This is to ensure no compiler optimizations occur + vec4 zero = vec4(i>>31); + + B[gl_LocalInvocationID[0]] = sum + zero; +} diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml new file mode 100644 index 0000000000..84da6938fd --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +tex_bandwidth: + parameter_names_with_default_values: + DTYPE: float + NUNROLL: "16" + generate_variant_forall: + DIM: + - RANGE: [0, 2] + shader_variants: + - NAME: tex_bandwidth diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp index 8facdb5160..92eef84068 100644 --- a/backends/vulkan/tools/gpuinfo/src/app.cpp +++ b/backends/vulkan/tools/gpuinfo/src/app.cpp @@ -22,6 +22,9 @@ class App { uint32_t sm_count_; uint32_t nthread_logic_; uint32_t subgroup_size_; + uint32_t max_tex_width_; + uint32_t max_tex_height_; + uint32_t max_tex_depth_; public: App() { @@ -36,6 +39,9 @@ class App { nthread_logic_ = cl_device.getInfo(); buf_cache_size_ = cl_device.getInfo(); max_shared_mem_size_ = cl_device.getInfo(); + max_tex_width_ = cl_device.getInfo(); + max_tex_height_ = cl_device.getInfo(); + max_tex_depth_ = cl_device.getInfo(); VkPhysicalDeviceSubgroupProperties subgroup_props{}; VkPhysicalDeviceProperties2 props2{}; @@ -54,6 +60,9 @@ class App { std::cout << "Cache Size," << buf_cache_size_ << std::endl; std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl; std::cout << "SubGroup Size," << subgroup_size_ << std::endl; + std::cout << "MaxTexWidth," << max_tex_width_ << std::endl; + std::cout << "MaxTexHeight," << max_tex_height_ << std::endl; + std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl; } void reg_count() { @@ -308,6 +317,15 @@ class App { << std::endl; } + std::vector _whd_to_nchw(std::vector sizes) { + const int64_t W = sizes[0]; + const int64_t H = sizes[1]; + const int64_t D = sizes[2]; + + // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} + return {1, D * 4, H, W}; + } + public: void buf_bandwidth() { std::cout << "\n------ Memory Bandwidth ------" << std::endl; @@ -323,12 +341,105 @@ class App { const uint32_t RANGE = 128 * 1024 * 1024; _bandwidth("UBO", RANGE); } + void shared_mem_bandwidth() { std::cout << "\n------ Shared Bandwidth ------" << std::endl; const uint32_t RANGE = max_shared_mem_size_; _bandwidth("Shared", RANGE); } + void tex_bandwidth() { + for (int dim = 0; dim < 3; dim++) { + std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" + << std::endl; + const uint32_t MAX_SIZE = dim == 0 ? max_tex_width_ + : dim == 1 ? max_tex_height_ + : max_tex_depth_; + + // rgba, float + const uint32_t VEC_WIDTH = 4; + const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); + const uint32_t NVEC = MAX_SIZE; + + const uint32_t RANGE = NVEC * VEC_SIZE; + + // Cache lines flushed + const uint32_t NFLUSH = 4; + // Number of loop unrolls. Changing this value requires an equal change in + // tex_bandwidth.yaml + const uint32_t NUNROLL = 16; + // Number of iterations. Increasing this value reduces noise in exchange + // for higher latency. 
+ const uint32_t NITER = 10; + // Number of memory reads per thread + const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; + // Number of threads needed to read all texells + const uint32_t NTHREAD = NVEC; + // Occupy all threads + const uint32_t local_x = nthread_logic_; + // Ensure that global is a multiple of local, and distribute across all + // SMs + const uint32_t global_x = + (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH; + + auto shader_name = "tex_bandwidth_" + std::to_string(dim); + + std::vector sizes_whd = {MAX_SIZE, 1, 1}; + if (dim == 1) { + sizes_whd = {1, MAX_SIZE, 1}; + } else if (dim == 2) { + sizes_whd = {1, 1, MAX_SIZE}; + } + auto sizes_nchw = _whd_to_nchw(sizes_whd); + + vTensor in_tensor = + api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); + + auto bench = [&](uint32_t access_size, uint32_t dim) { + // Number of texels that fit in this iteration + const uint32_t ntexel_access = access_size / VEC_SIZE; + + StorageBuffer out_buf( + context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {global_x, 1, 1}, + {local_x, 1, 1}, + {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)}, + VK_NULL_HANDLE, + 0, + in_tensor.image(), + out_buf.buffer()); + }); + + const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; + double gbps = SIZE_TRANS * 1e-3 / time; + std::cout << "Texture bandwidth accessing \t" << access_size + << "\tB unique data is \t" << gbps << " \tgbps (\t" << time + << "\tus)" << std::endl; + return gbps; + }; + + double max_bandwidth = 0; + double min_bandwidth = DBL_MAX; + for (uint32_t access_size = VEC_SIZE; access_size < RANGE; + access_size *= 2) { + double gbps = bench(access_size, dim); + max_bandwidth = std::max(gbps, max_bandwidth); + min_bandwidth = std::min(gbps, min_bandwidth); + } + + std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth + << std::endl; + std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth + << std::endl; + } + } + // Warp size is a difficult metric to obtain because the hardware limitations // do not always coincide with the way the SM divides the workload. For // instance, the hardware can have a warp size of 64 threads, but an SM might @@ -492,6 +603,7 @@ int main(int argc, const char** argv) { app.ubo_bandwidth(); app.shared_mem_bandwidth(); app.warp_size(); + app.tex_bandwidth(); return 0; } From 298b625a9fdd7eb1695f7552faa8d9c8a88208ef Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 13:31:12 -0700 Subject: [PATCH 26/75] Add config file support for constants and test control (#4337) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4337 Now that the tool is getting larger, a configuration file for defining which tests to run and which to skip, as well as specifying some values like thresholds and ranges, comes in handy. This diff adds support for a JSON config file with specifications for each test. 
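For reference, a minimal sketch (assumed usage, exercising the same folly APIs as this diff) of how such a config is parsed and queried with folly::dynamic; the inline JSON is a trimmed, hypothetical config with a single test entry:

#include <folly/dynamic.h>
#include <folly/json.h>
#include <iostream>
#include <string>

int main() {
  const std::string json_str = R"({
    "reg_count": {"enabled": true, "threshold": 3, "compensate": 0.1}
  })";
  folly::dynamic config = folly::parseJson(json_str);

  if (config["reg_count"]["enabled"].getBool()) {
    // Integer and floating-point JSON numbers are distinct types in
    // folly::dynamic; asDouble() converts either one to double.
    const double threshold = config["reg_count"]["threshold"].asDouble();
    std::cout << "reg_count.threshold = " << threshold << std::endl;
  }
  return 0;
}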
Reviewed By: jorgep31415 Differential Revision: D60060188 fbshipit-source-id: d6ee9cbff52b3ab13e9a06a42dd54aec002fae11 --- backends/vulkan/tools/gpuinfo/config.json | 43 ++++++ backends/vulkan/tools/gpuinfo/src/app.cpp | 151 +++++++++++++++++----- 2 files changed, 161 insertions(+), 33 deletions(-) create mode 100644 backends/vulkan/tools/gpuinfo/config.json diff --git a/backends/vulkan/tools/gpuinfo/config.json b/backends/vulkan/tools/gpuinfo/config.json new file mode 100644 index 0000000000..1efb9690fe --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/config.json @@ -0,0 +1,43 @@ +{ + "reg_count": { + "enabled": true, + "threshold": 3, + "compensate": 0.1 + }, + "buf_cacheline_size": { + "enabled": true, + "threshold": 10, + "compensate": 0.1 + }, + "buffer_bandwidth": { + "enabled": true, + "range": 134217728, + "nflush": 4, + "nunroll": 16, + "niter": 10 + }, + "ubo_bandwidth": { + "enabled": true, + "range": 134217728, + "nflush": 4, + "nunroll": 16, + "niter": 10 + }, + "shared_mem_bandwidth": { + "enabled": true, + "nflush": 4, + "nunroll": 16, + "niter": 10 + }, + "warp_size": { + "enabled": true, + "threshold": 3, + "compensate": 0.1 + }, + "tex_bandwidth": { + "enabled": true, + "nflush": 4, + "nunroll": 16, + "niter": 10 + } +} diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp index 92eef84068..c33e8a011d 100644 --- a/backends/vulkan/tools/gpuinfo/src/app.cpp +++ b/backends/vulkan/tools/gpuinfo/src/app.cpp @@ -8,6 +8,8 @@ #include #include +#include +#include #include #include "stats.h" @@ -25,6 +27,46 @@ class App { uint32_t max_tex_width_; uint32_t max_tex_height_; uint32_t max_tex_depth_; + folly::dynamic config_; + + std::vector _whd_to_nchw(std::vector sizes) { + const int64_t W = sizes[0]; + const int64_t H = sizes[1]; + const int64_t D = sizes[2]; + + // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} + return {1, D * 4, H, W}; + } + + float _get_config(const std::string& test, const std::string& key) { + if (config_[test].empty()) { + throw std::runtime_error("Missing config for " + test); + } + + if (!config_[test][key].isNumber()) { + throw std::runtime_error( + "Config for " + test + "." + key + " is not a number"); + } + + float value; + if (config_[test][key].isDouble()) { + value = config_[test][key].getDouble(); + } else { + value = config_[test][key].getInt(); + } + + std::cout << "Read value for " << test << "." 
<< key << " = " << value + << std::endl; + return value; + } + + bool _enabled(const std::string& test) { + if (config_.empty() || config_[test].empty() || + !config_[test]["enabled"].isBool()) { + return true; + } + return config_[test]["enabled"].getBool(); + } public: App() { @@ -65,16 +107,32 @@ class App { std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl; } + void load_config(std::string file_path) { + std::ifstream file(file_path); + std::stringstream buffer; + buffer << file.rdbuf(); + const std::string json_str = buffer.str(); + if (json_str.empty()) { + throw std::runtime_error( + "Failed to read config file from " + file_path + "."); + } + config_ = folly::parseJson(json_str); + } + void reg_count() { + if (!_enabled("reg_count")) { + std::cout << "Skipped Register Count" << std::endl; + return; + } + std::cout << std::endl; std::cout << "------ Register Count ------" << std::endl; const uint32_t NREG_MIN = 1; const uint32_t NREG_MAX = 512; const uint32_t NREG_STEP = 1; - // TODO: Make these values configurable - const double COMPENSATE = 0.01; - const double THRESHOLD = 3; + const double COMPENSATE = _get_config("reg_count", "compensate"); + const double THRESHOLD = _get_config("reg_count", "threshold"); const uint32_t NGRP_MIN = 1; const uint32_t NGRP_MAX = 64; @@ -175,12 +233,16 @@ class App { } void buf_cacheline_size() { + if (!_enabled("buf_cacheline_size")) { + std::cout << "Skipped Buffer Cacheline Size" << std::endl; + return; + } + std::cout << std::endl; std::cout << "------ Buffer Cacheline Size ------" << std::endl; - // TODO: Make these values configurable - const double COMPENSATE = 0.01; - const double THRESHOLD = 10; + const double COMPENSATE = _get_config("buf_cacheline_size", "compensate"); + const double THRESHOLD = _get_config("buf_cacheline_size", "threshold"); const uint32_t PITCH = buf_cache_size_ / nthread_logic_; const uint32_t BUF_SIZE = buf_cache_size_; @@ -237,15 +299,23 @@ class App { private: void _bandwidth(std::string memtype, uint32_t range) { - // TODO: Make these values configurable + auto memtype_lower = memtype; + std::transform( + memtype_lower.begin(), + memtype_lower.end(), + memtype_lower.begin(), + [](unsigned char c) { return std::tolower(c); }); + + auto test_name = memtype_lower + "_bandwidth"; + // Cache lines flushed - const uint32_t NFLUSH = 4; + const uint32_t NFLUSH = _get_config(test_name, "nflush"); // Number of loop unrolls. Changing this value requires an equal change in // buf_bandwidth.yaml - const uint32_t NUNROLL = 16; + const uint32_t NUNROLL = _get_config(test_name, "nunroll"); // Number of iterations. Increasing this value reduces noise in exchange for // higher latency. 
- const uint32_t NITER = 10; + const uint32_t NITER = _get_config(test_name, "niter"); // Vector dimensions (vec4) const uint32_t VEC_WIDTH = 4; const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); @@ -273,12 +343,6 @@ class App { context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); vkapi::PipelineBarrier pipeline_barrier{}; - auto memtype_lower = memtype; - std::transform( - memtype_lower.begin(), - memtype_lower.end(), - memtype_lower.begin(), - [](unsigned char c) { return std::tolower(c); }); auto shader_name = "buf_bandwidth_" + memtype_lower; auto time = benchmark_on_gpu(shader_name, 10, [&]() { @@ -317,38 +381,49 @@ class App { << std::endl; } - std::vector _whd_to_nchw(std::vector sizes) { - const int64_t W = sizes[0]; - const int64_t H = sizes[1]; - const int64_t D = sizes[2]; - - // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} - return {1, D * 4, H, W}; - } - public: void buf_bandwidth() { + if (!_enabled("buffer_bandwidth")) { + std::cout << "Skipped Memory Bandwidth" << std::endl; + return; + } + std::cout << "\n------ Memory Bandwidth ------" << std::endl; // Maximum memory space read - 128MB // For regular devices, bandwidth plateaus at less memory than this, so more // is not needed. - const uint32_t RANGE = 128 * 1024 * 1024; + const uint32_t RANGE = _get_config("buffer_bandwidth", "range"); _bandwidth("Buffer", RANGE); } void ubo_bandwidth() { + if (!_enabled("ubo_bandwidth")) { + std::cout << "Skipped UBO Bandwidth" << std::endl; + return; + } + std::cout << "\n------ UBO Bandwidth ------" << std::endl; - const uint32_t RANGE = 128 * 1024 * 1024; + const uint32_t RANGE = _get_config("ubo_bandwidth", "range"); _bandwidth("UBO", RANGE); } void shared_mem_bandwidth() { + if (!_enabled("shared_mem_bandwidth")) { + std::cout << "Skipped Shared Memory Bandwidth" << std::endl; + return; + } + std::cout << "\n------ Shared Bandwidth ------" << std::endl; const uint32_t RANGE = max_shared_mem_size_; _bandwidth("Shared", RANGE); } void tex_bandwidth() { + if (!_enabled("tex_bandwidth")) { + std::cout << "Skipped Texture Bandwidth" << std::endl; + return; + } + for (int dim = 0; dim < 3; dim++) { std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" << std::endl; @@ -364,13 +439,13 @@ class App { const uint32_t RANGE = NVEC * VEC_SIZE; // Cache lines flushed - const uint32_t NFLUSH = 4; + const uint32_t NFLUSH = _get_config("tex_bandwidth", "nflush"); // Number of loop unrolls. Changing this value requires an equal change in // tex_bandwidth.yaml - const uint32_t NUNROLL = 16; + const uint32_t NUNROLL = _get_config("tex_bandwidth", "nunroll"); // Number of iterations. Increasing this value reduces noise in exchange // for higher latency. - const uint32_t NITER = 10; + const uint32_t NITER = _get_config("tex_bandwidth", "niter"); // Number of memory reads per thread const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; // Number of threads needed to read all texells @@ -458,6 +533,11 @@ class App { // In Case 2, like in Adreno, the driver might decide to pack multiple works // together and dispatch them at once. void warp_size(bool verbose = false) { + if (!_enabled("warp_size")) { + std::cout << "Skipped Warp Size" << std::endl; + return; + } + std::cout << "\n------ Warp Size ------" << std::endl; // Method A: Stress test with a kernel that uses complex ALU operations like @@ -467,8 +547,8 @@ class App { // This timing-based method helps us identify physical warp sizes. 
It also // helps with Case 2, when threads of multiple warps are managed by the same // scheduler at the same time. - const double COMPENSATE = 0.01; - const double THRESHOLD = 3; + const double COMPENSATE = _get_config("warp_size", "compensate"); + const double THRESHOLD = _get_config("warp_size", "threshold"); uint32_t NITER; @@ -596,7 +676,12 @@ class App { int main(int argc, const char** argv) { App app; - // TODO: Allow user to skip tests + std::string file_path = "config.json"; + if (argc > 1) { + file_path = argv[1]; + }; + app.load_config(file_path); + app.reg_count(); app.buf_cacheline_size(); app.buf_bandwidth(); From 5867129887571ea6f4c064f5447702e77135a923 Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 13:31:12 -0700 Subject: [PATCH 27/75] Add metric for 3D texture max concurrent cache read (#4421) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4421 This diff introduces a metric to calculate the maximum concurrent cache line accesses for each dimension of a 3D texture. The experiment works by allowing each thread to access a different texel on the texture and slowly increasing the number of threads, until the cache line is no longer able to handle all simultaneous accesses. By detecting a jump in latency, we can define the optimal maximum size that can be accessed concurrently on each dimension. NOTE: ArchProbe uses this information to[ obtain a supposed cache line size for textures](https://fburl.com/98xiou3g). However, it is unclear why they define the cache line size as being the ratio between the larger concurrency value over the lower, times the texel size. It is also unclear how to extend their calculations to three dimensions. TODO: Understand the relationship between concurrency and cache line size, and modify this metric to output the cache line size. For a Samsung S22, the latency graph looks like this: {F1780375117} Reviewed By: copyrightly Differential Revision: D60246121 fbshipit-source-id: c2bac010077bf14e95f70bb6038acbb47a534dde --- backends/vulkan/tools/gpuinfo/config.json | 5 + .../gpuinfo/glsl/tex_cacheline_concurr.glsl | 39 ++++++++ .../gpuinfo/glsl/tex_cacheline_concurr.yaml | 14 +++ backends/vulkan/tools/gpuinfo/src/app.cpp | 98 ++++++++++++++++++- 4 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml diff --git a/backends/vulkan/tools/gpuinfo/config.json b/backends/vulkan/tools/gpuinfo/config.json index 1efb9690fe..7307f29503 100644 --- a/backends/vulkan/tools/gpuinfo/config.json +++ b/backends/vulkan/tools/gpuinfo/config.json @@ -39,5 +39,10 @@ "nflush": 4, "nunroll": 16, "niter": 10 + }, + "tex_cacheline_concurr": { + "enabled": true, + "threshold": 3, + "compensate": 0.1 } } diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl new file mode 100644 index 0000000000..62659c7bb8 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl @@ -0,0 +1,39 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +${layout_declare_sampler(0, "r", "in_tex", DTYPE)} +${layout_declare_buffer(1, "w", "out_buf", DTYPE, "PRECISION", False)} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int niter = 1; + +void main() { + vec4 sum = vec4(0); + int i = 0; + for (; i < niter; ++i){ + $if DIM == 0: + sum += texelFetch(in_tex, ivec3(gl_GlobalInvocationID[0], 0, 0), 0); + $elif DIM == 1: + sum += texelFetch(in_tex, ivec3(0, gl_GlobalInvocationID[0], 0), 0); + $elif DIM == 2: + sum += texelFetch(in_tex, ivec3(0, 0, gl_GlobalInvocationID[0]), 0); + } + + // This is to ensure no compiler optimizations occur + vec4 zero = vec4(i>>31); + + out_buf[0] = sum + zero; +} diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml new file mode 100644 index 0000000000..6b557c9f66 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +tex_cacheline_concurr: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DIM: + - RANGE: [0, 2] + shader_variants: + - NAME: tex_cacheline_concurr diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp index c33e8a011d..2b1621db62 100644 --- a/backends/vulkan/tools/gpuinfo/src/app.cpp +++ b/backends/vulkan/tools/gpuinfo/src/app.cpp @@ -291,12 +291,107 @@ class App { if (stride >= MAX_STRIDE) { std::cout << "Unable to conclude a top level buffer cacheline size." << std::endl; - cacheline_size = MAX_STRIDE; + cacheline_size = MAX_STRIDE * sizeof(float); } std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; } + // Textures are drastically different from buffers in terms of data layout. + // While buffers are a contiguous range of memory, textures are opaque objects + // defined by the vendor and it is possible that nearby points of data are not + // neighboring in memory. Likewise, data points are accessed in + // multi-dimensional patches instead of simple lines. This makes the stride + // method for figuring out the cache line size not applicable. To go around + // this, this experiment runs an increasing amount of threads accessing + // different datapoints in the texture and measures latency. If the cache line + // is big enough to contain all requested data for the amount of threads, + // latency will be low. When there are more threads and hence more data than + // what a single cache line can handle, a second line must be fetched, + // increasing latency in a measurable way. 
+ void tex_cacheline_concurr() { + if (!_enabled("tex_cacheline_concurr")) { + std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl; + return; + } + + const uint32_t TEXEL_WIDTH = 4; + const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH; + + const double COMPENSATE = + _get_config("tex_cacheline_concurr", "compensate"); + const double THRESHOLD = _get_config("tex_cacheline_concurr", "threshold"); + + for (int dim = 0; dim < 3; ++dim) { + std::cout << std::endl; + std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim + << ") ------" << std::endl; + + uint32_t NITER; + + const uint32_t IMG_OTHER_EDGE = dim == 0 ? max_tex_width_ + : dim == 1 ? max_tex_height_ + : max_tex_depth_; + + const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_OTHER_EDGE); + + auto bench = [&](uint32_t nthread) { + std::vector sizes_whd = { + max_tex_width_, max_tex_height_, max_tex_depth_}; + + auto sizes_nchw = _whd_to_nchw(sizes_whd); + + vTensor in_tensor = + api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); + + StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); + + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim); + + auto time = benchmark_on_gpu(shader_name, 100, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {nthread, 1, 1}, + {nthread, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + in_tensor.image(), + out_buf.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1); }); + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t nthread = 1; + for (; nthread <= MAX_NTHREAD; ++nthread) { + double time = bench(nthread); + std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time + << std::endl; + + if (dj.push(time)) { + auto max_concurrency = nthread - 1; + std::cout << "TextureCachelineConcurrencyDim" << dim << " (B)," + << max_concurrency * TEXEL_SIZE << std::endl; + break; + } + } + if (nthread >= MAX_NTHREAD) { + std::cout + << "Unable to conclude an optimal texture cacheline concurrency for dim " + << dim << std::endl; + }; + } + + // TODO: Use concurrency information to obtain the cache line size for + // textures as done in https://fburl.com/98xiou3g + } + private: void _bandwidth(std::string memtype, uint32_t range) { auto memtype_lower = memtype; @@ -689,6 +784,7 @@ int main(int argc, const char** argv) { app.shared_mem_bandwidth(); app.warp_size(); app.tex_bandwidth(); + app.tex_cacheline_concurr(); return 0; } From e03181d1ff078d8b534e53aeab7ed4cce77ea7e3 Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 13:31:12 -0700 Subject: [PATCH 28/75] Refactor and class split (#4432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4432 Big classes are scary ☹️ This diff subdivides the tests into categories, places them as functions inside the gpuinfo namespace, instead of as part of the App class, and the App class is now only for persisting device information and configuration. 
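To show the effect of the split, here is a sketch of what the refactored entry point plausibly looks like after this diff. The exact list of calls is an assumption (only architecture.h and a truncated buffers.h are visible below), but each test is now a free function in the gpuinfo namespace that takes the shared App:

#include <string>

#include "app.h"
#include "architecture.h"

int main(int argc, const char** argv) {
  gpuinfo::App app;  // queries and prints device properties once

  std::string file_path = "config.json";
  if (argc > 1) {
    file_path = argv[1];
  }
  app.load_config(file_path);

  // Free functions consume the persisted device info and config.
  gpuinfo::reg_count(app);
  gpuinfo::warp_size(app);
  // Hypothetical: the buffer and texture tests follow the same pattern,
  // e.g. gpuinfo::buf_bandwidth(app); gpuinfo::tex_bandwidth(app);
  return 0;
}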
Reviewed By: jorgep31415 Differential Revision: D60290882 fbshipit-source-id: b57f6e824be33320c01eebc5d5b72cbd2ad4c0cf --- backends/vulkan/tools/gpuinfo/config.json | 2 +- backends/vulkan/tools/gpuinfo/include/app.h | 114 +++ .../tools/gpuinfo/include/architecture.h | 285 +++++++ .../vulkan/tools/gpuinfo/include/buffers.h | 203 +++++ .../vulkan/tools/gpuinfo/include/textures.h | 207 +++++ backends/vulkan/tools/gpuinfo/include/utils.h | 9 + backends/vulkan/tools/gpuinfo/src/app.cpp | 790 ------------------ backends/vulkan/tools/gpuinfo/src/main.cpp | 40 + 8 files changed, 859 insertions(+), 791 deletions(-) create mode 100644 backends/vulkan/tools/gpuinfo/include/app.h create mode 100644 backends/vulkan/tools/gpuinfo/include/architecture.h create mode 100644 backends/vulkan/tools/gpuinfo/include/buffers.h create mode 100644 backends/vulkan/tools/gpuinfo/include/textures.h delete mode 100644 backends/vulkan/tools/gpuinfo/src/app.cpp create mode 100644 backends/vulkan/tools/gpuinfo/src/main.cpp diff --git a/backends/vulkan/tools/gpuinfo/config.json b/backends/vulkan/tools/gpuinfo/config.json index 7307f29503..afb5cbc6c5 100644 --- a/backends/vulkan/tools/gpuinfo/config.json +++ b/backends/vulkan/tools/gpuinfo/config.json @@ -23,7 +23,7 @@ "nunroll": 16, "niter": 10 }, - "shared_mem_bandwidth": { + "shared_bandwidth": { "enabled": true, "nflush": 4, "nunroll": 16, diff --git a/backends/vulkan/tools/gpuinfo/include/app.h b/backends/vulkan/tools/gpuinfo/include/app.h new file mode 100644 index 0000000000..a46e9e6b9a --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/app.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include "utils.h" + +namespace gpuinfo { + +class App { + private: + folly::dynamic config_; + + public: + size_t buf_cache_size; + uint32_t max_shared_mem_size; + uint32_t sm_count; + uint32_t nthread_logic; + uint32_t subgroup_size; + uint32_t max_tex_width; + uint32_t max_tex_height; + uint32_t max_tex_depth; + + App() { + context()->initialize_querypool(); + + std::cout << context()->adapter_ptr()->stringize() << std::endl + << std::endl; + + auto cl_device = get_cl_device(); + + sm_count = cl_device.getInfo(); + nthread_logic = cl_device.getInfo(); + buf_cache_size = cl_device.getInfo(); + max_shared_mem_size = cl_device.getInfo(); + max_tex_width = cl_device.getInfo(); + max_tex_height = cl_device.getInfo(); + max_tex_depth = cl_device.getInfo(); + + VkPhysicalDeviceSubgroupProperties subgroup_props{}; + VkPhysicalDeviceProperties2 props2{}; + + props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + props2.pNext = &subgroup_props; + subgroup_props.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; + vkGetPhysicalDeviceProperties2( + context()->adapter_ptr()->physical_handle(), &props2); + subgroup_size = subgroup_props.subgroupSize; + + std::cout << std::endl; + std::cout << "SM count," << sm_count << std::endl; + std::cout << "Logic Thread Count," << nthread_logic << std::endl; + std::cout << "Cache Size," << buf_cache_size << std::endl; + std::cout << "Shared Memory Size," << max_shared_mem_size << std::endl; + std::cout << "SubGroup Size," << subgroup_size << std::endl; + std::cout << "MaxTexWidth," << max_tex_width << std::endl; + std::cout << "MaxTexHeight," << max_tex_height << std::endl; + std::cout << "MaxTexDepth," << max_tex_depth << std::endl; + } + + float get_config(const std::string& test, const std::string& key) const { + if (config_[test].empty()) { + throw std::runtime_error("Missing config for " + test); + } + + if (!config_[test][key].isNumber()) { + throw std::runtime_error( + "Config for " + test + "." + key + " is not a number"); + } + + float value; + if (config_[test][key].isDouble()) { + value = config_[test][key].getDouble(); + } else { + value = config_[test][key].getInt(); + } + + std::cout << "Read value for " << test << "." << key << " = " << value + << std::endl; + return value; + } + + bool enabled(const std::string& test) const { + if (config_.empty() || config_[test].empty() || + !config_[test]["enabled"].isBool()) { + return true; + } + return config_[test]["enabled"].getBool(); + } + + void load_config(std::string file_path) { + std::ifstream file(file_path); + std::stringstream buffer; + buffer << file.rdbuf(); + const std::string json_str = buffer.str(); + if (json_str.empty()) { + throw std::runtime_error( + "Failed to read config file from " + file_path + "."); + } + config_ = folly::parseJson(json_str); + } +}; +} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h new file mode 100644 index 0000000000..0d312ee87c --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -0,0 +1,285 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include "app.h" +#include "stats.h" +#include "utils.h" + +using namespace vkapi; + +namespace gpuinfo { + +void reg_count(const App& app) { + if (!app.enabled("reg_count")) { + std::cout << "Skipped Register Count" << std::endl; + return; + } + + std::cout << std::endl; + std::cout << "------ Register Count ------" << std::endl; + const uint32_t NREG_MIN = 1; + const uint32_t NREG_MAX = 512; + const uint32_t NREG_STEP = 1; + + const double COMPENSATE = app.get_config("reg_count", "compensate"); + const double THRESHOLD = app.get_config("reg_count", "threshold"); + + const uint32_t NGRP_MIN = 1; + const uint32_t NGRP_MAX = 64; + const uint32_t NGRP_STEP = 1; + + uint32_t NITER; + + auto bench = [&](uint32_t ngrp, uint32_t nreg) { + StorageBuffer buffer(context(), vkapi::kFloat, 1); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "reg_count_" + std::to_string(nreg); + + auto time = benchmark_on_gpu(shader_name, 30, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {1, ngrp, 1}, + {1, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + buffer.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); + + uint32_t nreg_max; + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t nreg = NREG_MIN; + for (; nreg <= NREG_MAX; nreg += NREG_STEP) { + double time = bench(1, nreg); + std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << "\tus" + << std::endl; + if (dj.push(time)) { + nreg -= NREG_STEP; + nreg_max = nreg; + break; + } + } + if (nreg >= NREG_MAX) { + std::cout << "Unable to conclude a maximal register count" << std::endl; + nreg_max = NREG_STEP; + } else { + std::cout << nreg_max << " registers are available at most" << std::endl; + } + + auto find_ngrp_by_nreg = [&](const uint32_t nreg) { + DtJumpFinder<3> dj(COMPENSATE, THRESHOLD); + for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { + auto time = bench(ngrp, nreg); + std::cout << "Testing occupation (nreg=\t" << nreg << "\t); ngrp=\t" + << ngrp << "\t, time=\t" << time << "\tus" << std::endl; + + if (dj.push(time)) { + ngrp -= NGRP_STEP; + std::cout << "Using " << nreg << " registers can have " << ngrp + << " concurrent single-thread workgroups" << std::endl; + return ngrp; + } + } + std::cout + << "Unable to conclude a maximum number of concurrent single-thread workgroups when " + << nreg << " registers are occupied" << std::endl; + return (uint32_t)1; + }; + + uint32_t ngrp_full, ngrp_half; + ngrp_full = find_ngrp_by_nreg(nreg_max); + ngrp_half = find_ngrp_by_nreg(nreg_max / 2); + + std::string reg_ty; + + if (ngrp_full * 1.5 < ngrp_half) { + std::cout << "All physical threads in an sm share " << nreg_max + << " registers" << std::endl; + reg_ty = "Pooled"; + + } else { + std::cout << "Each physical thread has " << nreg_max << " registers" + << std::endl; + reg_ty = "Dedicated"; + } + + std::cout << std::endl << std::endl; + std::cout << "MaxRegisters," << nreg_max << std::endl; + std::cout << "ConcurrentWorkgroupsFullReg," << ngrp_full << std::endl; + std::cout << "ConcurrentWorkgroupsHalfReg," << ngrp_half << std::endl; + std::cout << "RegisterType," << reg_ty << std::endl; +} + +// Warp size is a difficult metric to obtain because the hardware limitations +// do not always coincide with the way the SM divides the workload. 
+
+// Warp size is a difficult metric to obtain because the hardware limitations
+// do not always coincide with the way the SM divides the workload. For
+// instance, the hardware can have a warp size of 64 threads, but an SM might
+// be able to simulate concurrency of 128 threads with a single scheduler.
+
+// Because of this, it is important to measure the warp size in different
+// ways that can evidence both the physical limitations of the hardware and
+// the actual behavior of the driver.
+
+// Additionally, the SM can behave in two different ways when the assigned
+// workload is smaller than the warp size.
+
+// In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty
+// threads and maintain a uniform workload.
+
+// In Case 2, like in Adreno, the driver might decide to pack multiple works
+// together and dispatch them at once.
+void warp_size(const App& app, const bool verbose = false) {
+  if (!app.enabled("warp_size")) {
+    std::cout << "Skipped Warp Size" << std::endl;
+    return;
+  }
+
+  std::cout << "\n------ Warp Size ------" << std::endl;
+
+  // Method A: Stress test with a kernel that uses complex ALU operations like
+  // integer division to avoid latency hiding. Increase the number of threads
+  // until a jump in latency is detected.
+
+  // This timing-based method helps us identify physical warp sizes. It also
+  // helps with Case 2, when threads of multiple warps are managed by the same
+  // scheduler at the same time.
+  const double COMPENSATE = app.get_config("warp_size", "compensate");
+  const double THRESHOLD = app.get_config("warp_size", "threshold");
+
+  uint32_t NITER;
+
+  auto bench = [&](uint32_t nthread) {
+    StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic);
+    vkapi::PipelineBarrier pipeline_barrier{};
+
+    auto shader_name = "warp_size_physical";
+
+    auto time = benchmark_on_gpu(shader_name, 10, [&]() {
+      context()->submit_compute_job(
+          VK_KERNEL_FROM_STR(shader_name),
+          pipeline_barrier,
+          // Large number of work groups selected to potentially saturate all
+          // ALUs and thus have a better baseline for comparison.
+          {nthread, 1024, 1},
+          {nthread, 1, 1},
+          {SV(NITER)},
+          VK_NULL_HANDLE,
+          0,
+          out_buf.buffer());
+    });
+
+    return time;
+  };
+
+  ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+  uint32_t warp_size = app.subgroup_size;
+  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+
+  // We increase the number of threads until we hit a jump in the data.
+  uint32_t nthread = 1;
+  for (; nthread <= app.nthread_logic; ++nthread) {
+    double time = bench(nthread);
+    std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)"
+              << std::endl;
+    if (dj.push(time)) {
+      warp_size = nthread - 1;
+      break;
+    }
+  }
+  if (nthread >= app.nthread_logic) {
+    std::cout
+        << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size"
+        << std::endl;
+  }
+
+  // Method B: Let all the threads in a warp race and atomically fetch-add
+  // a counter, then store the counter values to the output buffer in the
+  // scheduling order of these threads. If all the order numbers follow an
+  // ascending order, then the threads are likely executing within a warp.
+  // Threads in different warps are not managed by the same scheduler, so they
+  // would race for the same ID out of order, unaware of each other.
+
+  // This method evidences the actual driver behavior when running
+  // concurrency, regardless of the physical limitations of the hardware.
+
+  // Likewise, this method helps us identify warp sizes when the SM
+  // sub-divides its ALUs into independent groups, like the three execution
+  // engines in a Mali G76 core. It helps warp-probing in Case 1 because it
+  // doesn't depend on kernel timing, so the extra wait time doesn't lead to
+  // inaccuracy.
+  auto bench_sm = [&](uint32_t nthread) {
+    StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic);
+    vkapi::PipelineBarrier pipeline_barrier{};
+
+    auto shader_name = "warp_size_scheduler";
+
+    benchmark_on_gpu(shader_name, 1, [&]() {
+      context()->submit_compute_job(
+          VK_KERNEL_FROM_STR(shader_name),
+          pipeline_barrier,
+          {nthread, 1, 1},
+          {nthread, 1, 1},
+          {},
+          VK_NULL_HANDLE,
+          0,
+          out_buf.buffer());
+    });
+
+    std::vector<int32_t> data(app.nthread_logic);
+    copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes());
+
+    if (verbose) {
+      std::stringstream ss;
+      for (auto j = 0; j < nthread; ++j) {
+        ss << data[j] << " ";
+      }
+      std::cout << ss.str() << std::endl;
+    }
+
+    // Check up to which point the data is in ascending order.
+    int32_t last = -1;
+    int32_t j = 0;
+    for (; j < nthread; ++j) {
+      if (last >= data[j]) {
+        break;
+      }
+      last = data[j];
+    }
+
+    return j;
+  };
+
+  // Test increasing sizes until the data is no longer in ascending order.
+  uint32_t warp_size_scheduler = warp_size;
+  int i = 1;
+  for (; i <= app.nthread_logic; ++i) {
+    uint32_t nascend = bench_sm(i);
+    if (nascend != i) {
+      warp_size_scheduler = nascend;
+      break;
+    }
+  }
+  if (i > app.nthread_logic) {
+    std::cout << "Unable to conclude an SM Warp Size." << std::endl;
+  }
+
+  std::cout << "PhysicalWarpSize," << warp_size << std::endl;
+  std::cout << "SMWarpSize," << warp_size_scheduler << std::endl;
+}
+}; // namespace gpuinfo
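The probes in architecture.h above, and in buffers.h/textures.h below, all share one detection pattern. A condensed sketch, with `P_MIN`/`P_MAX`/`P_STEP`, `bench`, and `estimate` standing in for the per-test names:

```cpp
// 1) Calibrate NITER so a single bench() call runs long enough (>= 1000 us
//    here) to be timed reliably.
ensure_min_niter(1000, NITER, [&]() { return bench(P_MIN); });

// 2) Sweep the parameter, feeding each latency into a jump finder; the last
//    value before the first detected latency jump becomes the estimate.
uint32_t estimate = P_MAX;
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
for (uint32_t p = P_MIN; p <= P_MAX; p += P_STEP) {
  double time = bench(p);
  if (dj.push(time)) { // returns true once a discontinuity in the timings appears
    estimate = p - P_STEP;
    break;
  }
}
```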
diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h
new file mode 100644
index 0000000000..8cb0da49ca
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/include/buffers.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include "app.h"
+#include "stats.h"
+#include "utils.h"
+
+using namespace vkapi;
+
+namespace gpuinfo {
+
+void buf_cacheline_size(const App& app) {
+  if (!app.enabled("buf_cacheline_size")) {
+    std::cout << "Skipped Buffer Cacheline Size" << std::endl;
+    return;
+  }
+
+  std::cout << std::endl;
+  std::cout << "------ Buffer Cacheline Size ------" << std::endl;
+
+  const double COMPENSATE = app.get_config("buf_cacheline_size", "compensate");
+  const double THRESHOLD = app.get_config("buf_cacheline_size", "threshold");
+
+  const uint32_t PITCH = app.buf_cache_size / app.nthread_logic;
+  const uint32_t BUF_SIZE = app.buf_cache_size;
+  const uint32_t MAX_STRIDE = PITCH;
+
+  uint32_t NITER;
+
+  auto bench = [&](int stride) {
+    StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE);
+    StorageBuffer out_buf(context(), vkapi::kFloat, 1);
+    vkapi::PipelineBarrier pipeline_barrier{};
+
+    auto shader_name = "buf_cacheline_size";
+
+    auto time = benchmark_on_gpu(shader_name, 100, [&]() {
+      context()->submit_compute_job(
+          VK_KERNEL_FROM_STR(shader_name),
+          pipeline_barrier,
+          {app.nthread_logic, 1, 1},
+          {app.nthread_logic, 1, 1},
+          {SV(NITER), SV(stride), SV(PITCH)},
+          VK_NULL_HANDLE,
+          0,
+          in_buf.buffer(),
+          out_buf.buffer());
+    });
+    return time;
+  };
+
+  ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+  uint32_t cacheline_size;
+
+  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+  uint32_t stride = 1;
+  for (; stride <= MAX_STRIDE; ++stride) {
+    double time = bench(stride);
+    std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time
+              << std::endl;
+
+    if (dj.push(time)) {
+      cacheline_size = stride * sizeof(float);
+      break;
+    }
+  }
+  if (stride >= MAX_STRIDE) {
+    std::cout << "Unable to conclude a top level buffer cacheline size."
+              << std::endl;
+    cacheline_size = MAX_STRIDE * sizeof(float);
+  }
+
+  std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
+}
+
+void _bandwidth(
+    const App& app,
+    const std::string memtype,
+    const uint32_t range) {
+  auto memtype_lower = memtype;
+  std::transform(
+      memtype_lower.begin(),
+      memtype_lower.end(),
+      memtype_lower.begin(),
+      [](unsigned char c) { return std::tolower(c); });
+
+  auto test_name = memtype_lower + "_bandwidth";
+
+  // Cache lines flushed
+  const uint32_t NFLUSH = app.get_config(test_name, "nflush");
+  // Number of loop unrolls. Changing this value requires an equal change in
+  // buf_bandwidth.yaml
+  const uint32_t NUNROLL = app.get_config(test_name, "nunroll");
+  // Number of iterations. Increasing this value reduces noise in exchange for
+  // higher latency.
+  const uint32_t NITER = app.get_config(test_name, "niter");
+  // Vector dimensions (vec4)
+  const uint32_t VEC_WIDTH = 4;
+  const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
+  // Number of vectors that fit in the selected memory space
+  const uint32_t NVEC = range / VEC_SIZE;
+  // Number of memory reads per thread
+  const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
+  // Number of threads needed to read all vectors. For shared memory, the
+  // thread count is not divided by the per-thread workload because of the
+  // limited memory size.
+  const uint32_t NTHREAD = memtype == "Shared" ?
NVEC : NVEC / NREAD_PER_THREAD; + // Occupy all threads + const uint32_t local_x = app.nthread_logic; + // Ensure that global is a multiple of local, and distribute across all SMs + const uint32_t global_x = + (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH; + + auto bench = [&](uint32_t access_size) { + // Number of vectors that fit in this iteration + const uint32_t nvec_access = access_size / VEC_SIZE; + + StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); + StorageBuffer out_buf( + context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "buf_bandwidth_" + memtype_lower; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {global_x, 1, 1}, + {local_x, 1, 1}, + {SV(NITER), SV(nvec_access), SV(local_x)}, + VK_NULL_HANDLE, + 0, + in_buf.buffer(), + out_buf.buffer()); + }); + + const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; + auto gbps = SIZE_TRANS * 1e-3 / time; + std::cout << memtype << " bandwidth accessing \t" << access_size + << "\tB unique data is \t" << gbps << " \tgbps (\t" << time + << "\tus)" << std::endl; + return gbps; + }; + + double max_bandwidth = 0; + double min_bandwidth = DBL_MAX; + for (uint32_t access_size = VEC_SIZE; access_size < range; access_size *= 2) { + double gbps = bench(access_size); + max_bandwidth = std::max(gbps, max_bandwidth); + min_bandwidth = std::min(gbps, min_bandwidth); + } + + std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth + << std::endl; + std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth + << std::endl; +} + +void buf_bandwidth(const App& app) { + if (!app.enabled("buffer_bandwidth")) { + std::cout << "Skipped Memory Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ Memory Bandwidth ------" << std::endl; + // Maximum memory space read - 128MB + // For regular devices, bandwidth plateaus at less memory than this, so more + // is not needed. + const uint32_t RANGE = app.get_config("buffer_bandwidth", "range"); + _bandwidth(app, "Buffer", RANGE); +} + +void ubo_bandwidth(const App& app) { + if (!app.enabled("ubo_bandwidth")) { + std::cout << "Skipped UBO Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ UBO Bandwidth ------" << std::endl; + const uint32_t RANGE = app.get_config("ubo_bandwidth", "range"); + _bandwidth(app, "UBO", RANGE); +} + +void shared_mem_bandwidth(const App& app) { + if (!app.enabled("shared_bandwidth")) { + std::cout << "Skipped Shared Memory Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ Shared Bandwidth ------" << std::endl; + const uint32_t RANGE = app.max_shared_mem_size; + _bandwidth(app, "Shared", RANGE); +} +} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h new file mode 100644 index 0000000000..bb8a3371a9 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -0,0 +1,207 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include "app.h" +#include "stats.h" +#include "utils.h" + +namespace gpuinfo { + +// Textures are drastically different from buffers in terms of data layout. 
+// While buffers are a contiguous range of memory, textures are opaque objects
+// defined by the vendor and it is possible that nearby points of data are not
+// neighboring in memory. Likewise, data points are accessed in
+// multi-dimensional patches instead of simple lines. This makes the stride
+// method for figuring out the cache line size not applicable. To go around
+// this, this experiment runs an increasing number of threads accessing
+// different datapoints in the texture and measures latency. If the cache line
+// is big enough to contain all requested data for the number of threads,
+// latency will be low. When there are more threads and hence more data than
+// what a single cache line can handle, a second line must be fetched,
+// increasing latency in a measurable way.
+void tex_cacheline_concurr(const App& app) {
+  if (!app.enabled("tex_cacheline_concurr")) {
+    std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl;
+    return;
+  }
+
+  const uint32_t TEXEL_WIDTH = 4;
+  const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH;
+
+  const double COMPENSATE =
+      app.get_config("tex_cacheline_concurr", "compensate");
+  const double THRESHOLD = app.get_config("tex_cacheline_concurr", "threshold");
+
+  for (int dim = 0; dim < 3; ++dim) {
+    std::cout << std::endl;
+    std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim
+              << ") ------" << std::endl;
+
+    uint32_t NITER;
+
+    const uint32_t IMG_OTHER_EDGE = dim == 0 ? app.max_tex_width
+        : dim == 1 ? app.max_tex_height
+                   : app.max_tex_depth;
+
+    const uint32_t MAX_NTHREAD = std::min(app.nthread_logic, IMG_OTHER_EDGE);
+
+    auto bench = [&](uint32_t nthread) {
+      std::vector<int64_t> sizes_whd = {
+          app.max_tex_width, app.max_tex_height, app.max_tex_depth};
+
+      auto sizes_nchw = whd_to_nchw(sizes_whd);
+
+      vTensor in_tensor =
+          api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
+
+      StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH);
+
+      vkapi::PipelineBarrier pipeline_barrier{};
+
+      auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim);
+
+      auto time = benchmark_on_gpu(shader_name, 100, [&]() {
+        context()->submit_compute_job(
+            VK_KERNEL_FROM_STR(shader_name),
+            pipeline_barrier,
+            {nthread, 1, 1},
+            {nthread, 1, 1},
+            {SV(NITER)},
+            VK_NULL_HANDLE,
+            0,
+            in_tensor.image(),
+            out_buf.buffer());
+      });
+      return time;
+    };
+
+    ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+    uint32_t nthread = 1;
+    for (; nthread <= MAX_NTHREAD; ++nthread) {
+      double time = bench(nthread);
+      std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time
+                << std::endl;
+
+      if (dj.push(time)) {
+        auto max_concurrency = nthread - 1;
+        std::cout << "TextureCachelineConcurrencyDim" << dim << " (B),"
+                  << max_concurrency * TEXEL_SIZE << std::endl;
+        break;
+      }
+    }
+    if (nthread >= MAX_NTHREAD) {
+      std::cout
+          << "Unable to conclude an optimal texture cacheline concurrency for dim "
+          << dim << std::endl;
+    };
+  }
+
+  // TODO: Use concurrency information to obtain the cache line size for
+  // textures as done in https://fburl.com/98xiou3g
+}
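To make the CSV output above concrete, a worked reading with illustrative numbers: if for `dim = 0` the latency jump is detected at `nthread = 17`, then:

```cpp
// max_concurrency = nthread - 1 = 16 threads served by one cache line, and
// with TEXEL_SIZE = 16 B the tool prints:
//   TextureCachelineConcurrencyDim0 (B),256
uint32_t max_concurrency = 17 - 1;        // 16
uint32_t reported = max_concurrency * 16; // 256 B
```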
+
+void tex_bandwidth(const App& app) {
+  if (!app.enabled("tex_bandwidth")) {
+    std::cout << "Skipped Texture Bandwidth" << std::endl;
+    return;
+  }
+
+  for (int dim = 0; dim < 3; dim++) {
+    std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------"
+              << std::endl;
+    const uint32_t MAX_SIZE = dim == 0 ? app.max_tex_width
+        : dim == 1 ? app.max_tex_height
+                   : app.max_tex_depth;
+
+    // rgba, float
+    const uint32_t VEC_WIDTH = 4;
+    const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
+    const uint32_t NVEC = MAX_SIZE;
+
+    const uint32_t RANGE = NVEC * VEC_SIZE;
+
+    // Cache lines flushed
+    const uint32_t NFLUSH = app.get_config("tex_bandwidth", "nflush");
+    // Number of loop unrolls. Changing this value requires an equal change in
+    // tex_bandwidth.yaml
+    const uint32_t NUNROLL = app.get_config("tex_bandwidth", "nunroll");
+    // Number of iterations. Increasing this value reduces noise in exchange
+    // for higher latency.
+    const uint32_t NITER = app.get_config("tex_bandwidth", "niter");
+    // Number of memory reads per thread
+    const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
+    // Number of threads needed to read all texels
+    const uint32_t NTHREAD = NVEC;
+    // Occupy all threads
+    const uint32_t local_x = app.nthread_logic;
+    // Ensure that global is a multiple of local, and distribute across all
+    // SMs
+    const uint32_t global_x =
+        (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH;
+
+    auto shader_name = "tex_bandwidth_" + std::to_string(dim);
+
+    std::vector<int64_t> sizes_whd = {MAX_SIZE, 1, 1};
+    if (dim == 1) {
+      sizes_whd = {1, MAX_SIZE, 1};
+    } else if (dim == 2) {
+      sizes_whd = {1, 1, MAX_SIZE};
+    }
+    auto sizes_nchw = whd_to_nchw(sizes_whd);
+
+    vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
+
+    auto bench = [&](uint32_t access_size, uint32_t dim) {
+      // Number of texels that fit in this iteration
+      const uint32_t ntexel_access = access_size / VEC_SIZE;
+
+      StorageBuffer out_buf(
+          context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
+      vkapi::PipelineBarrier pipeline_barrier{};
+
+      auto time = benchmark_on_gpu(shader_name, 10, [&]() {
+        context()->submit_compute_job(
+            VK_KERNEL_FROM_STR(shader_name),
+            pipeline_barrier,
+            {global_x, 1, 1},
+            {local_x, 1, 1},
+            {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)},
+            VK_NULL_HANDLE,
+            0,
+            in_tensor.image(),
+            out_buf.buffer());
+      });
+
+      const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
+      double gbps = SIZE_TRANS * 1e-3 / time;
+      std::cout << "Texture bandwidth accessing \t" << access_size
+                << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
+                << "\tus)" << std::endl;
+      return gbps;
+    };
+
+    double max_bandwidth = 0;
+    double min_bandwidth = DBL_MAX;
+    for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
+         access_size *= 2) {
+      double gbps = bench(access_size, dim);
+      max_bandwidth = std::max(gbps, max_bandwidth);
+      min_bandwidth = std::min(gbps, min_bandwidth);
+    }
+
+    std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth
+              << std::endl;
+    std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth
+              << std::endl;
+  }
+}
+} // namespace gpuinfo
diff --git a/backends/vulkan/tools/gpuinfo/include/utils.h b/backends/vulkan/tools/gpuinfo/include/utils.h
index 231fb32c5a..887cb443ef 100644
--- a/backends/vulkan/tools/gpuinfo/include/utils.h
+++ b/backends/vulkan/tools/gpuinfo/include/utils.h
@@ -54,6 +54,15 @@ void ensure_min_niter(
   }
 }
 
+std::vector<int64_t> whd_to_nchw(std::vector<int64_t> sizes) {
+  const int64_t W = sizes[0];
+  const int64_t H = sizes[1];
+  const int64_t D = sizes[2];
+
+  // Channels-packed: {W, H, D} = {W, H, (C / 4) * N}
+  return {1, D * 4, H, W};
+}
+
 cl_platform_id get_cl_platform_id() {
   cl_uint nplatform_id;
   clGetPlatformIDs(0, nullptr, &nplatform_id);
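A quick example of the `whd_to_nchw` helper added to utils.h above; the values follow directly from the channels-packed formula in its comment:

```cpp
// A {W=3, H=2, D=5} texture extent maps to NCHW {1, D * 4, H, W}, i.e.
// {1, 20, 2, 3}, since each texel packs 4 channels: D = (C / 4) * N.
std::vector<int64_t> nchw = whd_to_nchw({3, 2, 5}); // -> {1, 20, 2, 3}
```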
diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp
deleted file mode 100644
index 2b1621db62..0000000000
--- a/backends/vulkan/tools/gpuinfo/src/app.cpp
+++ /dev/null
@@ -1,790 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include
-#include
-#include
-#include
-#include
-
-#include "stats.h"
-#include "utils.h"
-
-using namespace vkapi;
-
-class App {
- private:
-  size_t buf_cache_size_;
-  uint32_t max_shared_mem_size_;
-  uint32_t sm_count_;
-  uint32_t nthread_logic_;
-  uint32_t subgroup_size_;
-  uint32_t max_tex_width_;
-  uint32_t max_tex_height_;
-  uint32_t max_tex_depth_;
-  folly::dynamic config_;
-
-  std::vector<int64_t> _whd_to_nchw(std::vector<int64_t> sizes) {
-    const int64_t W = sizes[0];
-    const int64_t H = sizes[1];
-    const int64_t D = sizes[2];
-
-    // Channels-packed: {W, H, D} = {W, H, (C / 4) * N}
-    return {1, D * 4, H, W};
-  }
-
-  float _get_config(const std::string& test, const std::string& key) {
-    if (config_[test].empty()) {
-      throw std::runtime_error("Missing config for " + test);
-    }
-
-    if (!config_[test][key].isNumber()) {
-      throw std::runtime_error(
-          "Config for " + test + "." + key + " is not a number");
-    }
-
-    float value;
-    if (config_[test][key].isDouble()) {
-      value = config_[test][key].getDouble();
-    } else {
-      value = config_[test][key].getInt();
-    }
-
-    std::cout << "Read value for " << test << "." << key << " = " << value
-              << std::endl;
-    return value;
-  }
-
-  bool _enabled(const std::string& test) {
-    if (config_.empty() || config_[test].empty() ||
-        !config_[test]["enabled"].isBool()) {
-      return true;
-    }
-    return config_[test]["enabled"].getBool();
-  }
-
- public:
-  App() {
-    context()->initialize_querypool();
-
-    std::cout << context()->adapter_ptr()->stringize() << std::endl
-              << std::endl;
-
-    auto cl_device = get_cl_device();
-
-    sm_count_ = cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
-    nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
-    buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
-    max_shared_mem_size_ = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
-    max_tex_width_ = cl_device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
-    max_tex_height_ = cl_device.getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
-    max_tex_depth_ = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_DEPTH>();
-
-    VkPhysicalDeviceSubgroupProperties subgroup_props{};
-    VkPhysicalDeviceProperties2 props2{};
-
-    props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
-    props2.pNext = &subgroup_props;
-    subgroup_props.sType =
-        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
-    vkGetPhysicalDeviceProperties2(
-        context()->adapter_ptr()->physical_handle(), &props2);
-    subgroup_size_ = subgroup_props.subgroupSize;
-
-    std::cout << std::endl;
-    std::cout << "SM count," << sm_count_ << std::endl;
-    std::cout << "Logic Thread Count," << nthread_logic_ << std::endl;
-    std::cout << "Cache Size," << buf_cache_size_ << std::endl;
-    std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl;
-    std::cout << "SubGroup Size," << subgroup_size_ << std::endl;
-    std::cout << "MaxTexWidth," << max_tex_width_ << std::endl;
-    std::cout << "MaxTexHeight," << max_tex_height_ << std::endl;
-    std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl;
-  }
-
-  void load_config(std::string file_path) {
-    std::ifstream file(file_path);
-    std::stringstream buffer;
-    buffer << file.rdbuf();
-    const std::string json_str = buffer.str();
-    if (json_str.empty()) {
-      throw std::runtime_error(
-          "Failed to read config file from " + file_path + ".");
-    }
-    config_ = folly::parseJson(json_str);
-  }
-
-  void reg_count() {
-    if (!_enabled("reg_count")) {
- std::cout << "Skipped Register Count" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Register Count ------" << std::endl; - const uint32_t NREG_MIN = 1; - const uint32_t NREG_MAX = 512; - const uint32_t NREG_STEP = 1; - - const double COMPENSATE = _get_config("reg_count", "compensate"); - const double THRESHOLD = _get_config("reg_count", "threshold"); - - const uint32_t NGRP_MIN = 1; - const uint32_t NGRP_MAX = 64; - const uint32_t NGRP_STEP = 1; - - uint32_t NITER; - - auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StorageBuffer buffer(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "reg_count_" + std::to_string(nreg); - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {1, ngrp, 1}, - {1, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - buffer.buffer()); - }); - return time; - }; - - std::cout << "Calculating NITER..." << std::endl; - ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); - std::cout << "NITER," << NITER << std::endl; - - uint32_t nreg_max; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nreg = NREG_MIN; - for (; nreg <= NREG_MAX; nreg += NREG_STEP) { - double time = bench(1, nreg); - std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time - << std::endl; - if (dj.push(time)) { - nreg -= NREG_STEP; - nreg_max = nreg; - break; - } - } - if (nreg >= NREG_MAX) { - std::cout << "Unable to conclude a maximal register count" << std::endl; - nreg_max = NREG_STEP; - } else { - std::cout << nreg_max << " registers are available at most" << std::endl; - } - - auto find_ngrp_by_nreg = [&](const uint32_t nreg) { - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { - auto time = bench(ngrp, nreg); - std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp - << ", time=" << time << " us" << std::endl; - - if (dj.push(time)) { - ngrp -= NGRP_STEP; - std::cout << "Using " << nreg << " registers can have " << ngrp - << " concurrent single-thread workgroups" << std::endl; - return ngrp; - } - } - std::cout - << "Unable to conclude a maximum number of concurrent single-thread workgroups when " - << nreg << " registers are occupied" << std::endl; - return (uint32_t)1; - }; - - uint32_t ngrp_full, ngrp_half; - ngrp_full = find_ngrp_by_nreg(nreg_max); - ngrp_half = find_ngrp_by_nreg(nreg_max / 2); - - std::string reg_ty; - - if (ngrp_full * 1.5 < ngrp_half) { - std::cout << "All physical threads in an sm share " << nreg_max - << " registers" << std::endl; - reg_ty = "Pooled"; - - } else { - std::cout << "Each physical thread has " << nreg_max << " registers" - << std::endl; - reg_ty = "Dedicated"; - } - - std::cout << std::endl << std::endl; - std::cout << "NITER," << NITER << std::endl; - std::cout << "Max registers," << nreg_max << std::endl; - std::cout << "Concurrent full single thread workgroups," << ngrp_full - << std::endl; - std::cout << "Concurrent half single thread workgroups," << ngrp_half - << std::endl; - std::cout << "Register type," << reg_ty << std::endl; - } - - void buf_cacheline_size() { - if (!_enabled("buf_cacheline_size")) { - std::cout << "Skipped Buffer Cacheline Size" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Buffer Cacheline Size ------" << std::endl; - - const double COMPENSATE = _get_config("buf_cacheline_size", "compensate"); - const double 
THRESHOLD = _get_config("buf_cacheline_size", "threshold"); - - const uint32_t PITCH = buf_cache_size_ / nthread_logic_; - const uint32_t BUF_SIZE = buf_cache_size_; - const uint32_t MAX_STRIDE = PITCH; - - uint32_t NITER; - - auto bench = [&](int stride) { - StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StorageBuffer out_buf(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_cacheline_size"; - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread_logic_, 1, 1}, - {nthread_logic_, 1, 1}, - {SV(NITER), SV(stride), SV(PITCH)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t cacheline_size; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t stride = 1; - for (; stride <= MAX_STRIDE; ++stride) { - double time = bench(stride); - std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - cacheline_size = stride * sizeof(float); - break; - } - } - if (stride >= MAX_STRIDE) { - std::cout << "Unable to conclude a top level buffer cacheline size." - << std::endl; - cacheline_size = MAX_STRIDE * sizeof(float); - } - - std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; - } - - // Textures are drastically different from buffers in terms of data layout. - // While buffers are a contiguous range of memory, textures are opaque objects - // defined by the vendor and it is possible that nearby points of data are not - // neighboring in memory. Likewise, data points are accessed in - // multi-dimensional patches instead of simple lines. This makes the stride - // method for figuring out the cache line size not applicable. To go around - // this, this experiment runs an increasing amount of threads accessing - // different datapoints in the texture and measures latency. If the cache line - // is big enough to contain all requested data for the amount of threads, - // latency will be low. When there are more threads and hence more data than - // what a single cache line can handle, a second line must be fetched, - // increasing latency in a measurable way. - void tex_cacheline_concurr() { - if (!_enabled("tex_cacheline_concurr")) { - std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl; - return; - } - - const uint32_t TEXEL_WIDTH = 4; - const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH; - - const double COMPENSATE = - _get_config("tex_cacheline_concurr", "compensate"); - const double THRESHOLD = _get_config("tex_cacheline_concurr", "threshold"); - - for (int dim = 0; dim < 3; ++dim) { - std::cout << std::endl; - std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim - << ") ------" << std::endl; - - uint32_t NITER; - - const uint32_t IMG_OTHER_EDGE = dim == 0 ? max_tex_width_ - : dim == 1 ? 
max_tex_height_
-                                       : max_tex_depth_;
-
-      const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_OTHER_EDGE);
-
-      auto bench = [&](uint32_t nthread) {
-        std::vector<int64_t> sizes_whd = {
-            max_tex_width_, max_tex_height_, max_tex_depth_};
-
-        auto sizes_nchw = _whd_to_nchw(sizes_whd);
-
-        vTensor in_tensor =
-            api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
-
-        StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH);
-
-        vkapi::PipelineBarrier pipeline_barrier{};
-
-        auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim);
-
-        auto time = benchmark_on_gpu(shader_name, 100, [&]() {
-          context()->submit_compute_job(
-              VK_KERNEL_FROM_STR(shader_name),
-              pipeline_barrier,
-              {nthread, 1, 1},
-              {nthread, 1, 1},
-              {SV(NITER)},
-              VK_NULL_HANDLE,
-              0,
-              in_tensor.image(),
-              out_buf.buffer());
-        });
-        return time;
-      };
-
-      ensure_min_niter(1000, NITER, [&]() { return bench(1); });
-
-      DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
-      uint32_t nthread = 1;
-      for (; nthread <= MAX_NTHREAD; ++nthread) {
-        double time = bench(nthread);
-        std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time
-                  << std::endl;
-
-        if (dj.push(time)) {
-          auto max_concurrency = nthread - 1;
-          std::cout << "TextureCachelineConcurrencyDim" << dim << " (B),"
-                    << max_concurrency * TEXEL_SIZE << std::endl;
-          break;
-        }
-      }
-      if (nthread >= MAX_NTHREAD) {
-        std::cout
-            << "Unable to conclude an optimal texture cacheline concurrency for dim "
-            << dim << std::endl;
-      };
-    }
-
-    // TODO: Use concurrency information to obtain the cache line size for
-    // textures as done in https://fburl.com/98xiou3g
-  }
-
- private:
-  void _bandwidth(std::string memtype, uint32_t range) {
-    auto memtype_lower = memtype;
-    std::transform(
-        memtype_lower.begin(),
-        memtype_lower.end(),
-        memtype_lower.begin(),
-        [](unsigned char c) { return std::tolower(c); });
-
-    auto test_name = memtype_lower + "_bandwidth";
-
-    // Cache lines flushed
-    const uint32_t NFLUSH = _get_config(test_name, "nflush");
-    // Number of loop unrolls. Changing this value requires an equal change in
-    // buf_bandwidth.yaml
-    const uint32_t NUNROLL = _get_config(test_name, "nunroll");
-    // Number of iterations. Increasing this value reduces noise in exchange for
-    // higher latency.
-    const uint32_t NITER = _get_config(test_name, "niter");
-    // Vector dimensions (vec4)
-    const uint32_t VEC_WIDTH = 4;
-    const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
-    // Number of vectors that fit in the selected memory space
-    const uint32_t NVEC = range / VEC_SIZE;
-    // Number of memory reads per thread
-    const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
-    // Number of threads needed to read all vectors. The thread count is not
-    // divided by the per-thread workload for shared memory because of the
-    // limited memory size.
-    const uint32_t NTHREAD =
-        memtype == "Shared" ?
NVEC : NVEC / NREAD_PER_THREAD; - // Occupy all threads - const uint32_t local_x = nthread_logic_; - // Ensure that global is a multiple of local, and distribute across all SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH; - - auto bench = [&](uint32_t access_size) { - // Number of vectors that fit in this iteration - const uint32_t nvec_access = access_size / VEC_SIZE; - - StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StorageBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_bandwidth_" + memtype_lower; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), SV(nvec_access), SV(local_x)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - auto gbps = SIZE_TRANS * 1e-3 / time; - std::cout << memtype << " bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < range; - access_size *= 2) { - double gbps = bench(access_size); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth - << std::endl; - std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth - << std::endl; - } - - public: - void buf_bandwidth() { - if (!_enabled("buffer_bandwidth")) { - std::cout << "Skipped Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Memory Bandwidth ------" << std::endl; - // Maximum memory space read - 128MB - // For regular devices, bandwidth plateaus at less memory than this, so more - // is not needed. - const uint32_t RANGE = _get_config("buffer_bandwidth", "range"); - _bandwidth("Buffer", RANGE); - } - - void ubo_bandwidth() { - if (!_enabled("ubo_bandwidth")) { - std::cout << "Skipped UBO Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ UBO Bandwidth ------" << std::endl; - const uint32_t RANGE = _get_config("ubo_bandwidth", "range"); - _bandwidth("UBO", RANGE); - } - - void shared_mem_bandwidth() { - if (!_enabled("shared_mem_bandwidth")) { - std::cout << "Skipped Shared Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Shared Bandwidth ------" << std::endl; - const uint32_t RANGE = max_shared_mem_size_; - _bandwidth("Shared", RANGE); - } - - void tex_bandwidth() { - if (!_enabled("tex_bandwidth")) { - std::cout << "Skipped Texture Bandwidth" << std::endl; - return; - } - - for (int dim = 0; dim < 3; dim++) { - std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" - << std::endl; - const uint32_t MAX_SIZE = dim == 0 ? max_tex_width_ - : dim == 1 ? max_tex_height_ - : max_tex_depth_; - - // rgba, float - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - const uint32_t NVEC = MAX_SIZE; - - const uint32_t RANGE = NVEC * VEC_SIZE; - - // Cache lines flushed - const uint32_t NFLUSH = _get_config("tex_bandwidth", "nflush"); - // Number of loop unrolls. 
Changing this value requires an equal change in
-      // tex_bandwidth.yaml
-      const uint32_t NUNROLL = _get_config("tex_bandwidth", "nunroll");
-      // Number of iterations. Increasing this value reduces noise in exchange
-      // for higher latency.
-      const uint32_t NITER = _get_config("tex_bandwidth", "niter");
-      // Number of memory reads per thread
-      const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
-      // Number of threads needed to read all texels
-      const uint32_t NTHREAD = NVEC;
-      // Occupy all threads
-      const uint32_t local_x = nthread_logic_;
-      // Ensure that global is a multiple of local, and distribute across all
-      // SMs
-      const uint32_t global_x =
-          (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH;
-
-      auto shader_name = "tex_bandwidth_" + std::to_string(dim);
-
-      std::vector<int64_t> sizes_whd = {MAX_SIZE, 1, 1};
-      if (dim == 1) {
-        sizes_whd = {1, MAX_SIZE, 1};
-      } else if (dim == 2) {
-        sizes_whd = {1, 1, MAX_SIZE};
-      }
-      auto sizes_nchw = _whd_to_nchw(sizes_whd);
-
-      vTensor in_tensor =
-          api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
-
-      auto bench = [&](uint32_t access_size, uint32_t dim) {
-        // Number of texels that fit in this iteration
-        const uint32_t ntexel_access = access_size / VEC_SIZE;
-
-        StorageBuffer out_buf(
-            context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_);
-        vkapi::PipelineBarrier pipeline_barrier{};
-
-        auto time = benchmark_on_gpu(shader_name, 10, [&]() {
-          context()->submit_compute_job(
-              VK_KERNEL_FROM_STR(shader_name),
-              pipeline_barrier,
-              {global_x, 1, 1},
-              {local_x, 1, 1},
-              {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)},
-              VK_NULL_HANDLE,
-              0,
-              in_tensor.image(),
-              out_buf.buffer());
-        });
-
-        const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
-        double gbps = SIZE_TRANS * 1e-3 / time;
-        std::cout << "Texture bandwidth accessing \t" << access_size
-                  << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
-                  << "\tus)" << std::endl;
-        return gbps;
-      };
-
-      double max_bandwidth = 0;
-      double min_bandwidth = DBL_MAX;
-      for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
-           access_size *= 2) {
-        double gbps = bench(access_size, dim);
-        max_bandwidth = std::max(gbps, max_bandwidth);
-        min_bandwidth = std::min(gbps, min_bandwidth);
-      }
-
-      std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth
-                << std::endl;
-      std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth
-                << std::endl;
-    }
-  }
-
-  // Warp size is a difficult metric to obtain because the hardware limitations
-  // do not always coincide with the way the SM divides the workload. For
-  // instance, the hardware can have a warp size of 64 threads, but an SM might
-  // be able to simulate concurrency of 128 threads with a single scheduler.
-
-  // Because of this, it is important to measure the warp size in different
-  // ways that can evidence both the physical limitations of the hardware and
-  // the actual behavior of the driver.
-
-  // Additionally, the SM can behave in two different ways when the assigned
-  // workload is smaller than the warp size.
-
-  // In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty
-  // threads and maintain a uniform workload.
-
-  // In Case 2, like in Adreno, the driver might decide to pack multiple works
-  // together and dispatch them at once.
- void warp_size(bool verbose = false) { - if (!_enabled("warp_size")) { - std::cout << "Skipped Warp Size" << std::endl; - return; - } - - std::cout << "\n------ Warp Size ------" << std::endl; - - // Method A: Stress test with a kernel that uses complex ALU operations like - // integer division to avoid latency hiding. Increase the number of threads - // until a jump in latency is detected. - - // This timing-based method helps us identify physical warp sizes. It also - // helps with Case 2, when threads of multiple warps are managed by the same - // scheduler at the same time. - const double COMPENSATE = _get_config("warp_size", "compensate"); - const double THRESHOLD = _get_config("warp_size", "threshold"); - - uint32_t NITER; - - auto bench = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_physical"; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - // Large number of work groups selected to potentially saturate all - // ALUs and thus have a better baseline for comparison. - {nthread, 1024, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t warp_size = subgroup_size_; - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - - // We increase the number of threads until we hit a jump in the data. - uint32_t nthread = 1; - for (; nthread <= nthread_logic_; ++nthread) { - double time = bench(nthread); - std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)" - << std::endl; - if (dj.push(time)) { - warp_size = nthread - 1; - break; - } - } - if (nthread >= nthread_logic_) { - std::cout - << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size" - << std::endl; - } - - // Method B: Let all the threads in a warp race and atomically fetch-add - // a counter, then store the counter values to the output buffer in the - // scheduling order of these threads. If all the order numbers follow an - // ascending order, then the threads are likely executing within a warp. - // Threads in different warps are not managed by the same scheduler, so they - // would race for a same ID out of order, unaware of each other. - - // This method evidences the actual driver behavior when running - // concurrency, regardless of the physical limitations of the hardware. - - // Likewise, this method helps us identify warp sizes when the SM - // sub-divides its ALUs into independent groups, like the three execution - // engines in a Mali G76 core. It helps warp-probing in Case 1 because it - // doesn't depend on kernel timing, so the extra wait time doesn't lead to - // inaccuracy. 
-    auto bench_sm = [&](uint32_t nthread) {
-      StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_);
-      vkapi::PipelineBarrier pipeline_barrier{};
-
-      auto shader_name = "warp_size_scheduler";
-
-      benchmark_on_gpu(shader_name, 1, [&]() {
-        context()->submit_compute_job(
-            VK_KERNEL_FROM_STR(shader_name),
-            pipeline_barrier,
-            {nthread, 1, 1},
-            {nthread, 1, 1},
-            {},
-            VK_NULL_HANDLE,
-            0,
-            out_buf.buffer());
-      });
-
-      std::vector<int32_t> data(nthread_logic_);
-      copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes());
-
-      if (verbose) {
-        std::stringstream ss;
-        for (auto j = 0; j < nthread; ++j) {
-          ss << data[j] << " ";
-        }
-        std::cout << ss.str() << std::endl;
-      }
-
-      // Check up to which point the data is in ascending order.
-      int32_t last = -1;
-      int32_t j = 0;
-      for (; j < nthread; ++j) {
-        if (last >= data[j]) {
-          break;
-        }
-        last = data[j];
-      }
-
-      return j;
-    };
-
-    // Test increasing sizes until the data is no longer in ascending order.
-    uint32_t warp_size_scheduler = warp_size;
-    int i = 1;
-    for (; i <= nthread_logic_; ++i) {
-      uint32_t nascend = bench_sm(i);
-      if (nascend != i) {
-        warp_size_scheduler = nascend;
-        break;
-      }
-    }
-    if (i > nthread_logic_) {
-      std::cout << "Unable to conclude an SM Warp Size." << std::endl;
-    }
-
-    std::cout << "PhysicalWarpSize," << warp_size << std::endl;
-    std::cout << "SMWarpSize," << warp_size_scheduler << std::endl;
-  }
-};
-
-int main(int argc, const char** argv) {
-  App app;
-
-  std::string file_path = "config.json";
-  if (argc > 1) {
-    file_path = argv[1];
-  };
-  app.load_config(file_path);
-
-  app.reg_count();
-  app.buf_cacheline_size();
-  app.buf_bandwidth();
-  app.ubo_bandwidth();
-  app.shared_mem_bandwidth();
-  app.warp_size();
-  app.tex_bandwidth();
-  app.tex_cacheline_concurr();
-
-  return 0;
-}
diff --git a/backends/vulkan/tools/gpuinfo/src/main.cpp b/backends/vulkan/tools/gpuinfo/src/main.cpp
new file mode 100644
index 0000000000..f0e29aaf1a
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/src/main.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +#include "app.h" +#include "architecture.h" +#include "buffers.h" +#include "textures.h" + +using namespace vkapi; + +int main(int argc, const char** argv) { + gpuinfo::App app; + + std::string file_path = "config.json"; + if (argc > 1) { + file_path = argv[1]; + }; + app.load_config(file_path); + + // Architecture + gpuinfo::reg_count(app); + gpuinfo::warp_size(app); + + // Buffers + gpuinfo::buf_cacheline_size(app); + gpuinfo::buf_bandwidth(app); + gpuinfo::ubo_bandwidth(app); + gpuinfo::shared_mem_bandwidth(app); + + // Textures + gpuinfo::tex_bandwidth(app); + gpuinfo::tex_cacheline_concurr(app); + + return 0; +} From 1727aa18c205da4d829c425bdece0b61b6179a6a Mon Sep 17 00:00:00 2001 From: Chen Lai Date: Tue, 30 Jul 2024 14:35:50 -0700 Subject: [PATCH 29/75] fix eval llama (#4469) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4469 Previously the refactor moves files from `examples/...` to `extensions/...`, however llama eval was not covered by CI, fix it here before: ``` (executorch) chenlai@chenlai-mbp executorch % python -m examples.models.llama2.eval_llama -c /Users/chenlai/Documents/stories110M/stories110M/stories110M.pt -p /Users/chenlai/Documents/stories110M/stories110M/params.json -t /Users/chenlai/Documents/stories110M/stories110M/tokenizer.model -d fp32 --max_seq_len 127 --limit 5 /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable. warn("The installed version of bitsandbytes was compiled without GPU support. " 'NoneType' object has no attribute 'cadam32bit_grad_fp32' /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:106: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. impl_abstract("quantized_decomposed::embedding_byte.out") /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:153: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. impl_abstract("quantized_decomposed::embedding_byte.dtype_out") /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:228: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. impl_abstract("quantized_decomposed::embedding_4bit.out") /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:281: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. 
impl_abstract("quantized_decomposed::embedding_4bit.dtype_out") Traceback (most recent call last): File "/opt/homebrew/anaconda3/envs/executorch/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/opt/homebrew/anaconda3/envs/executorch/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/Users/chenlai/executorch/examples/models/llama2/eval_llama.py", line 13, in from .eval_llama_lib import build_args_parser, eval_llama File "/Users/chenlai/executorch/examples/models/llama2/eval_llama_lib.py", line 19, in from executorch.extension.llm.export import LLMEdgeManager ImportError: cannot import name 'LLMEdgeManager' from 'executorch.extension.llm.export' (/opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/extension/llm/export/__init__.py) (executorch) chenlai@chenlai-mbp executorch % (executorch) chenlai@chenlai-mbp executorch % ``` after ``` (executorch) chenlai@chenlai-mbp executorch % python -m examples.models.llama2.eval_llama -c /Users/chenlai/Documents/stories110M/stories110M/stories110M.pt -p /Users/chenlai/Documents/stories110M/stories110M/params.json -t /Users/chenlai/Documents/stories110M/stories110M/tokenizer.model -d fp32 --max_seq_len 127 --limit 5 /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable. warn("The installed version of bitsandbytes was compiled without GPU support. " 'NoneType' object has no attribute 'cadam32bit_grad_fp32' /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:106: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. impl_abstract("quantized_decomposed::embedding_byte.out") /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:153: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. impl_abstract("quantized_decomposed::embedding_byte.dtype_out") /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:228: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. impl_abstract("quantized_decomposed::embedding_4bit.out") /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:281: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. 
impl_abstract("quantized_decomposed::embedding_4bit.dtype_out") 2024-07-30:12:36:04,260 INFO [tokenizer.py:33] #words: 32000 - BOS ID: 1 - EOS ID: 2 2024-07-30:12:36:04,260 INFO [export_llama_lib.py:419] Applying quantizers: [] 2024-07-30:12:36:04,260 INFO [export_llama_lib.py:594] Loading model with checkpoint=/Users/chenlai/Documents/stories110M/stories110M/stories110M.pt, params=/Users/chenlai/Documents/stories110M/stories110M/params.json, use_kv_cache=False, weight_type=WeightType.LLAMA /Users/chenlai/executorch/examples/models/llama2/model.py:99: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True) 2024-07-30:12:36:04,315 INFO [export_llama_lib.py:616] Loaded model with dtype=torch.float32 2024-07-30:12:36:04,395 INFO [huggingface.py:162] Using device 'cpu' 2024-07-30:12:36:27,262 WARNING [task.py:763] [Task: wikitext] metric word_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity 2024-07-30:12:36:27,262 WARNING [task.py:775] [Task: wikitext] metric word_perplexity is defined, but higher_is_better is not. using default higher_is_better=False 2024-07-30:12:36:27,262 WARNING [task.py:763] [Task: wikitext] metric byte_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity 2024-07-30:12:36:27,262 WARNING [task.py:775] [Task: wikitext] metric byte_perplexity is defined, but higher_is_better is not. using default higher_is_better=False 2024-07-30:12:36:27,262 WARNING [task.py:763] [Task: wikitext] metric bits_per_byte is defined, but aggregation is not. using default aggregation=bits_per_byte 2024-07-30:12:36:27,262 WARNING [task.py:775] [Task: wikitext] metric bits_per_byte is defined, but higher_is_better is not. using default higher_is_better=False Repo card metadata block was not found. Setting CardData to empty. 2024-07-30:12:36:29,494 WARNING [repocard.py:107] Repo card metadata block was not found. Setting CardData to empty. 2024-07-30:12:36:30,401 INFO [task.py:395] Building contexts for wikitext on rank 0... 
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 718.57it/s] 2024-07-30:12:36:30,410 INFO [evaluator.py:362] Running loglikelihood_rolling requests 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:14<00:00, 2.91s/it] wikitext: {'word_perplexity,none': 10885.215324239069, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 6.144013518032613, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 2.6191813902741017, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'} ``` ghstack-source-id: 235865354 exported-using-ghexport Reviewed By: larryliu0820 Differential Revision: D60466386 fbshipit-source-id: 0032af8b3269f107469fe142382dfacb06751808 --- extension/llm/export/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/extension/llm/export/__init__.py b/extension/llm/export/__init__.py index e69de29bb2..7b17c223c3 100644 --- a/extension/llm/export/__init__.py +++ b/extension/llm/export/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from .builder import LLMEdgeManager + +__all__ = [ + "LLMEdgeManager", +] From 1ec3444707649dc78e2ce49805e39283a4a4bb1a Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 30 Jul 2024 14:49:57 -0700 Subject: [PATCH 30/75] Migrate sampler to extension/llm (#4460) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4460 Move sampler code to extension/llm so that it can be reused by llava runner. 
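For callers, only the include path changes. A rough sketch of downstream usage (the constructor and `sample` arguments below are illustrative placeholders, not an exact quote of the header):

```cpp
#include <executorch/extension/llm/sampler/sampler.h>
// was: #include <executorch/examples/models/llama2/sampler/sampler.h>

// Hypothetical call site: pick the next token from a float logits array of
// length vocab_size. All numeric values here are placeholders.
torch::executor::Sampler sampler(
    /*vocab_size=*/32000,
    /*temperature=*/0.8f,
    /*topp=*/0.9f,
    /*rng_seed=*/42);
int32_t next_token = sampler.sample(logits); // logits: float* of size vocab_size
```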
Reviewed By: helunwencser

Differential Revision: D60458803

fbshipit-source-id: ef8c4d7d3fed4f0777e5ba9cd403da8320efef5a
---
 examples/models/llama2/runner/runner.h | 2 +-
 examples/models/llama2/runner/targets.bzl | 2 +-
 {examples/models/llama2 => extension/llm}/sampler/TARGETS | 0
 {examples/models/llama2 => extension/llm}/sampler/sampler.cpp | 2 +-
 {examples/models/llama2 => extension/llm}/sampler/sampler.h | 0
 {examples/models/llama2 => extension/llm}/sampler/targets.bzl | 0
 {examples/models/llama2 => extension/llm}/sampler/test/TARGETS | 0
 .../models/llama2 => extension/llm}/sampler/test/targets.bzl | 0
 .../llama2 => extension/llm}/sampler/test/test_sampler.cpp | 0
 9 files changed, 3 insertions(+), 3 deletions(-)
 rename {examples/models/llama2 => extension/llm}/sampler/TARGETS (100%)
 rename {examples/models/llama2 => extension/llm}/sampler/sampler.cpp (99%)
 rename {examples/models/llama2 => extension/llm}/sampler/sampler.h (100%)
 rename {examples/models/llama2 => extension/llm}/sampler/targets.bzl (100%)
 rename {examples/models/llama2 => extension/llm}/sampler/test/TARGETS (100%)
 rename {examples/models/llama2 => extension/llm}/sampler/test/targets.bzl (100%)
 rename {examples/models/llama2 => extension/llm}/sampler/test/test_sampler.cpp (100%)

diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h
index c269a8c585..7b9d2763fc 100644
--- a/examples/models/llama2/runner/runner.h
+++ b/examples/models/llama2/runner/runner.h
@@ -17,7 +17,7 @@
 #include
 #include
-#include <executorch/examples/models/llama2/sampler/sampler.h>
+#include <executorch/extension/llm/sampler/sampler.h>
 #include
 #include
 #include
diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama2/runner/targets.bzl
index 26659303e0..d525628174 100644
--- a/examples/models/llama2/runner/targets.bzl
+++ b/examples/models/llama2/runner/targets.bzl
@@ -33,7 +33,7 @@ def define_common_targets():
         ],
         exported_deps = [
             "//executorch/backends/xnnpack:xnnpack_backend",
-            "//executorch/examples/models/llama2/sampler:sampler" + aten_suffix,
+            "//executorch/extension/llm/sampler:sampler" + aten_suffix,
             "//executorch/extension/evalue_util:print_evalue" + aten_suffix,
             "//executorch/extension/runner_util:managed_tensor" + aten_suffix,
             "//executorch/extension/module:module" + aten_suffix,
diff --git a/examples/models/llama2/sampler/TARGETS b/extension/llm/sampler/TARGETS
similarity index 100%
rename from examples/models/llama2/sampler/TARGETS
rename to extension/llm/sampler/TARGETS
diff --git a/examples/models/llama2/sampler/sampler.cpp b/extension/llm/sampler/sampler.cpp
similarity index 99%
rename from examples/models/llama2/sampler/sampler.cpp
rename to extension/llm/sampler/sampler.cpp
index 1ae4d2f9d7..be3307b715 100644
--- a/examples/models/llama2/sampler/sampler.cpp
+++ b/extension/llm/sampler/sampler.cpp
@@ -32,7 +32,7 @@
 * SOFTWARE.
 */

-#include <executorch/examples/models/llama2/sampler/sampler.h>
+#include <executorch/extension/llm/sampler/sampler.h>

 namespace torch {
 namespace executor {
diff --git a/examples/models/llama2/sampler/sampler.h b/extension/llm/sampler/sampler.h
similarity index 100%
rename from examples/models/llama2/sampler/sampler.h
rename to extension/llm/sampler/sampler.h
diff --git a/examples/models/llama2/sampler/targets.bzl b/extension/llm/sampler/targets.bzl
similarity index 100%
rename from examples/models/llama2/sampler/targets.bzl
rename to extension/llm/sampler/targets.bzl
diff --git a/examples/models/llama2/sampler/test/TARGETS b/extension/llm/sampler/test/TARGETS
similarity index 100%
rename from examples/models/llama2/sampler/test/TARGETS
rename to extension/llm/sampler/test/TARGETS
diff --git a/examples/models/llama2/sampler/test/targets.bzl b/extension/llm/sampler/test/targets.bzl
similarity index 100%
rename from examples/models/llama2/sampler/test/targets.bzl
rename to extension/llm/sampler/test/targets.bzl
diff --git a/examples/models/llama2/sampler/test/test_sampler.cpp b/extension/llm/sampler/test/test_sampler.cpp
similarity index 100%
rename from examples/models/llama2/sampler/test/test_sampler.cpp
rename to extension/llm/sampler/test/test_sampler.cpp

From 69f3f1c7dc3f60df78c5a86c035bb0b26fa654f1 Mon Sep 17 00:00:00 2001
From: Gyanendra Sinha
Date: Tue, 30 Jul 2024 15:26:18 -0700
Subject: [PATCH 31/75] Fix prewarming (#4454)

Summary:
Prewarms the model if `config.should_prewarm_model` is `true`. This improves the latency of the first inference call, as the necessary objects are created when the model is prewarmed.

Testing:
Existing tests

Pull Request resolved: https://github.com/pytorch/executorch/pull/4454

Reviewed By: kirklandsign

Differential Revision: D60469148

Pulled By: cccclai

fbshipit-source-id: d88883e721269d03298265dd420f08cbbe4787ce
---
 .../coreml/runtime/delegate/ETCoreMLModelManager.mm |  2 +-
 .../coreml/runtime/delegate/backend_delegate.mm     | 13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm
index e7846256e6..927df0483f 100644
--- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm
+++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm
@@ -655,7 +655,7 @@ - (void)prewarmRecentlyUsedAssetsWithMaxCount:(NSUInteger)maxCount {
         NSError *prewarmError = nil;
         if (![asset prewarmAndReturnError:&prewarmError]) {
-            ETCoreMLLogError(localError,
+            ETCoreMLLogError(prewarmError,
                              "%@: Failed to prewarm asset with identifier = %@",
                              NSStringFromClass(strongSelf.assetManager.class),
                              asset.identifier);
diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.mm b/backends/apple/coreml/runtime/delegate/backend_delegate.mm
index f6eb7a83fd..efa3dd2472 100644
--- a/backends/apple/coreml/runtime/delegate/backend_delegate.mm
+++ b/backends/apple/coreml/runtime/delegate/backend_delegate.mm
@@ -157,7 +157,7 @@ - (BOOL)_loadAndReturnError:(NSError * _Nullable __autoreleasing *)error {
     if (self.config.should_prewarm_asset) {
         [modelManager prewarmRecentlyUsedAssetsWithMaxCount:1];
     }
-
+
     return YES;
 }
@@ -188,9 +188,14 @@ - (ModelHandle*)loadModelFromAOTData:(NSData*)data
         return nil;
     }

-    return [self.impl loadModelFromAOTData:data
-                             configuration:configuration
-                                     error:error];
+    auto handle = [self.impl loadModelFromAOTData:data
+                                    configuration:configuration
+                                            error:error];
+    if ((handle != NULL) && self.config.should_prewarm_model) {
+        [self.impl prewarmModelWithHandle:handle error:nil];
+    }
+
+    return handle;
 }
 - (BOOL)executeModelWithHandle:(ModelHandle*)handle

From 9aeceeee3df8096ba7c89f422f584e26ace60733 Mon Sep 17 00:00:00 2001
From: Yujie Hui
Date: Tue, 30 Jul 2024 15:29:34 -0700
Subject: [PATCH 32/75] Implement grid_priors op (#4440)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4440

Modify the spec of the customized op `grid_priors` to take a tensor as input. Compared to the previous definition, the `height` and `width` arguments are now determined by the input tensor as `height, width = self.shape[-2:]`. The reason we change the spec is that the input should be a tensor if we want to support dynamic shapes.

Implement the customized op `grid_priors`. This op generates x, y points mapped from feature maps at different levels back to the original image.

Op spec:
```
(Tensor self, int stride, float offset) -> Tensor
```

Example:
```
input_tensor = torch.rand(size = [1, 5, 2, 3])
stride = 8
offset = 0.5
output.shape = [3x2, 2]
output = tensor([[ 4.,  4.],
        [12.,  4.],
        [20.,  4.],
        [ 4., 12.],
        [12., 12.],
        [20., 12.]])
```

Add a smoke test for now due to an issue lowering the customized op to the Vulkan backend. Will add a unit test and an nn.Module test once we are able to lower the customized op from PyTorch to the Vulkan backend.

bypass-github-export-checks
bypass-github-pytorch-ci-checks
bypass-github-executorch-ci-checks

Reviewed By: copyrightly

Differential Revision: D60203196

fbshipit-source-id: 93e5180e80e07cc0b9acb50890a1187ce0f82951
---
 backends/vulkan/passes/custom_ops_defs.py   |  8 +-
 backends/vulkan/passes/test_custom_ops.py   |  9 ++-
 .../runtime/graph/ops/glsl/grid_priors.glsl | 38 +++++++++
 .../runtime/graph/ops/glsl/grid_priors.yaml | 12 +++
 .../runtime/graph/ops/impl/GridPriors.cpp   | 79 +++++++++++++++++++
 .../vulkan/test/vulkan_compute_api_test.cpp | 72 +++++++++++++++++
 6 files changed, 210 insertions(+), 8 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp

diff --git a/backends/vulkan/passes/custom_ops_defs.py b/backends/vulkan/passes/custom_ops_defs.py
index 67e7db828a..c76f7ebf75 100644
--- a/backends/vulkan/passes/custom_ops_defs.py
+++ b/backends/vulkan/passes/custom_ops_defs.py
@@ -49,11 +49,11 @@ def conv_with_clamp_impl(


 def grid_priors_impl(
-    height,
-    width,
+    x,
     stride,
     offset,
 ):
+    height, width = x.shape[-2:]
     shift_x = (torch.arange(0, width) + offset) * stride
     shift_y = (torch.arange(0, height) + offset) * stride
     shift_xx, shift_yy = torch.meshgrid(shift_y, shift_x)
@@ -64,6 +64,6 @@ def grid_priors_impl(


 name = "grid_priors"
-lib.define(f"{name}(int height, int width, int stride, float offset) -> Tensor")
-lib.impl(name, grid_priors_impl)
+lib.define(f"{name}(Tensor self, int stride, float offset) -> Tensor")
+lib.impl(name, grid_priors_impl, "CompositeExplicitAutograd")
 grid_priors_op = getattr(getattr(torch.ops, namespace), name)
diff --git a/backends/vulkan/passes/test_custom_ops.py b/backends/vulkan/passes/test_custom_ops.py
index a1a3a40f67..c68dd6d679 100644
--- a/backends/vulkan/passes/test_custom_ops.py
+++ b/backends/vulkan/passes/test_custom_ops.py
@@ -97,14 +97,15 @@ class GridPriors(torch.nn.Module):
         def __init__(self):
             super().__init__()

-        def forward(self, height, width, stride, offset):
-            return torch.ops.et_vk.grid_priors(height, width, stride, offset)
+        def forward(self, x, stride, offset):
+            return torch.ops.et_vk.grid_priors(x, stride, offset)

     model = GridPriors()
-
sample_input = (2, 3, 4, 0.5) + sample_input = (torch.rand(2, 5, 2, 3), 4, 0.5) custom_out = model(*sample_input) - def calculate_expected_output(height, width, stride, offset): + def calculate_expected_output(x, stride, offset): + height, width = x.shape[-2:] shift_x = (torch.arange(0, width) + offset) * stride shift_y = (torch.arange(0, height) + offset) * stride shift_xx, shift_yy = torch.meshgrid(shift_y, shift_x) diff --git a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl new file mode 100644 index 0000000000..93a2c53e01 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl @@ -0,0 +1,38 @@ +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_ubo(1, "ivec4", "in_sizes")} +${layout_declare_ubo(2, "ivec4", "out_sizes")} +${layout_declare_ubo(3, "int", "stride", "float", "offset")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); + + if (pos_out_of_bounds(pos, out_sizes, packed_dim)) { + return; + } + int width = in_sizes.x; + VEC4_T outtex; + if (pos.x == 0) { + float value = (pos.y % width + offset) * stride; + outtex = VEC4_T(value, 0, 0, 0); + } else if (pos.x == 1) { + float value = (pos.y / width + offset) * stride; + outtex = VEC4_T(value, 0, 0, 0); + } + + imageStore(t_out, pos, outtex); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml new file mode 100644 index 0000000000..654edca610 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml @@ -0,0 +1,12 @@ +grid_priors: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: C_packed + STORAGE: texture3d + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: grid_priors diff --git a/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp b/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp new file mode 100644 index 0000000000..b0658e37c2 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include + +namespace vkcompute { + +struct GridPriorsParam final { + int32_t stride; + float offset; +}; + +void resize_grid_priors_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr in = graph->get_tensor(extra_args[0]); + std::vector in_sizes = in->sizes(); + int64_t height = in_sizes.at(in_sizes.size() - 2); + int64_t width = in_sizes.at(in_sizes.size() - 1); + std::vector sizes = {height * width, 2}; + out->virtual_resize(sizes); +} + +void add_grid_priors_node( + ComputeGraph& graph, + const ValueRef& in, + const ValueRef& stride_ref, + const ValueRef& offset_ref, + const ValueRef& out) { + vTensorPtr t_out = graph.get_tensor(out); + vTensorPtr t_in = graph.get_tensor(in); + int32_t stride = graph.extract_scalar(stride_ref); + float offset = graph.extract_scalar(offset_ref); + + std::string kernel_name = "grid_priors"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + GridPriorsParam param = {stride, offset}; + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + graph.create_global_wg_size(out), + graph.create_local_wg_size(out), + // Inputs and Outputs + { + {out, vkapi::MemoryAccessType::WRITE}, + }, + // Shader params buffers + { + t_in->sizes_ubo(), + t_out->sizes_ubo(), + graph.create_params_buffer(param), + }, + // Specialization Constants + {}, + resize_grid_priors_node, + {in})); +} + +void grid_priors(ComputeGraph& graph, const std::vector& args) { + return add_grid_priors_node(graph, args[0], args[1], args[2], args[3]); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(grid_priors.default, grid_priors); +} +} // namespace vkcompute diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 9260475ab6..9d87de8bff 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -2203,3 +2203,75 @@ TEST(VulkanComputeGraphOpsTest, conv2d_prepack_test) { 0, 3, 9, 0, 0, 6, 12, 0, 0, 5, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); } + +void test_grid_priors( + std::vector input_sizes, + std::vector output_sizes, + int stride, + double offset, + const std::vector& data_out_expected) { + GraphConfig config; + ComputeGraph graph(config); + + // Build graph + IOValueRef in = graph.add_input_tensor( + input_sizes, + vkapi::kFloat, + utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); + IOValueRef out; + out.value = graph.add_tensor( + output_sizes, + vkapi::kFloat, + utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); + + VK_GET_OP_FN("grid_priors.default") + (graph, + {in.value, + graph.add_scalar(stride), + graph.add_scalar(offset), + out.value}); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_prepack(); + graph.prepack(); + graph.encode_execute(); + + vTensorPtr t_in = graph.get_tensor(in.value); + vTensorPtr t_out = graph.get_tensor(out.value); + // Resize input + graph.propagate_resize(); + + // run graph + graph.execute(); + + std::vector output_data(t_out->gpu_numel()); + graph.copy_from_staging(out.staging, output_data.data(), output_data.size()); + + // check results + int h_out = utils::val_at(-2, t_out->sizes()); + int w_out = utils::val_at(-1, t_out->sizes()); + for (size_t i = 0; i < h_out; ++i) { + for (size_t j = 0; j < w_out; ++j) { + size_t idx_out = i * w_out + j; + CHECK_VALUE(output_data, idx_out, 
data_out_expected[idx_out]);
+    }
+  }
+}
+
+TEST(VulkanComputeGraphOpsTest, grid_priors_test) {
+  test_grid_priors(
+      /*input size = */ {1, 5, 2, 3},
+      /*output size = */ {6, 2},
+      /*stride = */ 1,
+      /*offset = */ 0.0,
+      /*data_out_expected = */ {0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 2, 1});
+
+  test_grid_priors(
+      /*input size = */ {1, 5, 2, 3},
+      /*output size = */ {6, 2},
+      /*stride = */ 8,
+      /*offset = */ 0.5,
+      /*data_out_expected = */ {4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12});
+}

From a567abfd0853c0c59302173cb11c727d5fae3416 Mon Sep 17 00:00:00 2001
From: Chirag Modi
Date: Tue, 30 Jul 2024 15:38:18 -0700
Subject: [PATCH 33/75] Porting over ET MultiModal Demo App (#4455)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4455

Adding an ET demo app for multimodal support. This is the first diff; it supports Llama3. It includes major changes to the existing Llama Demo app, adding the following features:
1. Llama3 support
2. Settings activity
3. UI/UX improvements to the MainActivity
4. Ability to add multiple images, in preparation for multimodal support
5. Metrics

Note: You'll need to build the `executorch-llama.aar` and have it placed in the `app/libs` folder.

Reviewed By: kirklandsign

Differential Revision: D60416605

fbshipit-source-id: 262329c30e1ec28c3905da5c040dc661307f8666
---
 .../android/LlamaDemo/app/build.gradle.kts    |   5 +-
 .../app/src/main/AndroidManifest.xml          |  32 +-
 .../example/executorchllamademo/AppLog.java   |  49 ++
 .../DemoSharedPreferences.java                |  90 +++
 .../example/executorchllamademo/ETImage.java  | 116 ++++
 .../executorchllamademo/ETLogging.java        |  54 ++
 .../executorchllamademo/LogsActivity.java     |  86 +++
 .../executorchllamademo/LogsAdapter.java      |  45 ++
 .../executorchllamademo/MainActivity.java     | 631 +++++++++++++++---
 .../example/executorchllamademo/Message.java  |  60 +-
 .../executorchllamademo/MessageAdapter.java   |  67 +-
 .../executorchllamademo/MessageType.java      |  15 +
 .../executorchllamademo/SettingsActivity.java | 325 +++++++++
 .../executorchllamademo/SettingsFields.java   | 135 ++++
 .../src/main/res/drawable/banner_shape.xml    |   7 +
 .../src/main/res/drawable/baseline_add_24.xml |   5 +
 .../baseline_add_photo_alternate_24.xml       |   5 +
 .../main/res/drawable/baseline_article_24.xml |   5 +
 .../main/res/drawable/baseline_close_24.xml   |   5 +
 .../drawable/baseline_delete_forever_24.xml   |   5 +
 .../res/drawable/baseline_restart_alt_24.xml  |   6 +
 .../main/res/drawable/baseline_send_24.xml    |   5 +
 .../res/drawable/baseline_settings_24.xml     |  10 +
 .../main/res/drawable/baseline_stop_24.xml    |   5 +
 .../app/src/main/res/drawable/btn.xml         |   8 +
 .../main/res/drawable/custom_button_round.xml |   7 +
 .../main/res/drawable/input_text_shape.xml    |  10 +
 .../app/src/main/res/drawable/logo.png        | Bin 0 -> 33036 bytes
 .../main/res/drawable/outline_add_box_48.xml  |   5 +
 .../outline_arrow_drop_down_circle_24.xml     |   5 +
 .../res/drawable/outline_camera_alt_48.xml    |   5 +
 .../main/res/drawable/outline_image_48.xml    |   5 +
 .../src/main/res/drawable/prompt_shape.xml    |   6 +
 .../app/src/main/res/layout/activity_logs.xml |  55 ++
 .../app/src/main/res/layout/activity_main.xml | 241 ++++++-
 .../src/main/res/layout/activity_settings.xml | 233 +++++++
 .../app/src/main/res/layout/logs_message.xml  |  16 +
 .../src/main/res/layout/received_message.xml  |  40 +-
 .../app/src/main/res/layout/sent_message.xml  |  58 +-
 .../src/main/res/layout/system_message.xml    |  23 +
 .../app/src/main/res/values/colors.xml        |   4 +-
 .../app/src/main/res/values/strings.xml       |   4 +
 .../app/src/main/res/values/styles.xml        |   4 +
 43 files changed, 2328 insertions(+),
169 deletions(-) create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_settings.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/logs_message.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/system_message.xml diff --git 
a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts index 3c168689f7..37c8cbf0ba 100644 --- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts +++ b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts @@ -17,7 +17,7 @@ android { defaultConfig { applicationId = "com.example.executorchllamademo" - minSdk = 24 + minSdk = 28 targetSdk = 33 versionCode = 1 versionName = "1.0" @@ -56,7 +56,10 @@ dependencies { implementation("androidx.camera:camera-core:1.3.0-rc02") implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") implementation("com.facebook.fbjni:fbjni:0.5.1") + implementation("com.google.code.gson:gson:2.8.6") implementation(files("libs/executorch-llama.aar")) + implementation("com.google.android.material:material:1.12.0") + implementation("androidx.activity:activity:1.9.0") testImplementation("junit:junit:4.13.2") androidTestImplementation("androidx.test.ext:junit:1.1.5") androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml index 3eaf301b5a..bb231420df 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml @@ -3,32 +3,44 @@ xmlns:tools="http://schemas.android.com/tools" package="com.example.executorchllamademo"> - + + + + + + + - + + android:theme="@style/Theme.AppCompat.Light.NoActionBar"> diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java new file mode 100644 index 0000000000..36d0741938 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java @@ -0,0 +1,49 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package com.example.executorchllamademo; + +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; + +public class AppLog { + private final Long timestamp; + private final String message; + + public AppLog(String message) { + this.timestamp = getCurrentTimeStamp(); + this.message = message; + } + + public Long getTimestamp() { + return timestamp; + } + + public String getMessage() { + return message; + } + + public String getFormattedLog() { + return "[" + getFormattedTimeStamp() + "] " + message; + } + + private Long getCurrentTimeStamp() { + return System.currentTimeMillis(); + } + + private String getFormattedTimeStamp() { + return formatDate(timestamp); + } + + private String formatDate(long milliseconds) { + SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.getDefault()); + Date date = new Date(milliseconds); + return formatter.format(date); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java new file mode 100644 index 0000000000..99a94c00eb --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.content.Context; +import android.content.SharedPreferences; +import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; +import java.lang.reflect.Type; +import java.util.ArrayList; + +public class DemoSharedPreferences { + Context context; + SharedPreferences sharedPreferences; + + public DemoSharedPreferences(Context context) { + this.context = context; + this.sharedPreferences = getSharedPrefs(); + } + + private SharedPreferences getSharedPrefs() { + return context.getSharedPreferences( + context.getString(R.string.demo_pref_file_key), Context.MODE_PRIVATE); + } + + public String getSavedMessages() { + return sharedPreferences.getString(context.getString(R.string.saved_messages_json_key), ""); + } + + public void addMessages(MessageAdapter messageAdapter) { + SharedPreferences.Editor editor = sharedPreferences.edit(); + Gson gson = new Gson(); + String msgJSON = gson.toJson(messageAdapter.getSavedMessages()); + editor.putString(context.getString(R.string.saved_messages_json_key), msgJSON); + editor.apply(); + } + + public void removeExistingMessages() { + SharedPreferences.Editor editor = sharedPreferences.edit(); + editor.remove(context.getString(R.string.saved_messages_json_key)); + editor.apply(); + } + + public void addSettings(SettingsFields settingsFields) { + SharedPreferences.Editor editor = sharedPreferences.edit(); + Gson gson = new Gson(); + String settingsJSON = gson.toJson(settingsFields); + editor.putString(context.getString(R.string.settings_json_key), settingsJSON); + editor.apply(); + } + + public String getSettings() { + return sharedPreferences.getString(context.getString(R.string.settings_json_key), ""); + } + + public void saveLogs() { + SharedPreferences.Editor editor = sharedPreferences.edit(); + Gson gson = new Gson(); + String msgJSON = gson.toJson(ETLogging.getInstance().getLogs()); + 
editor.putString(context.getString(R.string.logs_json_key), msgJSON); + editor.apply(); + } + + public void removeExistingLogs() { + SharedPreferences.Editor editor = sharedPreferences.edit(); + editor.remove(context.getString(R.string.logs_json_key)); + editor.apply(); + } + + public ArrayList getSavedLogs() { + String logsJSONString = + sharedPreferences.getString(context.getString(R.string.logs_json_key), null); + if (logsJSONString == null || logsJSONString.isEmpty()) { + return new ArrayList<>(); + } + Gson gson = new Gson(); + Type type = new TypeToken>() {}.getType(); + ArrayList appLogs = gson.fromJson(logsJSONString, type); + if (appLogs == null) { + return new ArrayList<>(); + } + return appLogs; + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java new file mode 100644 index 0000000000..cf3c3e5f0a --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java @@ -0,0 +1,116 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.content.ContentResolver; +import android.graphics.Bitmap; +import android.graphics.BitmapFactory; +import android.graphics.Color; +import android.net.Uri; +import androidx.annotation.Nullable; +import java.io.FileNotFoundException; +import java.io.InputStream; + +public class ETImage { + private int width; + private int height; + private final byte[] bytes; + private final Uri uri; + private final ContentResolver contentResolver; + + ETImage(ContentResolver contentResolver, Uri uri) { + this.contentResolver = contentResolver; + this.uri = uri; + bytes = getBytesFromImageURI(uri); + } + + public int getWidth() { + return width; + } + + public int getHeight() { + return height; + } + + public Uri getUri() { + return uri; + } + + public byte[] getBytes() { + return bytes; + } + + private byte[] getBytesFromImageURI(Uri uri) { + try { + int RESIZED_IMAGE_WIDTH = 336; + Bitmap bitmap = resizeImage(uri, RESIZED_IMAGE_WIDTH); + + if (bitmap == null) { + ETLogging.getInstance().log("Unable to get bytes from Image URI. 
Bitmap is null"); + return new byte[0]; + } + + width = bitmap.getWidth(); + height = bitmap.getHeight(); + + byte[] rgbValues = new byte[width * height * 3]; + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + // Get the color of the current pixel + int color = bitmap.getPixel(x, y); + + // Extract the RGB values from the color + int red = Color.red(color); + int green = Color.green(color); + int blue = Color.blue(color); + + // Store the RGB values in the byte array + rgbValues[(y * width + x) * 3] = (byte) red; + rgbValues[(y * width + x) * 3 + 1] = (byte) green; + rgbValues[(y * width + x) * 3 + 2] = (byte) blue; + } + } + return rgbValues; + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } + } + + @Nullable + private Bitmap resizeImage(Uri uri, int maxLength) throws FileNotFoundException { + InputStream inputStream = contentResolver.openInputStream(uri); + if (inputStream == null) { + ETLogging.getInstance().log("Unable to resize image, input streams is null"); + return null; + } + Bitmap bitmap = BitmapFactory.decodeStream(inputStream); + if (bitmap == null) { + ETLogging.getInstance().log("Unable to resize image, bitmap during decode stream is null"); + return null; + } + + float aspectRatio; + int finalWidth, finalHeight; + + if (bitmap.getWidth() > bitmap.getHeight()) { + // width > height --> width = maxLength, height scale with aspect ratio + aspectRatio = bitmap.getWidth() / (float) bitmap.getHeight(); + finalWidth = maxLength; + finalHeight = Math.round(maxLength / aspectRatio); + } else { + // height >= width --> height = maxLength, width scale with aspect ratio + aspectRatio = bitmap.getHeight() / (float) bitmap.getWidth(); + finalHeight = maxLength; + finalWidth = Math.round(maxLength / aspectRatio); + } + + return Bitmap.createScaledBitmap(bitmap, finalWidth, finalHeight, false); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java new file mode 100644 index 0000000000..e595348945 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java @@ -0,0 +1,54 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package com.example.executorchllamademo; + +import android.app.Application; +import android.util.Log; +import java.util.ArrayList; + +public class ETLogging extends Application { + private static ETLogging singleton; + + private ArrayList logs; + private DemoSharedPreferences mDemoSharedPreferences; + + @Override + public void onCreate() { + super.onCreate(); + singleton = this; + mDemoSharedPreferences = new DemoSharedPreferences(this.getApplicationContext()); + logs = mDemoSharedPreferences.getSavedLogs(); + if (logs == null) { // We don't have existing sharedPreference stored + logs = new ArrayList<>(); + } + } + + public static ETLogging getInstance() { + return singleton; + } + + public void log(String message) { + AppLog appLog = new AppLog(message); + logs.add(appLog); + Log.d("ETLogging", appLog.getMessage()); + } + + public ArrayList getLogs() { + return logs; + } + + public void clearLogs() { + logs.clear(); + mDemoSharedPreferences.removeExistingLogs(); + } + + public void saveLogs() { + mDemoSharedPreferences.saveLogs(); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java new file mode 100644 index 0000000000..8700528d44 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.app.AlertDialog; +import android.content.DialogInterface; +import android.os.Bundle; +import android.widget.ImageButton; +import android.widget.ListView; +import androidx.appcompat.app.AppCompatActivity; +import androidx.core.graphics.Insets; +import androidx.core.view.ViewCompat; +import androidx.core.view.WindowInsetsCompat; + +public class LogsActivity extends AppCompatActivity { + + private LogsAdapter mLogsAdapter; + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_logs); + ViewCompat.setOnApplyWindowInsetsListener( + requireViewById(R.id.main), + (v, insets) -> { + Insets systemBars = insets.getInsets(WindowInsetsCompat.Type.systemBars()); + v.setPadding(systemBars.left, systemBars.top, systemBars.right, systemBars.bottom); + return insets; + }); + + setupLogs(); + setupClearLogsButton(); + } + + @Override + public void onResume() { + super.onResume(); + mLogsAdapter.clear(); + mLogsAdapter.addAll(ETLogging.getInstance().getLogs()); + mLogsAdapter.notifyDataSetChanged(); + } + + private void setupLogs() { + ListView mLogsListView = requireViewById(R.id.logsListView); + mLogsAdapter = new LogsAdapter(this, R.layout.logs_message); + + mLogsListView.setAdapter(mLogsAdapter); + mLogsAdapter.addAll(ETLogging.getInstance().getLogs()); + mLogsAdapter.notifyDataSetChanged(); + } + + private void setupClearLogsButton() { + ImageButton clearLogsButton = requireViewById(R.id.clearLogsButton); + clearLogsButton.setOnClickListener( + view -> { + new AlertDialog.Builder(this) + .setTitle("Delete Logs History") + .setMessage("Do you really want to delete logs history?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new 
DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + // Clear the messageAdapter and sharedPreference + ETLogging.getInstance().clearLogs(); + mLogsAdapter.clear(); + mLogsAdapter.notifyDataSetChanged(); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + @Override + protected void onDestroy() { + super.onDestroy(); + ETLogging.getInstance().saveLogs(); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java new file mode 100644 index 0000000000..76c6a1aa1b --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.view.LayoutInflater; +import android.view.View; +import android.view.ViewGroup; +import android.widget.ArrayAdapter; +import android.widget.TextView; +import androidx.annotation.NonNull; +import java.util.Objects; + +public class LogsAdapter extends ArrayAdapter { + public LogsAdapter(android.content.Context context, int resource) { + super(context, resource); + } + + static class ViewHolder { + private TextView logTextView; + } + + @NonNull + @Override + public View getView(int position, View convertView, @NonNull ViewGroup parent) { + ViewHolder mViewHolder = null; + + String logMessage = Objects.requireNonNull(getItem(position)).getFormattedLog(); + + if (convertView == null || convertView.getTag() == null) { + mViewHolder = new ViewHolder(); + convertView = LayoutInflater.from(getContext()).inflate(R.layout.logs_message, parent, false); + mViewHolder.logTextView = convertView.requireViewById(R.id.logsTextView); + } else { + mViewHolder = (ViewHolder) convertView.getTag(); + } + mViewHolder.logTextView.setText(logMessage); + return convertView; + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index 2c94c242ed..44d310231a 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -8,32 +8,72 @@ package com.example.executorchllamademo; -import android.app.Activity; +import android.Manifest; import android.app.ActivityManager; import android.app.AlertDialog; -import android.content.Context; +import android.content.ContentResolver; +import android.content.ContentValues; +import android.content.Intent; +import android.content.pm.PackageManager; +import android.net.Uri; import android.os.Bundle; +import android.os.Handler; +import android.os.Looper; +import android.provider.MediaStore; import android.system.ErrnoException; import android.system.Os; -import android.widget.Button; +import android.text.InputType; +import android.util.Log; +import android.view.View; import android.widget.EditText; import android.widget.ImageButton; +import android.widget.ImageView; +import android.widget.LinearLayout; import 
android.widget.ListView; -import java.io.File; +import android.widget.TextView; +import android.widget.Toast; +import androidx.activity.result.ActivityResultLauncher; +import androidx.activity.result.PickVisualMediaRequest; +import androidx.activity.result.contract.ActivityResultContracts; +import androidx.annotation.NonNull; +import androidx.appcompat.app.AppCompatActivity; +import androidx.constraintlayout.widget.ConstraintLayout; +import androidx.core.app.ActivityCompat; +import androidx.core.content.ContextCompat; +import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; +import java.lang.reflect.Type; +import java.util.ArrayList; +import java.util.List; import org.pytorch.executorch.LlamaCallback; import org.pytorch.executorch.LlamaModule; -public class MainActivity extends Activity implements Runnable, LlamaCallback { +public class MainActivity extends AppCompatActivity implements Runnable, LlamaCallback { private EditText mEditTextMessage; - private Button mSendButton; - private ImageButton mModelButton; + private ImageButton mSendButton; + private ImageButton mGalleryButton; + private ImageButton mCameraButton; private ListView mMessagesView; private MessageAdapter mMessageAdapter; private LlamaModule mModule = null; private Message mResultMessage = null; - - private String mModelFilePath = ""; - private String mTokenizerFilePath = ""; + private ImageButton mSettingsButton; + private TextView mMemoryView; + private ActivityResultLauncher mPickGallery; + private ActivityResultLauncher mCameraRoll; + private List mSelectedImageUri; + private ConstraintLayout mMediaPreviewConstraintLayout; + private LinearLayout mAddMediaLayout; + private static final int MAX_NUM_OF_IMAGES = 5; + private static final int REQUEST_IMAGE_CAPTURE = 1; + private Uri cameraImageUri; + private DemoSharedPreferences mDemoSharedPreferences; + private SettingsFields mCurrentSettingsFields; + private Handler mMemoryUpdateHandler; + private Runnable memoryUpdater; + // UI Specific to user using INSTRUCT_MODE + private boolean INSTRUCT_MODE = false; + private String INSTRUCT_INSTRUCTION = "In Instruct Mode. 
Press SEND"; @Override public void onResult(String result) { @@ -52,23 +92,13 @@ public void onStats(float tps) { }); } - private static String[] listLocalFile(String path, String suffix) { - File directory = new File(path); - if (directory.exists() && directory.isDirectory()) { - File[] files = directory.listFiles((dir, name) -> name.toLowerCase().endsWith(suffix)); - String[] result = new String[files.length]; - for (int i = 0; i < files.length; i++) { - if (files[i].isFile() && files[i].getName().endsWith(suffix)) { - result[i] = files[i].getAbsolutePath(); - } - } - return result; + private void setLocalModel(String modelPath, String tokenizerPath, float temperature) { + if (mModule != null) { + mModule.resetNative(); + mModule = null; } - return new String[0]; - } - - private void setLocalModel(String modelPath, String tokenizerPath) { - Message modelLoadingMessage = new Message("Loading model...", false); + Message modelLoadingMessage = new Message("Loading model...", false, MessageType.SYSTEM, 0); + ETLogging.getInstance().log("Loading model " + modelPath + " with tokenizer " + tokenizerPath); runOnUiThread( () -> { mSendButton.setEnabled(false); @@ -76,9 +106,15 @@ private void setLocalModel(String modelPath, String tokenizerPath) { mMessageAdapter.notifyDataSetChanged(); }); long runStartTime = System.currentTimeMillis(); - mModule = new LlamaModule(modelPath, tokenizerPath, 0.8f); + mModule = new LlamaModule(modelPath, tokenizerPath, temperature); int loadResult = mModule.load(); + long loadDuration = System.currentTimeMillis() - runStartTime; + String modelLoadError = ""; + String modelInfo = ""; if (loadResult != 0) { + // TODO: Map the error code to a reason to let the user know why model loading failed + modelInfo = "*Model could not load (Error Code: " + loadResult + ")*" + "\n"; + loadDuration = 0; AlertDialog.Builder builder = new AlertDialog.Builder(this); builder.setTitle("Load failed: " + loadResult); runOnUiThread( @@ -86,18 +122,37 @@ private void setLocalModel(String modelPath, String tokenizerPath) { AlertDialog alert = builder.create(); alert.show(); }); + } else { + String[] segments = modelPath.split("/"); + String pteName = segments[segments.length - 1]; + segments = tokenizerPath.split("/"); + String tokenizerName = segments[segments.length - 1]; + modelInfo = + "Successfully loaded model. " + + pteName + + " and tokenizer " + + tokenizerName + + " in " + + (float) loadDuration / 1000 + + " sec." + + " You can send text or image for inference"; } - long loadDuration = System.currentTimeMillis() - runStartTime; - String modelInfo = - "Model path: " + Message modelLoadedMessage = new Message(modelInfo, false, MessageType.SYSTEM, 0); + + String modelLoggingInfo = + modelLoadError + + "Model path: " + modelPath + "\nTokenizer path: " + tokenizerPath + + "\nTemperature: " + + temperature + "\nModel loaded time: " + loadDuration + " ms"; - Message modelLoadedMessage = new Message(modelInfo, false); + ETLogging.getInstance().log("Load complete. " + modelLoggingInfo); + runOnUiThread( () -> { mSendButton.setEnabled(true); @@ -107,55 +162,26 @@ private void setLocalModel(String modelPath, String tokenizerPath) { }); } - private String memoryInfo() { - final ActivityManager am = (ActivityManager) getSystemService(Context.ACTIVITY_SERVICE); - ActivityManager.MemoryInfo memInfo = new ActivityManager.MemoryInfo(); - am.getMemoryInfo(memInfo); - return "Total RAM: " - + Math.floorDiv(memInfo.totalMem, 1000000) - + " MB. 
Available RAM: " - + Math.floorDiv(memInfo.availMem, 1000000) - + " MB."; - } - - private void modelDialog() { - String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte"); - String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin"); - String[] modelFiles = listLocalFile("/data/local/tmp/llama/", ".model"); - String[] tokenizerFiles = new String[binFiles.length + modelFiles.length]; - System.arraycopy(binFiles, 0, tokenizerFiles, 0, binFiles.length); - System.arraycopy(modelFiles, 0, tokenizerFiles, binFiles.length, modelFiles.length); - AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); - modelPathBuilder.setTitle("Select model path"); - AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); - tokenizerPathBuilder.setTitle("Select tokenizer path"); - modelPathBuilder.setSingleChoiceItems( - pteFiles, - -1, - (dialog, item) -> { - mModelFilePath = pteFiles[item]; - mEditTextMessage.setText(""); - dialog.dismiss(); - tokenizerPathBuilder.create().show(); - }); - - tokenizerPathBuilder.setSingleChoiceItems( - tokenizerFiles, - -1, - (dialog, item) -> { - mTokenizerFilePath = tokenizerFiles[item]; - Runnable runnable = - new Runnable() { - @Override - public void run() { - setLocalModel(mModelFilePath, mTokenizerFilePath); - } - }; - new Thread(runnable).start(); - dialog.dismiss(); - }); + private void loadLocalModelAndParameters( + String modelFilePath, String tokenizerFilePath, float temperature) { + Runnable runnable = + new Runnable() { + @Override + public void run() { + setLocalModel(modelFilePath, tokenizerFilePath, temperature); + } + }; + new Thread(runnable).start(); + } - modelPathBuilder.create().show(); + private void populateExistingMessages(String existingMsgJSON) { + Gson gson = new Gson(); + Type type = new TypeToken>() {}.getType(); + ArrayList savedMessages = gson.fromJson(existingMsgJSON, type); + for (Message msg : savedMessages) { + mMessageAdapter.add(msg); + } + mMessageAdapter.notifyDataSetChanged(); } @Override @@ -169,27 +195,379 @@ protected void onCreate(Bundle savedInstanceState) { finish(); } - mEditTextMessage = findViewById(R.id.editTextMessage); - mSendButton = findViewById(R.id.sendButton); + mEditTextMessage = requireViewById(R.id.editTextMessage); + mSendButton = requireViewById(R.id.sendButton); mSendButton.setEnabled(false); - mModelButton = findViewById(R.id.modelButton); - mMessagesView = findViewById(R.id.messages_view); - mMessageAdapter = new MessageAdapter(this, R.layout.sent_message); + mMessagesView = requireViewById(R.id.messages_view); + mMessageAdapter = new MessageAdapter(this, R.layout.sent_message, new ArrayList()); mMessagesView.setAdapter(mMessageAdapter); - mModelButton.setOnClickListener( + mDemoSharedPreferences = new DemoSharedPreferences(this.getApplicationContext()); + String existingMsgJSON = mDemoSharedPreferences.getSavedMessages(); + if (!existingMsgJSON.isEmpty()) { + populateExistingMessages(existingMsgJSON); + } + mSettingsButton = requireViewById(R.id.settings); + mSettingsButton.setOnClickListener( view -> { - mModule.stop(); - mMessageAdapter.clear(); - mMessageAdapter.notifyDataSetChanged(); - modelDialog(); + Intent myIntent = new Intent(MainActivity.this, SettingsActivity.class); + MainActivity.this.startActivity(myIntent); }); + mCurrentSettingsFields = new SettingsFields(); + mMemoryUpdateHandler = new Handler(Looper.getMainLooper()); onModelRunStopped(); - modelDialog(); + setupMediaButton(); + setupGalleryPicker(); + setupCameraRoll(); + startMemoryUpdate(); 
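+    // Wires the logs icon to open LogsActivity for viewing in-app logs (see setupShowLogsButton below).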
+ setupShowLogsButton(); + } + + @Override + protected void onPause() { + super.onPause(); + mDemoSharedPreferences.addMessages(mMessageAdapter); + } + + @Override + protected void onResume() { + super.onResume(); + // Check for if settings parameters have changed + Gson gson = new Gson(); + String settingsFieldsJSON = mDemoSharedPreferences.getSettings(); + if (!settingsFieldsJSON.isEmpty()) { + SettingsFields updatedSettingsFields = + gson.fromJson(settingsFieldsJSON, SettingsFields.class); + if (updatedSettingsFields == null) { + // Added this check, because gson.fromJson can return null + askUserToSelectModel(); + return; + } + boolean isUpdated = !mCurrentSettingsFields.equals(updatedSettingsFields); + boolean isLoadModel = updatedSettingsFields.getIsLoadModel(); + if (isUpdated) { + if (isLoadModel) { + // If users change the model file, but not pressing loadModelButton, we won't load the new + // model + checkForUpdateAndReloadModel(updatedSettingsFields); + } else { + askUserToSelectModel(); + } + checkForPromptChange(updatedSettingsFields); + checkForClearChatHistory(updatedSettingsFields); + // Update current to point to the latest + mCurrentSettingsFields = new SettingsFields(updatedSettingsFields); + } + } else { + askUserToSelectModel(); + } + } + + private void checkForClearChatHistory(SettingsFields updatedSettingsFields) { + if (updatedSettingsFields.getIsClearChatHistory()) { + mMessageAdapter.clear(); + mMessageAdapter.notifyDataSetChanged(); + mDemoSharedPreferences.removeExistingMessages(); + // changing to false since chat history has been cleared. + updatedSettingsFields.saveIsClearChatHistory(false); + mDemoSharedPreferences.addSettings(updatedSettingsFields); + } + } + + private void checkForUpdateAndReloadModel(SettingsFields updatedSettingsFields) { + // TODO need to add 'load model' in settings and queue loading based on that + String modelPath = updatedSettingsFields.getModelFilePath(); + String tokenizerPath = updatedSettingsFields.getTokenizerFilePath(); + double temperature = updatedSettingsFields.getTemperature(); + if (!modelPath.isEmpty() && !tokenizerPath.isEmpty()) { + if (updatedSettingsFields.getIsLoadModel() + || !modelPath.equals(mCurrentSettingsFields.getModelFilePath()) + || !tokenizerPath.equals(mCurrentSettingsFields.getTokenizerFilePath()) + || temperature != mCurrentSettingsFields.getTemperature()) { + loadLocalModelAndParameters( + updatedSettingsFields.getModelFilePath(), + updatedSettingsFields.getTokenizerFilePath(), + (float) updatedSettingsFields.getTemperature()); + updatedSettingsFields.saveLoadModelAction(false); + mDemoSharedPreferences.addSettings(updatedSettingsFields); + } + } else { + askUserToSelectModel(); + } + } + + private void checkForPromptChange(SettingsFields updatedSettingsFields) { + if (updatedSettingsFields.isSystemPromptChanged() + || updatedSettingsFields.isUserPromptChanged()) { + enableInstructMode(); + } else { + disableInstructMode(); + } + } + + private void enableInstructMode() { + INSTRUCT_MODE = true; + mEditTextMessage.setText(INSTRUCT_INSTRUCTION); + mEditTextMessage.setInputType(InputType.TYPE_NULL); + mEditTextMessage.clearFocus(); + } + + private void disableInstructMode() { + INSTRUCT_MODE = false; + mEditTextMessage.setText(""); + mEditTextMessage.setInputType(InputType.TYPE_CLASS_TEXT); + mEditTextMessage.clearFocus(); + } + + private void askUserToSelectModel() { + String askLoadModel = + "To get started, select your desired model and tokenizer " + "from the top right corner"; + Message 
askLoadModelMessage = new Message(askLoadModel, false, MessageType.SYSTEM, 0); + ETLogging.getInstance().log(askLoadModel); + runOnUiThread( + () -> { + mMessageAdapter.add(askLoadModelMessage); + mMessageAdapter.notifyDataSetChanged(); + }); + } + + private void setupShowLogsButton() { + ImageButton showLogsButton = requireViewById(R.id.showLogsButton); + showLogsButton.setOnClickListener( + view -> { + Intent myIntent = new Intent(MainActivity.this, LogsActivity.class); + MainActivity.this.startActivity(myIntent); + }); + } + + private void setupMediaButton() { + mAddMediaLayout = requireViewById(R.id.addMediaLayout); + mAddMediaLayout.setVisibility(View.GONE); // We hide this initially + + ImageButton addMediaButton = requireViewById(R.id.addMediaButton); + addMediaButton.setOnClickListener( + view -> { + mAddMediaLayout.setVisibility(View.VISIBLE); + }); + + mGalleryButton = requireViewById(R.id.galleryButton); + mGalleryButton.setOnClickListener( + view -> { + // Launch the photo picker and let the user choose only images. + mPickGallery.launch( + new PickVisualMediaRequest.Builder() + .setMediaType(ActivityResultContracts.PickVisualMedia.ImageOnly.INSTANCE) + .build()); + }); + mCameraButton = requireViewById(R.id.cameraButton); + mCameraButton.setOnClickListener( + view -> { + Log.d("CameraRoll", "Check permission"); + if (ContextCompat.checkSelfPermission(MainActivity.this, Manifest.permission.CAMERA) + != PackageManager.PERMISSION_GRANTED) { + ActivityCompat.requestPermissions( + MainActivity.this, + new String[] {Manifest.permission.CAMERA}, + REQUEST_IMAGE_CAPTURE); + } else { + launchCamera(); + } + }); + } + + private void setupCameraRoll() { + // Registers a camera roll activity launcher. + mCameraRoll = + registerForActivityResult( + new ActivityResultContracts.TakePicture(), + result -> { + if (result && cameraImageUri != null) { + Log.d("CameraRoll", "Photo saved to uri: " + cameraImageUri); + mAddMediaLayout.setVisibility(View.GONE); + List uris = new ArrayList<>(); + uris.add(cameraImageUri); + showMediaPreview(uris); + } else { + // Delete the temp image file based on the url since the photo is not successfully + // taken + if (cameraImageUri != null) { + ContentResolver contentResolver = MainActivity.this.getContentResolver(); + contentResolver.delete(cameraImageUri, null, null); + Log.d("CameraRoll", "No photo taken. 
Delete temp uri"); + } + } + }); + mMediaPreviewConstraintLayout = requireViewById(R.id.mediaPreviewConstraintLayout); + ImageButton mediaPreviewCloseButton = requireViewById(R.id.mediaPreviewCloseButton); + mediaPreviewCloseButton.setOnClickListener( + view -> { + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + mSelectedImageUri = null; + }); + + ImageButton addMoreImageButton = requireViewById(R.id.addMoreImageButton); + addMoreImageButton.setOnClickListener( + view -> { + Log.d("addMore", "clicked"); + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + // Direct user to select type of input + mCameraButton.callOnClick(); + }); + } + + private String updateMemoryUsage() { + ActivityManager.MemoryInfo memoryInfo = new ActivityManager.MemoryInfo(); + ActivityManager activityManager = (ActivityManager) getSystemService(ACTIVITY_SERVICE); + if (activityManager == null) { + return "---"; + } + activityManager.getMemoryInfo(memoryInfo); + long totalMem = memoryInfo.totalMem / (1024 * 1024); + long availableMem = memoryInfo.availMem / (1024 * 1024); + long usedMem = totalMem - availableMem; + return usedMem + "MB"; + } + + private void startMemoryUpdate() { + mMemoryView = requireViewById(R.id.ram_usage_live); + memoryUpdater = + new Runnable() { + @Override + public void run() { + mMemoryView.setText(updateMemoryUsage()); + mMemoryUpdateHandler.postDelayed(this, 1000); + } + }; + mMemoryUpdateHandler.post(memoryUpdater); + } + + @Override + public void onRequestPermissionsResult( + int requestCode, @NonNull String[] permissions, @NonNull int[] grantResults) { + super.onRequestPermissionsResult(requestCode, permissions, grantResults); + if (requestCode == REQUEST_IMAGE_CAPTURE && grantResults.length != 0) { + if (grantResults[0] == PackageManager.PERMISSION_GRANTED) { + launchCamera(); + } else if (grantResults[0] == PackageManager.PERMISSION_DENIED) { + Log.d("CameraRoll", "Permission denied"); + } + } + } + + private void launchCamera() { + ContentValues values = new ContentValues(); + values.put(MediaStore.Images.Media.TITLE, "New Picture"); + values.put(MediaStore.Images.Media.DESCRIPTION, "From Camera"); + values.put(MediaStore.Images.Media.RELATIVE_PATH, "DCIM/Camera/"); + cameraImageUri = + MainActivity.this + .getContentResolver() + .insert(MediaStore.Images.Media.EXTERNAL_CONTENT_URI, values); + mCameraRoll.launch(cameraImageUri); + } + + private void setupGalleryPicker() { + // Registers a photo picker activity launcher in single-select mode. 
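+    // Despite the "single-select" comment above, the launcher registered below uses
+    // PickMultipleVisualMedia and allows picking up to MAX_NUM_OF_IMAGES images; read
+    // permission is persisted on each returned URI before the images are previewed.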
+ mPickGallery = + registerForActivityResult( + new ActivityResultContracts.PickMultipleVisualMedia(MAX_NUM_OF_IMAGES), + uris -> { + if (!uris.isEmpty()) { + Log.d("PhotoPicker", "Selected URIs: " + uris); + mAddMediaLayout.setVisibility(View.GONE); + for (Uri uri : uris) { + MainActivity.this + .getContentResolver() + .takePersistableUriPermission(uri, Intent.FLAG_GRANT_READ_URI_PERMISSION); + } + showMediaPreview(uris); + } else { + Log.d("PhotoPicker", "No media selected"); + } + }); + + mMediaPreviewConstraintLayout = requireViewById(R.id.mediaPreviewConstraintLayout); + ImageButton mediaPreviewCloseButton = requireViewById(R.id.mediaPreviewCloseButton); + mediaPreviewCloseButton.setOnClickListener( + view -> { + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + mSelectedImageUri = null; + }); + + ImageButton addMoreImageButton = requireViewById(R.id.addMoreImageButton); + addMoreImageButton.setOnClickListener( + view -> { + Log.d("addMore", "clicked"); + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + mGalleryButton.callOnClick(); + }); + } + + private List getProcessedImagesForModel(List uris) { + List imageList = new ArrayList<>(); + if (uris != null) { + uris.forEach( + (uri) -> { + imageList.add(new ETImage(this.getContentResolver(), uri)); + }); + } + return imageList; + } + + private void showMediaPreview(List uris) { + if (mSelectedImageUri == null) { + mSelectedImageUri = uris; + } else { + mSelectedImageUri.addAll(uris); + } + + if (mSelectedImageUri.size() > MAX_NUM_OF_IMAGES) { + mSelectedImageUri = mSelectedImageUri.subList(0, MAX_NUM_OF_IMAGES); + Toast.makeText( + this, "Only max " + MAX_NUM_OF_IMAGES + " images are allowed", Toast.LENGTH_SHORT) + .show(); + } + Log.d("mSelectedImageUri", mSelectedImageUri.size() + " " + mSelectedImageUri); + + mMediaPreviewConstraintLayout.setVisibility(View.VISIBLE); + + List imageViews = new ArrayList(); + + // Pre-populate all the image views that are available from the layout (currently max 5) + imageViews.add(requireViewById(R.id.mediaPreviewImageView1)); + imageViews.add(requireViewById(R.id.mediaPreviewImageView2)); + imageViews.add(requireViewById(R.id.mediaPreviewImageView3)); + imageViews.add(requireViewById(R.id.mediaPreviewImageView4)); + imageViews.add(requireViewById(R.id.mediaPreviewImageView5)); + + // Hide all the image views (reset state) + for (int i = 0; i < imageViews.size(); i++) { + imageViews.get(i).setVisibility(View.GONE); + } + + // Only show/render those that have proper Image URIs + for (int i = 0; i < mSelectedImageUri.size(); i++) { + imageViews.get(i).setVisibility(View.VISIBLE); + imageViews.get(i).setImageURI(mSelectedImageUri.get(i)); + } + } + + private void addSelectedImagesToChatThread(List selectedImageUri) { + if (selectedImageUri == null) { + return; + } + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + for (int i = 0; i < selectedImageUri.size(); i++) { + Uri imageURI = selectedImageUri.get(i); + Log.d("image uri ", "test " + imageURI.getPath()); + mMessageAdapter.add(new Message(imageURI.toString(), true, MessageType.IMAGE, 0)); + } + mMessageAdapter.notifyDataSetChanged(); } private void onModelRunStarted() { - mSendButton.setText("Stop"); + mSendButton.setClickable(false); + mSendButton.setImageResource(R.drawable.baseline_stop_24); mSendButton.setOnClickListener( view -> { mModule.stop(); @@ -197,16 +575,49 @@ private void onModelRunStarted() { } private void onModelRunStopped() { - setTitle(memoryInfo()); - mSendButton.setText("Generate"); + 
mSendButton.setClickable(true); + mSendButton.setImageResource(R.drawable.baseline_send_24); mSendButton.setOnClickListener( view -> { - String prompt = mEditTextMessage.getText().toString(); - mMessageAdapter.add(new Message(prompt, true)); + addSelectedImagesToChatThread(mSelectedImageUri); + // TODO: When ET supports multimodal, this is where we will add the images as part of the + // prompt. + List processedImageList = getProcessedImagesForModel(mSelectedImageUri); + processedImageList.forEach( + image -> { + ETLogging.getInstance() + .log( + "Image preprocessed:" + + " uri = " + + image.getUri().getLastPathSegment() + + "," + + " width = " + + image.getWidth() + + "," + + " height = " + + image.getHeight() + + "," + + " bytes size = " + + image.getBytes().length); + }); + String prompt; + if (INSTRUCT_MODE) { + prompt = mCurrentSettingsFields.getEntirePrompt(); + mEditTextMessage.setText(INSTRUCT_INSTRUCTION); + } else { + prompt = mEditTextMessage.getText().toString(); + mEditTextMessage.setText(""); + } + mMessageAdapter.add(new Message(prompt, true, MessageType.TEXT, 0)); mMessageAdapter.notifyDataSetChanged(); mEditTextMessage.setText(""); - mResultMessage = new Message("", false); + mResultMessage = new Message("", false, MessageType.TEXT, 0); mMessageAdapter.add(mResultMessage); + // Scroll to bottom of the list + mMessagesView.smoothScrollToPosition(mMessageAdapter.getCount() - 1); + // After images are added to prompt and chat thread, we clear the imageURI list + // Note: This has to be done after imageURIs are no longer needed by LlamaModule + mSelectedImageUri = null; Runnable runnable = new Runnable() { @Override @@ -218,9 +629,11 @@ public void run() { onModelRunStarted(); } }); - + ETLogging.getInstance().log("Running inference.. prompt=" + prompt); + long generateStartTime = System.currentTimeMillis(); mModule.generate(prompt, MainActivity.this); - + long generateDuration = System.currentTimeMillis() - generateStartTime; + mResultMessage.setTotalGenerationTime(generateDuration); runOnUiThread( new Runnable() { @Override @@ -228,6 +641,7 @@ public void run() { onModelRunStopped(); } }); + ETLogging.getInstance().log("Inference completed"); } }; new Thread(runnable).start(); @@ -242,8 +656,27 @@ public void run() { @Override public void run() { mMessageAdapter.notifyDataSetChanged(); - setTitle(memoryInfo()); } }); } + + @Override + public void onBackPressed() { + super.onBackPressed(); + if (mAddMediaLayout != null && mAddMediaLayout.getVisibility() == View.VISIBLE) { + mAddMediaLayout.setVisibility(View.GONE); + } else { + // Default behavior of back button + finish(); + } + } + + @Override + protected void onDestroy() { + super.onDestroy(); + mMemoryUpdateHandler.removeCallbacks(memoryUpdater); + // This is to cover the case where the app is shutdown when user is on MainActivity but + // never clicked on the logsActivity + ETLogging.getInstance().saveLogs(); + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java index 81b77b1aba..b2e5380e2a 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java @@ -8,14 +8,50 @@ package com.example.executorchllamademo; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; + public 
class Message {
private String text;
- private boolean isSent;
+ private final boolean isSent;
private float tokensPerSecond;
+ private long totalGenerationTime;
+ private final long timestamp;
+ private final MessageType messageType;
+ private String imagePath;
+ private final int promptID;
+
+ private static final String TIMESTAMP_FORMAT = "hh:mm a"; // example: 2:23 PM

- public Message(String text, boolean isSent) {
- this.text = text;
+ public Message(String text, boolean isSent, MessageType messageType, int promptID) {
this.isSent = isSent;
+ this.messageType = messageType;
+ this.promptID = promptID;
+
+ if (messageType == MessageType.IMAGE) {
+ this.imagePath = text;
+ } else {
+ this.text = text;
+ }
+
+ if (messageType != MessageType.SYSTEM) {
+ this.timestamp = System.currentTimeMillis();
+ } else {
+ this.timestamp = (long) 0;
+ }
+ }
+
+ public int getPromptID() {
+ return promptID;
+ }
+
+ public MessageType getMessageType() {
+ return messageType;
+ }
+
+ public String getImagePath() {
+ return imagePath;
}

public String getText() {
@@ -34,7 +70,25 @@ public void setTokensPerSecond(float tokensPerSecond) {
this.tokensPerSecond = tokensPerSecond;
}

+ public void setTotalGenerationTime(long totalGenerationTime) {
+ this.totalGenerationTime = totalGenerationTime;
+ }
+
public float getTokensPerSecond() {
return tokensPerSecond;
}
+
+ public long getTotalGenerationTime() {
+ return totalGenerationTime;
+ }
+
+ public long getTimestamp() {
+ return timestamp;
+ }
+
+ public String getFormattedTimestamp() {
+ SimpleDateFormat formatter = new SimpleDateFormat(TIMESTAMP_FORMAT, Locale.getDefault());
+ Date date = new Date(timestamp);
+ return formatter.format(date);
+ }
}
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java
index 656da1967d..d9cbd95a1a 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java
@@ -8,33 +8,86 @@

package com.example.executorchllamademo;

+import android.net.Uri;
import android.view.LayoutInflater;
import android.view.View;
import android.view.ViewGroup;
import android.widget.ArrayAdapter;
+import android.widget.ImageView;
import android.widget.TextView;
+import java.util.ArrayList;

public class MessageAdapter extends ArrayAdapter<Message> {
- public MessageAdapter(android.content.Context context, int resource) {
+
+ private final ArrayList<Message> savedMessages;
+
+ public MessageAdapter(
+ android.content.Context context, int resource, ArrayList<Message> savedMessages) {
super(context, resource);
+ this.savedMessages = savedMessages;
}

@Override
public View getView(int position, View convertView, ViewGroup parent) {
Message currentMessage = getItem(position);
+ int layoutIdForListItem;

- int layoutIdForListItem =
- currentMessage.getIsSent() ? R.layout.sent_message : R.layout.received_message;
+ if (currentMessage.getMessageType() == MessageType.SYSTEM) {
+ layoutIdForListItem = R.layout.system_message;
+ } else {
+ layoutIdForListItem =
+ currentMessage.getIsSent() ?
R.layout.sent_message : R.layout.received_message;
+ }

View listItemView =
LayoutInflater.from(getContext()).inflate(layoutIdForListItem, parent, false);

- TextView messageTextView = listItemView.findViewById(R.id.message_text);
- messageTextView.setText(currentMessage.getText());
+ if (currentMessage.getMessageType() == MessageType.IMAGE) {
+ ImageView messageImageView = listItemView.requireViewById(R.id.message_image);
+ messageImageView.setImageURI(Uri.parse(currentMessage.getImagePath()));
+ TextView messageTextView = listItemView.requireViewById(R.id.message_text);
+ messageTextView.setVisibility(View.GONE);
+ } else {
+ TextView messageTextView = listItemView.requireViewById(R.id.message_text);
+ messageTextView.setText(currentMessage.getText());
+ }
+ String metrics = "";
+ TextView tokensView;

if (currentMessage.getTokensPerSecond() > 0) {
- TextView tokensView = listItemView.findViewById(R.id.tokens_per_second);
- tokensView.setText("" + currentMessage.getTokensPerSecond() + " t/s");
+ metrics = String.format("%.2f", currentMessage.getTokensPerSecond()) + "t/s ";
+ }
+
+ if (currentMessage.getTotalGenerationTime() > 0) {
+ metrics = metrics + (float) currentMessage.getTotalGenerationTime() / 1000 + "s ";
+ }
+
+ if (currentMessage.getTokensPerSecond() > 0 || currentMessage.getTotalGenerationTime() > 0) {
+ tokensView = listItemView.requireViewById(R.id.generation_metrics);
+ tokensView.setText(metrics);
+ TextView separatorView = listItemView.requireViewById(R.id.bar);
+ separatorView.setVisibility(View.VISIBLE);
+ }
+
+ if (currentMessage.getTimestamp() > 0) {
+ TextView timestampView = listItemView.requireViewById(R.id.timestamp);
+ timestampView.setText(currentMessage.getFormattedTimestamp());
}

return listItemView;
}
+
+ @Override
+ public void add(Message msg) {
+ super.add(msg);
+ savedMessages.add(msg);
+ }
+
+ @Override
+ public void clear() {
+ super.clear();
+ savedMessages.clear();
+ }
+
+ public ArrayList<Message> getSavedMessages() {
+ return savedMessages;
+ }
}
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java
new file mode 100644
index 0000000000..6042acb572
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+package com.example.executorchllamademo;
+
+public enum MessageType {
+ TEXT,
+ IMAGE,
+ SYSTEM
+}
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
new file mode 100644
index 0000000000..1d109e0195
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +package com.example.executorchllamademo; + +import android.app.AlertDialog; +import android.content.DialogInterface; +import android.os.Bundle; +import android.text.Editable; +import android.text.TextWatcher; +import android.widget.Button; +import android.widget.EditText; +import android.widget.ImageButton; +import android.widget.TextView; +import androidx.appcompat.app.AppCompatActivity; +import androidx.core.graphics.Insets; +import androidx.core.view.ViewCompat; +import androidx.core.view.WindowInsetsCompat; +import com.google.gson.Gson; +import java.io.File; + +public class SettingsActivity extends AppCompatActivity { + + private String mModelFilePath = ""; + private String mTokenizerFilePath = ""; + private TextView mModelTextView; + private TextView mTokenizerTextView; + private ImageButton mModelImageButton; + private ImageButton mTokenizerImageButton; + private EditText mSystemPromptEditText; + private EditText mUserPromptEditText; + private Button mLoadModelButton; + private double mSetTemperature; + private String mSystemPrompt; + private String mUserPrompt; + + public SettingsFields mSettingsFields; + + private DemoSharedPreferences mDemoSharedPreferences; + public static double TEMPERATURE_MIN_VALUE = 0.1; + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_settings); + ViewCompat.setOnApplyWindowInsetsListener( + requireViewById(R.id.main), + (v, insets) -> { + Insets systemBars = insets.getInsets(WindowInsetsCompat.Type.systemBars()); + v.setPadding(systemBars.left, systemBars.top, systemBars.right, systemBars.bottom); + return insets; + }); + mDemoSharedPreferences = new DemoSharedPreferences(getBaseContext()); + mSettingsFields = new SettingsFields(); + setupSettings(); + } + + private void setupSettings() { + mModelTextView = requireViewById(R.id.modelTextView); + mTokenizerTextView = requireViewById(R.id.tokenizerTextView); + mModelImageButton = requireViewById(R.id.modelImageButton); + mTokenizerImageButton = requireViewById(R.id.tokenizerImageButton); + mSystemPromptEditText = requireViewById(R.id.systemPromptText); + mUserPromptEditText = requireViewById(R.id.userPromptText); + loadSettings(); + + // TODO: The two setOnClickListeners will be removed after file path issue is resolved + mModelImageButton.setOnClickListener( + view -> { + setupModelSelectorDialog(); + }); + mTokenizerImageButton.setOnClickListener( + view -> { + setupTokenizerSelectorDialog(); + }); + mModelFilePath = mSettingsFields.getModelFilePath(); + if (!mModelFilePath.isEmpty()) { + mModelTextView.setText(getFilenameFromPath(mModelFilePath)); + } + mTokenizerFilePath = mSettingsFields.getTokenizerFilePath(); + if (!mTokenizerFilePath.isEmpty()) { + mTokenizerTextView.setText(getFilenameFromPath(mTokenizerFilePath)); + } + + setupParameterSettings(); + setupPromptSettings(); + setupClearChatHistoryButton(); + setupLoadModelButton(); + } + + private void setupLoadModelButton() { + mLoadModelButton = requireViewById(R.id.loadModelButton); + mLoadModelButton.setEnabled(true); + mLoadModelButton.setOnClickListener( + view -> { + new AlertDialog.Builder(this) + .setTitle("Load Model") + .setMessage("Do you really want to load the new model?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + mSettingsFields.saveLoadModelAction(true); + 
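// Disable the button after a load is requested; it is re-enabled once a new
+ // model or tokenizer path is chosen in the selector dialogs below.
+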
mLoadModelButton.setEnabled(false); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + private void setupClearChatHistoryButton() { + Button clearChatButton = requireViewById(R.id.clearChatButton); + clearChatButton.setOnClickListener( + view -> { + new AlertDialog.Builder(this) + .setTitle("Delete Chat History") + .setMessage("Do you really want to delete chat history?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + mSettingsFields.saveIsClearChatHistory(true); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + private void setupParameterSettings() { + setupTemperatureSettings(); + } + + private void setupTemperatureSettings() { + mSetTemperature = mSettingsFields.getTemperature(); + EditText temperatureEditText = requireViewById(R.id.temperatureEditText); + temperatureEditText.setText(String.valueOf(mSetTemperature)); + temperatureEditText.addTextChangedListener( + new TextWatcher() { + @Override + public void beforeTextChanged(CharSequence s, int start, int count, int after) {} + + @Override + public void onTextChanged(CharSequence s, int start, int before, int count) {} + + @Override + public void afterTextChanged(Editable s) { + mSetTemperature = Double.parseDouble(s.toString()); + // This is needed because temperature is changed together with model loading + // Once temperature is no longer in LlamaModule constructor, we can remove this + mSettingsFields.saveLoadModelAction(true); + saveSettings(); + } + }); + } + + private void setupPromptSettings() { + setupSystemPromptSettings(); + setupUserPromptSettings(); + } + + private void setupSystemPromptSettings() { + mSystemPrompt = mSettingsFields.getSystemPrompt(); + mSystemPromptEditText.setText(mSystemPrompt); + mSystemPromptEditText.addTextChangedListener( + new TextWatcher() { + @Override + public void beforeTextChanged(CharSequence s, int start, int count, int after) {} + + @Override + public void onTextChanged(CharSequence s, int start, int before, int count) {} + + @Override + public void afterTextChanged(Editable s) { + mSystemPrompt = s.toString(); + } + }); + + ImageButton resetSystemPrompt = requireViewById(R.id.resetSystemPrompt); + resetSystemPrompt.setOnClickListener( + view -> { + new AlertDialog.Builder(this) + .setTitle("Reset System Prompt") + .setMessage("Do you really want to reset system prompt?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + // Clear the messageAdapter and sharedPreference + mSystemPromptEditText.setText(mSettingsFields.getSystemPromptTemplate()); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + private void setupUserPromptSettings() { + mUserPrompt = mSettingsFields.getUserPrompt(); + mUserPromptEditText.setText(mUserPrompt); + mUserPromptEditText.addTextChangedListener( + new TextWatcher() { + @Override + public void beforeTextChanged(CharSequence s, int start, int count, int after) {} + + @Override + public void onTextChanged(CharSequence s, int start, int before, int count) {} + + @Override + public void afterTextChanged(Editable s) { + mUserPrompt = s.toString(); + } + }); + + ImageButton resetUserPrompt = requireViewById(R.id.resetUserPrompt); + resetUserPrompt.setOnClickListener( + view -> { 
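+ // Resetting is destructive to any in-progress edits, so ask the user to confirm first.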
+ new AlertDialog.Builder(this) + .setTitle("Reset Prompt Template") + .setMessage("Do you really want to reset the prompt template?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + // Clear the messageAdapter and sharedPreference + mUserPromptEditText.setText(mSettingsFields.getUserPromptTemplate()); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + private void setupModelSelectorDialog() { + String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte"); + AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); + modelPathBuilder.setTitle("Select model path"); + + modelPathBuilder.setSingleChoiceItems( + pteFiles, + -1, + (dialog, item) -> { + mModelFilePath = pteFiles[item]; + mModelTextView.setText(getFilenameFromPath(mModelFilePath)); + mLoadModelButton.setEnabled(true); + dialog.dismiss(); + }); + + modelPathBuilder.create().show(); + } + + private static String[] listLocalFile(String path, String suffix) { + File directory = new File(path); + if (directory.exists() && directory.isDirectory()) { + File[] files = directory.listFiles((dir, name) -> name.toLowerCase().endsWith(suffix)); + String[] result = new String[files.length]; + for (int i = 0; i < files.length; i++) { + if (files[i].isFile() && files[i].getName().endsWith(suffix)) { + result[i] = files[i].getAbsolutePath(); + } + } + return result; + } + return null; + } + + private void setupTokenizerSelectorDialog() { + String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin"); + String[] tokenizerFiles = new String[binFiles.length]; + System.arraycopy(binFiles, 0, tokenizerFiles, 0, binFiles.length); + AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); + tokenizerPathBuilder.setTitle("Select tokenizer path"); + tokenizerPathBuilder.setSingleChoiceItems( + tokenizerFiles, + -1, + (dialog, item) -> { + mTokenizerFilePath = tokenizerFiles[item]; + mTokenizerTextView.setText(getFilenameFromPath(mTokenizerFilePath)); + mLoadModelButton.setEnabled(true); + dialog.dismiss(); + }); + + tokenizerPathBuilder.create().show(); + } + + private String getFilenameFromPath(String uriFilePath) { + String[] segments = uriFilePath.split("/"); + if (segments.length > 0) { + return segments[segments.length - 1]; // get last element (aka filename) + } + return ""; + } + + private void loadSettings() { + Gson gson = new Gson(); + String settingsFieldsJSON = mDemoSharedPreferences.getSettings(); + if (!settingsFieldsJSON.isEmpty()) { + mSettingsFields = gson.fromJson(settingsFieldsJSON, SettingsFields.class); + } + } + + private void saveSettings() { + mSettingsFields.saveModelPath(mModelFilePath); + mSettingsFields.saveTokenizerPath(mTokenizerFilePath); + mSettingsFields.saveParameters(mSetTemperature); + mSettingsFields.savePrompts(mSystemPrompt, mUserPrompt); + mDemoSharedPreferences.addSettings(mSettingsFields); + } + + @Override + public void onBackPressed() { + super.onBackPressed(); + saveSettings(); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java new file mode 100644 index 0000000000..d42a241293 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java @@ 
-0,0 +1,135 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +public class SettingsFields { + private static final String SYSTEM_PLACEHOLDER = "{{ system_prompt }}"; + private static final String USER_PLACEHOLDER = "{{ user_prompt }}"; + private static String SYSTEM_PROMPT_TEMPLATE = + "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n" + + SYSTEM_PLACEHOLDER + + "<|eot_id|>"; + private static String USER_PROMPT_TEMPLATE = + "<|start_header_id|>user<|end_header_id|>\n" + + USER_PLACEHOLDER + + "<|eot_id|>\n" + + "<|start_header_id|>assistant<|end_header_id|>"; + + public String getModelFilePath() { + return modelFilePath; + } + + public String getTokenizerFilePath() { + return tokenizerFilePath; + } + + public double getTemperature() { + return temperature; + } + + public String getSystemPrompt() { + return systemPrompt; + } + + public String getUserPrompt() { + return userPrompt; + } + + public String getEntirePrompt() { + return systemPrompt + userPrompt; + } + + public String getSystemPromptTemplate() { + return SYSTEM_PROMPT_TEMPLATE; + } + + public String getUserPromptTemplate() { + return USER_PROMPT_TEMPLATE; + } + + public boolean getIsClearChatHistory() { + return isClearChatHistory; + } + + public boolean getIsLoadModel() { + return isLoadModel; + } + + private String modelFilePath; + private String tokenizerFilePath; + private double temperature; + private String systemPrompt; + private String userPrompt; + private boolean isClearChatHistory; + private boolean isLoadModel; + + public SettingsFields() { + modelFilePath = ""; + tokenizerFilePath = ""; + temperature = SettingsActivity.TEMPERATURE_MIN_VALUE; + systemPrompt = SYSTEM_PROMPT_TEMPLATE; + userPrompt = USER_PROMPT_TEMPLATE; + isClearChatHistory = false; + isLoadModel = false; + } + + public SettingsFields(SettingsFields settingsFields) { + this.modelFilePath = settingsFields.modelFilePath; + this.tokenizerFilePath = settingsFields.tokenizerFilePath; + this.temperature = settingsFields.temperature; + this.systemPrompt = settingsFields.getSystemPrompt(); + this.userPrompt = settingsFields.getUserPrompt(); + this.isClearChatHistory = settingsFields.getIsClearChatHistory(); + this.isLoadModel = settingsFields.getIsLoadModel(); + } + + public void saveModelPath(String modelFilePath) { + this.modelFilePath = modelFilePath; + } + + public void saveTokenizerPath(String tokenizerFilePath) { + this.tokenizerFilePath = tokenizerFilePath; + } + + public void saveParameters(Double temperature) { + this.temperature = temperature; + } + + public void savePrompts(String systemPrompt, String userPrompt) { + this.systemPrompt = systemPrompt; + this.userPrompt = userPrompt; + } + + public void saveIsClearChatHistory(boolean needToClear) { + this.isClearChatHistory = needToClear; + } + + public void saveLoadModelAction(boolean shouldLoadModel) { + this.isLoadModel = shouldLoadModel; + } + + public boolean equals(SettingsFields anotherSettingsFields) { + if (this == anotherSettingsFields) return true; + return modelFilePath.equals(anotherSettingsFields.modelFilePath) + && tokenizerFilePath.equals(anotherSettingsFields.tokenizerFilePath) + && temperature == anotherSettingsFields.temperature + && systemPrompt.equals(anotherSettingsFields.systemPrompt) + && userPrompt.equals(anotherSettingsFields.userPrompt) + 
&& isClearChatHistory == anotherSettingsFields.isClearChatHistory + && isLoadModel == anotherSettingsFields.isLoadModel; + } + + public boolean isSystemPromptChanged() { + return !systemPrompt.contains(SYSTEM_PLACEHOLDER); + } + + public boolean isUserPromptChanged() { + return !userPrompt.contains(USER_PLACEHOLDER); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml new file mode 100644 index 0000000000..70f251ee64 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml @@ -0,0 +1,7 @@ + + + + + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml new file mode 100644 index 0000000000..9f83b8fbe7 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml new file mode 100644 index 0000000000..d710d27110 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml new file mode 100644 index 0000000000..30d5d26b98 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml new file mode 100644 index 0000000000..f8ca0c64b9 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml new file mode 100644 index 0000000000..2c71fc6e56 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml new file mode 100644 index 0000000000..9285db079a --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml @@ -0,0 +1,6 @@ + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml new file mode 100644 index 0000000000..3abc6cb33b --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml @@ -0,0 +1,5 @@ + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml new file mode 
100644
index 0000000000..42593b298e
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml
@@ -0,0 +1,10 @@
+ + +
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml
new file mode 100644
index 0000000000..817d57b76a
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml
@@ -0,0 +1,5 @@
+ + + + +
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml
new file mode 100644
index 0000000000..ceb3ac56c9
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml
@@ -0,0 +1,8 @@
+ + + + + + +
\ No newline at end of file
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml
new file mode 100644
index 0000000000..87c82d2a38
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml
@@ -0,0 +1,7 @@
+ + + + + +
\ No newline at end of file
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml
new file mode 100644
index 0000000000..15c404c60d
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml
@@ -0,0 +1,10 @@
+ + + + + + +
\ No newline at end of file
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..60e3e5174e9bdec2caf09cd42a9232e1dff65530
GIT binary patch
literal 33036
[base85-encoded binary literal of logo.png (33036 bytes) omitted: not human-readable]
zm3Vc76Rhy546&Xi0Ude!^egg9AC_+neNJ#j*%^(EJ1A0ra#OmW)ZK#Y0lUwW$FUV;~$nLXVHkd3Fv@ zDSTLDeGY5G1F+l2$tj`)yH;s*lP*8Fe+KgQ{ZpDsYV2?zo~eR5SE3Jlh=z9QOlimB zNqG?MDKGKHnJ2Ga$(hg92?4q>+1~ek9w2`e6w#EzQ0Gcbq>{A^-T~swC3A9-K2kcG z9DUSeyFfj^BYvJWN7MJ|V*ndB*rlDCG^9L)sEk$cV+47TV<$y;Xbqw-$DcUE2)$|E z-~I_mi~r0>(RCNvLA?nka}Ez)g)_HlK8Q9oLi!jE%QZwLF|UfYKP@I@cL4qApqHw6 z%F6B>`NZV;GK>{bsrAzRP|i3%1PDEXLn%ttNRy-Zz_vXwxiK(0i~(KWw--cIAa&18 z@c0%{y4LMmDKp5*8!jwtTnUa-)Tcg50=S4{x@vI;LSY^*xyp0&k*zf(@RI4Y?3;H>-vcs6-} z%VChGTVOXvCk1ArJhH}~;RTN^BhYktN+7K%16*Fx)O-!ue{2<5=|n~8yyNr+1a;{S zIW9W&`GX|LntkcF^zo9Pn(RP2$~A?T{#BROv*_Edr(jpXW2Ftk*u{uVF0w*m+Hn`i zU}t>tr2H5uB%tT@Iy2y4qyMMz_}j+R!1=pJ>|SBO=pCk*``VbkBjs)C|h zIoZC{EluX3C^Hi#tLD>Ix^j*%Ak@FB+uRdGhH}hS9rD!p9g!xEJS-V{@M5Xf5i`YR_cbX~ozfZqN zu`5U8X@GRuJgWCdTCFuo%g%!HW4jwBKWi}dlAiO*xIH_F)a#UoB^k0KzTLrRO4l48 z1m&rTfB`9wvU+y)%2COS&r|5JeuE=NFDd|mGslSar8v-8X(1?=@FY=K)W{{KRH$H04lpr z#Eo)?ksh@cn5d9Ap8}FH745V)H+Eu1Xrsn92|}@x7LWsgDn<_zWra+Q;0`RM&P2R) zOON$w`7O(RcOK&Y?$x6bK(UNlxyji~sur4F9pIswbyv~8U={V2KFPx*L}&AtVwWNp znk01@L~+3%vEXvtwBtE&QC)2HcMFDML3Kco?=3#w4@N^vl#e*lXVCZhm0fO5BD}(o zCnfEnZEBs~!9e2w1eV>07kVF|O`gJ+#SxZO0cgv7HhEWHpx17`$#pj>WohqxQtNWHmUH;SiNpY}LGluj=jNqBzucT4#lFC2! z)cbD+N#8hoKBYeDw#~B(&7hq?!b=Sj4RJwtcijp3{Vo|3f+CG@dTZrtV+Q-}srTzUf9$Z*-{<|Eovj;?QN`hKwoi@~(;whRTl9Y!?tz$45+^sZ`mT zEY}=x_ygGA=F_|L{G#?l6T)J6O%iv$65!g(r7u3!Al&Oqd!$=CaP~%bJItBzAc(;8 zgggArGmxPWq?QD&KJYYlLybLYWpp_>FUK4v)g(u4j%;$;nmV2P&tq9YMldv34R0_-kyl8MIOnWSA-}%}{(H4O8`@n~ zVG)vzS$09f^rODGA3EN9IYoO;m)5M(mE}7@2@Y(m+>IDMFzm6L*ct^yBF8VbTvNbx z;o)UAJ6;KSX`mp7Nsk6*58YFtlp|I*%H50d;!38$znKt21PYgBdvLfsYc|@)E18SR0>eMx4~m(uSI=TQrt=u4tSKL)~+Wzrzmtf`l1C zFV0IvL?8S1_6Q@Rre4TpUf7IT9|dmlfa}U1UbP5yU57mvSz>TbLd;si@LHY4`=h!6 z=I^*Z#7cbxlOsS^?AjZ9wI%gR=)?fWHo<>mk`JDeF<=GUQ|B*L90qSG`2{rh;dx2P zHl8SlEw4Zlzs+6}?sRzX?4B$i{DkU_1P5%w?7^iO;!nKA9%0@bo8`4c!u1$(vS6RE z`!JrcA#YEAjRE^BT*=B*RV)~Y(*vE3Z*nVjA8H>R=cla|8enZ7^e{=)!nLr3@j=qvG9Q<-WlWatsRApTj-Sp+g0f-_M> zz=PJJ-e&sI=Xs&35X}fwtW?QAGh4+JMW;KQ^e7lcskDq97yJmc3ofpVBmGp^1F6fyc_Ha zVpXEqyyJ>iqlL-p=l%ny#9>A^5L|_2*#sQG;4fN%EC1#L{XF-oHvm2csJ9hi7l-0b zKIweUo7epn6#h*3p|SeZ5#(I*S$McZOiMEPVJ%G_@b;K0Ad5GjLMpJ1r&gs|xvLorNZ^1sf_831=sI{c zaeRKjB|TC~y|rV|$Gu7l&^|vp8ieD&*0FfyM!JU%>G&S#8*s7XA`R$jtQ`4+CLnma zATke$6VWs>mofDTb{DdL=S$|DBnxQb_J_w1jE zbs<)jVZkGD*^p7twH!y+J54vW7W!n6e+S88g_+GB5_Y) zndXKP8Z}=f;cWD=Q6Uyj6kzf^OgSOC-?Kb*J^EwcRKk9C@o}c3fOHxP#gMyW%wDc3 z3=fqnixN1K1Y3WibK)?-z5z7f z|Ge#=|5_uUQ}sVlnX>=35X3MNVFQu`k~Via$On8F8UrNZ2Jm+`l!{uS21B$)v}=+m zjK!?<9niSlJ#e6)lEA~n1AM%p@YU;?|2p@m0wSk$!Nn%Fr2zwvecv;YrWcqhzyicB z&{)Z(tV4?DmcERT2=eD#e8T&5(7MI}olWkKZ$lK&z0f0{yS{!nn{Amv46 zlEEos!nw4AcmufVuvJFY1T$b7qn(=wej`!hUFJx%4!8pO26LwDSGT*3@TJK5pE-}$ z--1BL$M!D`VFS7gsQQTuAHf5z!xkej^gZRCi{7p|kE+goGc6}8cr2Y)W}@?vI~m&p z7L0!nCEv+qfc|nG-+;bH4_&WP5j?Z?lT>+Tozc$M8Asi}5|Rxi1qrE72pgS|An<}s z5z>9gsKd7r)xDKqgXubD@`@y1?JU&wfRo_$kJkH>!$GfX@ z6MV!=Z(Jcfp${zloeHPy2%i8Tf8g9_2zMGO{61j-|Wdv zb%Yt0Rd)y2ZToTCA1GYAF%cK<0H2RP!lN$P?;Y>BUM8cq+v`j9fp1z6{cCwI8@wxr zhY|u@sLL5?tiGT}g_^7cOwkUQ!?_M=Em)59uqIW%y|rs`iY0A-M$gU3Dt8>Bo_bp^ z5{|eHk<6nE@enm%)!IBsM)9$XM$yFEg@oe2L-N){Bp_%=jjTZ<%fU~&rZ{K>B^ZJ= z4VIOJsy z^8v|K%NMX`YDnn=U3{?ddByBI?VT&u9(aNdA&fynew*7UQJLhg#nU<)yN|z*Ctn8@ z7)rUiTjJkaUZ!PG(bImCsRw$+_KM5hKXT$Iu^A->fO(tbXzhCCSs$^Fu@(S_g=;L) z$0-=_EfTW?=p7K9^NWu>IW@K8%Yfe&u%kdp$YF?2-k9R444*@iP9n{;{!BS8ND^2q z2IiTibD@yU zrCmN@;wAYC9V*$-Jm+5w2pgvvm>S9FI`Uaz-k;y>b!_W0d0gIFRWl?qWzu5C=ZwlfDnJ)XVjPH52em zg6Zh8=34tcH$2U+kx!Zr$4l;(C-q=M$^fP0TU)bNN2;F8-iw5@zrfOvE}#yBiHaZ7 zW8_PR)5}nzUbYMI{e7xYw<)$u&SMXlXzWymV5pZdME{tQ7fp8Z#L7SuUaAL#N0{&- 
z?7jyt@%g7oBt8udS@tQr*iL9wWqF}HrrKE+kucaYnN&AK_w&++&y^2O0-jH&Nd7IF z;=f1xC9&lmD+(_HdA;E%OW+DadC>_*Y23)uz^Ad+0}q$E#6EvhFKXODyn96b*RR+P z!7C1){>%<96q)NM(410O_r*W7Tu00wKs-?4!GE!!JXg7C=Hz69A5Zh?@rRJjd_E0< z6Sw4ZhV!*9;rUvAc?D8Foh}mdcy70G zX;KNWc*UcD$FDoS5B*C+N#%p07s(E9vf|;*#7uuaOYBXE?~9#?zaIw*PGSWrZmpx5 zFtN^*`UOvz!-fRXp(}u4XW)R?I{9%E_z6nRVtt}2sPEg3Q7&a(ci3F6A&=q8D8~Q~ zEsWCcT)2B(x)0C*n4_b>d9)g4_bjd`7ZP6kO52av1?%lzbfpBL*+DwOsu9i$xV zMObD1432O(s``Y?Y&LZZt z3N@z$gS*D-W55GKF;*3EZDu#&t> zxsYvk@)Sa6KAOtR0C~v2#ubzNs`_rhakdQiO+O9;i$*SZqBSGr_1pZPXRgcdDew4d z;iU8{`4N>nqi$TJghI%WQT3M00B7}00S-{6ocZ2rJMO{-o&&Y}_ORbthOFcoQb2dm zc+=_zJioJ|S103=!VYLf{b*9cLBRyVwk$Z9>29ukMA+?oDCq9aTZbw;N8(xA^!qig zmskSt&<;Ye%DFKfivNCeziTmmg|prQ#C-fYfFF6|eUH*7sK`e70v_<;kUFp+3)F)h zUyur}E8BP2&6anJu@XBQBfaX#E42L3xkV`P8y}_1&p@v|RWx)-rGE${LJmdyg<3qc1xjuYqpTyw zX6g@3E-MDcEY|OT2cc%DcXE?gc6~_MfkA36mj!wb@K|^db$^nNbtE z4C1#2=9mrnxe*i;xL(RR>isDcO1fb~0Jq{Hjl@0gh8t6*J%8sWdO-|?%Au5F!@R3F9DAz{UAJ|TUuMYF)T=a8&!8GA z!_&UL4D|{^eb}mp_dd#SQT(pw_sYlbTp2}oT)O%z1Y=H zB?c~}#=r0w|03&$(*gn0u zw${~@fGEuR~)}NgX^zEcg!!( z>UH~LDKpCcHJK??XX2J_IDCPM^jaUFmNu3eF6u$FiC?j#*q~5@rI9ey@hRa!gLg!U zdhIrq2SKgFJ~g+|B=-H^e6kYwBCY6x-fy?)+9tRqKIRv)1RkD5n-Clbxx-5eLOl_3 zY}$YH$Y>DQpBtAZm+ZN)tpyub7kxJYW4AVr@KCzBboScoX%ZJ{oDS_t5aJz0uIv`Y z|NZV4iwZt?vC%h-)uO_w-Fn|8>637#62@!PSwqqbeotLgejr8*=)Y)XiKKo)fv^TR zGGlEm6G>hB!tkQ0Gek&IpA@~p%klnTp&@#ROA+j(@dAex0&U8K=l(|o9g_q~p`v=u z9$TF~YA)Biw{cpiFBua*V`Jo7bVqIF;aQ;UBQ!FhdnFxL{Yb!4(jWKSi%UKb;q&CB zO_=91OekiAU%PjlJ8H5D;lt7LFi=#CnWN`#?u*q&TytvELwB;lh@k|{6G-IZe)fOL zL3*M3g^@q~kNV(#@lh_Xy4x&`N3bpBh|(n>AcbTtu9VfxG(_rM*qkA5_+iw>S}k5)7J0)z4_0;0aU&exbr}SM1f!q!+(Sblyl) zU!C#W`NWHCm`|zHj7keu`J+m@v#Uf@xF?4t`7YPJ;Unf_SbYBU3OBX8Q328S=bg`5 zohO+?%Hg9BX}}bj&vE~?PeBRklQgjeqbOW4`Rzot#->FFxDeGe(i*J#!pRTOFG*We zYmL5x_E8`yzuXyyKW)P|4&tVsoaP~Y>$fi=05@jZF{amD`xV7{(PdCUfsCuqQHbyn zF=yqdzE~^Vs3k-Z#?AoxJHRBJ03x|a?z(q3yh@YgmPI&`)O1gvhvQk2wORln#KsOC z(kY=3(`eATz+ynnR2ui6R49ID2@g>t5~am|~%3y$##vjxs;gNg#7Hp2ecZeQF8K0KVg>`cMuc z8{KFC9tV<*30aHGbE=q6NzGqFotf;u)VSUWOv6!6ce5&BrdKA%Jl05$kvm@qTB^^$ z+R>lTHV?qu^=!$R|CblwYh_ixR;MK@iRwBUwm*->$ z25L-Do{xMhT5=rF$MiD}p8vGq28c)IK0k^!E}7z9*3up88$*?#E-BW;SRykY zntGD%Jb*bEKLs6hDvDY?LPSf@81TAmam=loFXD{+zi};)qd9?_UTMjW+t}-iw6}5g zw@w$bh~G)e^A5KVLnjxXqGuQPWyc%0<9L+4@84HR?cJODG7&vjY%q(DVbX$uGNDqtlr_UXM_2`n5TY&!^ zNOd3AX_07EBCI+J;ABCFA7h9Bytq&*E1uwG873L|;>YBa$$;gv8Xgf}Zz$mBjx;pa zU`^4@5oGt&hesH~E;7@+Rf&$fNK9hTUuoj%u|>J}=`k;4CXbFRC>wFhu?#29lop-@ zKH8OIlUY{n00X2XGQFS@B$YqV=h6I+ncb|~un9}c*kk^>F={w{$xh_1oxPSNt|$yo zD+fM`>=fMw)&lT9twZZ`k|A*x5&XIN>%YH#1rA`{?Oi~)Wo2y4iDY`Jf=$8YMZv#~ zM%*k%67TX*2Bz3xf9M8^KyutNc_P~4|=-TDhel`SU}!q+#j8q zo7kvaN%i{}uhoZ`yDuWaiCn#Eh6;NT!$Uj}@q`_opQT8N5dWaQ2_f;DZz3W_>r11! 
zD2u4)54HS4&o0f>CnOED#Hl%cIX``Exp!}5@gooQJOA`B0x?OJkVF(ZjWKv&gIjzl zK|&EUcV%19cJH|rQCHY^6%`bukld5na&$UZu(gZvgp)H< znWo&v>IQlw?1)Rlyx?YO(90(bu2_o)^*5S3A3snUo8380a=DGpFQG~z4Gk&Y$mIc> zk)+v$M-`4tVDos+dPX90JvcV(&WE>*pQvN5%<kA!fO)`T2RA`>esdnUM$VI z>OKK%TF77Dd6b9!@bqNa!`>MsP$RLL%s#!QS%lJX6DE%(g5aCSywMHm;cX~GQQtfB zjUITAHFPj17T&g_U+}2m8d|vlcpc698jLMF#Nc>v9aH~X+wQ<`xnaF>-P>HX`iECW z1p34jGSOn~g1wc>r(QP+nz$|IySod~@gQ0jz4Pf+Hbi-Z{^HlqZ&1tc#D$!`wWIS1 zA<4Y@>^l6ARr298UE72gd8G45c}Aas!QN*igD&vrCrOtHR};&B@!Ee2{qLoQ(G+0v z5dtw^@?%rhj$)4wLQSo<9FbVvLzH-q)O+KwaaMPalP=VdU!x+-BfhCUM|W)4(Y$s% zFzQ{Z2Mp7sE6m4Qo-LlE>I$muM^y5YmEnTamn%?_56)-#H`iCpiEr!{8yDw_mL1{y z0%YBa;@JBJXhTNYo#f&~Kg!P6M#(R;&-pR|Rly#X!u?lGmdHjgDU$Czha0K z&FJdb{nlcq#SjzYHh8B9REI9HYliw z4+z)vnMD%bQIo9=`%bo=9YSBn5T)RnSW6_17X+yz#d!6G&Ho_&2@X5Ij<);+=p|#z z3~6>3E1Qf7lpe%Y9XkzcOn5%8hcf9+3xNFKKz=?p3WA67$<7fE~zUSSA=Yy;ju=uQ!(faOF>>4pWe9S;`EFS zko9H6tv=s%ALGp8Y&0bu4XRW4^aXM;p@6_h=AZXTbE?sS*UNx0@#xZu%f(#Z7DXy>r&Swj31+Wqgh&ra}_l-Ml>!d49}vsOraTh;hrIr0sjwDK$;6S_E#@ z#WQd~yHCaM9Z62(1||5-I2C1ZJMN|G+4#UGocNnKd@{A0J|2MpiXTYz2gV0 zvXPQmkXn{Z=RE?+R9~5~+ERv657nme!s`kD1XZ1@UUGnu?K4Q=@Ofi-;fxJ4-drl6 z*%wT&irJGM6pMt+^MHQ@@_#6x{cy72Dy;nt(_v0pJ`xHMGeO47Q*5b?*yquk#41eE zE287R7@@7t6=s&j9%Vq%&T|*0Z@KcDW8m%Ar;KO;vu_oRpIk=W#}gvaQV1XO&Q;Xm zII@GVk#NdFTl1}^sG(Ty1+Z&leLX(Qj4Zwiy}83UDwxH%T7ic@I&5F)MHHO`ZGug^ z_paz{+Hl0wxyWqgL()$Qf{n%SsU{#R$=;Ax?I>$i=# zfLbm@;Ewy}+&%js;^+8xbZB4;8DIZe#BNCc+YWnT*`7TxFnP)?iU>Shd#?jW9z?}`}W zh5}LA!uYm#P}CvsU)LJ1MIWw59#C1j$j?ju+BP;<9txj5^8OQ5J`gD#yq|LQ&{e|Ne(Ms^5c%Io5+_a~CUik!k*>(f*flM#wMY&p@mTaEFDCK!VB#0qWjE;f}+QWGLlp+ODXv z==P}^h~EFR@hKi0x%+@#6?+4j0K(b};2LL!FemJs~)OC$prko}!o_S4Z|g z3TgVIZ|t&n2zEU7$sZ}C%Ad-YCftHdaLaC{ z@n{akueV7Ym~l}KInadHonHnFY&42FuRwo@hqBfi5*sMO>4V#V1CfZeLgzhmg?CFz zJ47@6sH&z&TLpQU!x9`c|guRC#mzycm;Sqjmk1#d0&-KOS zU2TjaD8hXpPNj@J&L02~Wd1FpAlqiIu75Yky%yC7fnmTdOpZ2y|3McS@HsY14u;?g z*P*yLZngL zx6V8Id(y-(bZBdfrdyi`Cv|L`1J@(?$WW+Gt@IY6q) zp-ZN|U4-o91|Pz@pQtP(Iq)gyYu@;9;w|`j^l*TCar%b}p<@TAqkP_ZyM_T-{{d

`5eqyv=sNbZS=eDEM*Xau+I2kIH{6hQSq)SkJ>X*^(X$Pk(Q zu^j_my&t>d3C;Gil6f&EZosqy_}{^N!yKAA!N>0QSy9C6LxAqXLpBE#8mQDF*uCMj z=LH_!2(CSGF5X~{&D$jqxxbLo_BusM9;^a7E*jVg>Hg^gMo6(O@MPx0>;&?g($IPb(TcEJ)$Y2O(PF+h>+kmDiDv8dt+9>5U~8g1}kLIm((MredXFZ&=^-~JB> z`~lpcQkvU`lZQCjoMDp!o%07RMu4PY59|t?mQDhFfkWWVkssM2iie+;MNPG{A9^?L z7~R-6Un3wvAjXc$5t8U3ZPd&#HGkiTaB*I|lJq9HSK}mqlVRzP>6#Nj zBfW2Re0+RnW+ruaXL&XB)V^zPgAXjdGr6!exxc@E?32N3*@Jfj-S0e1~!k^h`ee!9#p7F&uG8>4Que09uHsJ{rpZj}h07))^b)}j8Rv~VhW{ePVM z-C_(mB5dvYhoBw6g~?sg9$9o>FIa_E@1lhTwr}lvt4?Vw(^wZCe%FrvD zU7pCaj5@RSan$Y!nD>>gum`Vsq=tDIAr@aQr{b0;Sze$TR=IJ#%XHAyZ6q-LNz&>{Sn!FlQRpv8g_q59fYN{orTp z=jzc_Tx3KJp{DJ%5U%U=vLM+>NZr6RL@kd85gwc>(!cfBTew2YUpLx>{~#kc)nVVk zWy7Xl2DZ5abrhC2N(e~4reU+3=1LB}1qfxaeHuI4#|OXlWlIw$<3HrC>$m)r`&sKO1XzN(ewhkT5cK_r z&HM7bR1h-YyKz2l!?;ZZJM5d)zFqNoVpbk>Zl{kOG)c3y;Cp9eGMSB*3&7*fu2}9J z`SxEt^n>F&j;RiV{MaeA2I6XMYwYL!HX|LdE4b6Ak;lQ;t&%gb2%5Lw98rKmJt<_cn4%ts~-OiA#xp6*e&A>D()9KZYtm8E6;%HOeiLrscSELlf$DhtGHaD;5x_UkSrV@o4Jx zld4`#lsn(MGNtbP{DkO$Y2^OMfBxlAjE0jWqGL_;^SKiU3HRbbYu(n9cJ7w+v3;gMtP)y)%^IGDCbdP53{%iTY6dw@Fj zM&gZ2)Q5AZo&9wKe$nMdkuBrRuyDDRe8bGsKN6P#0%Y5HI6Ws2>`JJ6#=*%X0aV-| zR$q7`5W7C$q0DD4{<|xA*G(W+oLi0<{aM!S;Vv(jTwsu815ICsEcs*vELvbubF(27 zUF=@_+}FEX`EO=yY=?g6f7k_MoPelP7WrY7I1Gq@SU$zbRayPUVAy1=msH|842;$beV0fP>}N!yr1 zHwKv)K0HF$nO+T{J?!E4fB-e#g!09Y=$xz;x2kkl)l;8+LoVWELriZQ7qQf@F$;9+ zzfQS4a|kI%*T+etcr&Df)#1NZ8RIRz@w z^e>Nj9u}ljygOK``n8Mv-1;EyhR1}y_XEico198M(0LV+*5JO%B_-*xmr(X<-N6QF zI`edJUaRJ3LoOc9`vA3&|1`gO>GKYN+PfIZ7AbdzS%hg9L=pi6&z%r7L=VJ6rgwK} zE~wO%gpl@=4@x+14mqJ@&7Km?{|gd123Pn3>(hXOhJfM+pjYOc$b6G&oaZD6)zpsc z5+Qdvw2_(NG})8Y=Z$HfL(?e;+nC*|0teDBK63j>soG1uo1j&Zf(LfT3|ANVTH`~#k>>3$XA0y#>g)}_axglpdl%2+%pZSrXl-UHo^&X zwZNf>H(zxH0c_(uvk}zCS=0#B@WbQQJG*<=-?mGAfa^??K{tx}`N^bo#JcV7y^T|p zXTFRT97}MX_FhY;QX6B;f=>CG8|*OU^IR$JcL`0Z0(ifq)Fu7#qNhT{&G(D7MW_=r z)qhmMgq@Z9JAw4W?H9Aq#h&1^&L6iet0B#G_OR0!>oMIMe}rp{abz_GOP3-~x#Ps% z#)K7j+y9UYz1YBrpTi=|_!!ptnBUW`O+}}%_b!+JwkBvRHHi|y`T%9jd8pu`EV4y9 zQ#uAReCXveC0C@ z?k7DhU%QvScK+ibjRn6*Nq>#EZm#GDT+$6{^2eQP;Jht7VLSh$n~^}Rxn_>PXvAWr z49F9By-43c%FG}w9W~H)>YkH(`5Ziqj^ey*wo7VGwA4szTcI}mwj3u?8Dc;qWjxP7L z?@5eVIaZg^f0^I=-tp7?mcmk?UIn)56IdBGcyl6g?qVd_>(1&vFUnFhuc{!FI2rW8YRR$+R-ld$Z3X>{tXk$>1^r#W$_aQsxP%sKRTmz!NL zoU}Sa_UrZvBriEMNW<@%)K0x~*x9{O=oR+CB4qi}Q^b-ag{mE}ZqM58@Zz#G@~cZ{ zVd#Cen$-Gj_oM%arf{-Bwk#aqC#@huAzF&SRd? zs_ZXB2iF5kM(Kd#xaMh#FE7L2t<~)W{`~LZ8%y=AHa)Nz0XZ`g8TPeO7X(a$I7ZGD z&THP7cWWzYjvCfwoLpd6O|Zd$rHG4(=z2jrgCKszWVTECOV=GaAQWz&WTNEa@-U^W ziWx-Y7Z;tPZooGeF1r0fE1r&9UwS}96C%}qdHF7V*#9FBkffiAARrdE&uKjF|MlPw zM{z0}wEi&j3rF;dOv+!VO>aHN~1#WFcv^slOUwT$Tay2Nv_xTVCe5_NWXo1~=SV5`=(+4X) zpC`0{vq<`|C|DXvOQxei{Zl+a>(~15ZXkV>_4eyU_a3ZR?60wux~MPK($_fYzu?1!YA|9nQ;IkfC4+RqJ@3Z?!7WDfB8y%MS95Cv?$8GUqWiv6o6 zAXZ>KSe2)-?``ep$kP2%-dd6KLQdX_))zDW_jYDn+#(ojTr##V-m<_c^sUTbznWE z8K{IHJ!|0HL6?B)(Zc!C>8t_6l;tG zSNi6~BV$Hx2F*n$NBfM_?Kcz)<TKCKx&Z35+`;R@38=$? 
zH_r;9szGQNnU3X)j&MaXpf16eML_Fb&rf;Ku|kXxZaUJtmGE0jca~X#jg;tdjDN4# zF?-skeeA~_w-EYGwmu_w6=J;^L_Nx|savN8-jthJ;zE>_d?}1rYpA1lAhX!CdLROO zFh;$_9=AO3Fz$|}8yY^OQXHZ&q-PPjs1FeNIg6PAuzK}j!{{M>nMy&lcF21xoM4lz5UbjA1W2PPM=9nEjbJN4)5ML^q_7-nQk zsXaIy-((mP7|WIM4Y) z-8<_ZJmSK7-Gv>+&hgwXlH(d!_E{el2h<~T9H7Y%2~0P>4ezk3R3MU39Oe~*!xHKx z4CF_a^QkN%C!@?6^)vo9n=?XIQCzG`X+a+y#}XH|epbXQh9z7YEib$-r(T)b{L_dtGp;rt-4H9s2M zA4yF2bk2P5Ot^N{2M>8K_nGm&_L{(-gPuF<}JGe!cS23monbfRzv+X%fSDzw~fmGbO>h zv%d}^z6##!#sd8AIH41J(x=5_kLwIDDMG;V$%uhR5nNNfaUWT^rWv#(B&FxvKDZ&K zD;Mu?EgC71=8N6^+5H}SRhh;j|0CxdXA0UM-sxhKUgDUCH=js|ad{rO`O+1bd`o0z zR}4B`hpzYV+`aDvwcG6x#G}8;9^*|(VIW(YAC`>r>cGAC^(I1Hlc!&7+}w|M>sw zp3PwF`%)$?WXqBeaZ9$chY*=$r^uf5wqz+qmLl6&LMnONOEJ=hvPLM3Eo8~QGr#$q z-|u(6|9sE6&YUxI&VBAP*L|PsdcCgK^Z9uF$B^s|+ythh)0mLE9e9pgZ(`##u^?uF z`1D`ER*Tm*XHC()J$HKuzwl?csOn3j7c1xrO+Rkh8 z{|mdHqC*bux$8&0dN}&+MZ(geZcn-Fr)z)%npY3v1z{J?AkLjEWqro~kz9!ES(i*D z{QcfZ}t8~U+B&{%3Cv| zQc3UTu*&5z(0v#FbUWV4)R1lQ0AHOPe-oJtvnx>hpsyH_$#|^s(VwLgt$I?0hl*@* zU&CdZsN)*^7U08U=CB<9_*WGZN6)=DW`LsXG@klq>}(Cc9|5IdAKeTCK8a>Y0*kc% zz+VwQ<1EBmRv(DJ#e_e9Oj!E2wS8ncYmfORJg1#8CME3}(~s}FgOI9S%0nY0op?V7Xc z)P70xR1RePh04i}F|Coiti;6~RNrSJQ34~6Os%OxoS8VCi@<>QqgopkJfvFdR6b99@zZbwhJoi^ySnv|M%a`OXCGK8Q2E?Nu_M29oYTS1AFt1*O7%$ z_h#R|XD%q6uiH`!?fYM;X4=daONAlRk1Aiw-VCyF1DL|VTIM0q?NHZt#g0NwZa-9y zx_!&l{?UB_UuJKKqx|IxXiAQ!#L`t)t9L9{7_TOn&3XR86!Ai11s!S{j5SKv+4Zd> z-(G}%p21L(GM`2rj*e@2OBeb1#Q7Q5&OOk&Lk>J4{BZbenk|*LsMv!6NxVjq?TGC} zCv&wWB29*tQQ>w7%V)_9x4DaCY2m}W|8NCaiBYdZcFSrL z2$Z{)kBM*{&KECii)&cn#((t@>Fl)LYHd2f)Od&TD9cM~`G}U&AUuVJgZVF+kq=ck z;Wi&Cs+{euz2D(hEY#tpal6(nr=y@EEwnD;=NVq0G3tmy)=$sW{mNN1Egimg%@@ji zyQpB(S9Nv6hvACCzh?#ACsY0l-+yNfkg=3Y#SEi)NpY!?x=&!U^{e0Pmt=4Jk`H@R z=+e!)0()?1fdA%F6t)zjL4Z2S(h^U`Q)Ify;GF4Vw3u##i1^ zdbHpV&H7BQ6bqC7bZ{o$e&BSvOpkkMgylwZC|5__kn>%n!up`4ZHvnNCBSK+ephAg z*WG_=3Bxgo3Y!wV44~TaAduGhF>LjY+QxL^sNKH&D$cT%mq!G(Dmhyu1Edb*m+1=q zdy+Rpx{nw+bry#_T!|?OnnN8lHnkoh2Uw{=Qr0H=Qo4)AL#y=ljo{&%^oH`~|lA(d!fA7R1(l zEz;b32=rMiydbKwPip@6>A896`=;vvxwH}G!!fu%5yj6mkmnH#sh#>tf2&d^C;Vj& zZN_1t>Um+Z6reooL(kI+egm72zO`Qb|Qof z_m~c{#DH@9pJy^56TlPqh6~jC{PtzIpN}+kzonn<(8I-1cPD0)#Gi`QU;c*X@WCDT zor|(Ik~loe(m$*?@?~#vTq+8Me8a$Vhfv8gSutL#43wY7D&;Rw?|`>mFh+d4sO#>6 z<@!-sh~wg|>bMHS+=1ysYLf?1S9gUNz;-F^)>IDubGi3ob?ij9cQQNQ&MU?{pIe;M z3f2GERbe9d0e2|`7jY2|C(2y=>MLiN9!K)!|}7haZc=LOAh61|J5>%~QuFMz9?zV59IOaY&UlMEL{H0o*S(eTe#QpM32ekew%Dm14G@ls-J5l z3R}1@0jx~Ng+ef25W9VY(Muj(?_1Pcd$z_j_yfC6_HR0Z;mPNxD4^XqrUrq=@bmdi zFli#{SB6l9*!bqJfhq1PQ7te8DX_8w8?`q9+ZhF@RWGJ z;JD9n>&tF;%Iirrhl)eWYUSbC-Z@-rqw7?DQVZbe1DiK$%g%EzdAPnsv9EZbysroj z(qcK@o?lvGn#6(nWk7AQ>0|)SOt~V%$9j4o5`F7Fg8FRe1ko|G->F(D!ieN+akQTpM zhI(lXWe?j-Xi|FtkTXJnNpkCTwA+jgzqM?Ah{KBU-mx6@d`sU%Nc7`)B*|-Me5N#g zx8%PicHtJFPgv5Z=@38jInwIZs+ITVnr1~R`a4vSY{T4-HpSHE^jmPEKz>G*Gu_*- z$sFG~u_q$`d3aVK15#R2y?!IZ?of4@+^t$3`!Fv}@Q!YXN>T&(9E4G9GAPKExSIjQ^lVA|n-hGkja zcV}B>(~zJyyC;r- zm(q^>@FCQJ^Zf^J8|c0-IblN?ncSI!#wR8BSC=cg|J6Lo$~l>c3wqZz$oldq250L5 zrToeXuanC*82qEPV*BW)(COVqX=^q>F;$@8P@vDzPW3c~7&A0!e-WK?tFX^~sbut) z?^&b6AC>Ma2MnD1jE#LDdB8Q9C8W}UztB3zGTZBZIz-$oi9M#CA5j0u!{Qj_5&ZtS z(XOPSP?ctzictp5oZG6lMFAc(KBk!9Nv{~aNlk3{>s)S>iN12Ffv!m+B-{bIL_`Fc z*c}qVFa2n4;jTR1R&-ONv91&6kzhf$i?>5T+7PE%jysi)*bADWVmbcrSd|{325r`$ z0;PwuI_`IcUkPz!T*M6d0?N~0N$81|2Y!6xsOjV=g|x@tF1)5c@-Z~Eeqwhs^iiR9 zh7Y7LaFE@ducc3~$^#6JIpI9A;EvplV?5km>wiOXth(Fp%bTOtdTiHJC$9Wgm;7JU zlo9y}&iQaQ#XxujZ;pigP_~3Yc9EI~??rYZ3shlyv_b+P6^jbU!q44^cAriQQ7Avld4JEZ#`uTgUS!K;O;Gi@#pe-MDuVW3wG`3Qtt)|`8r+4=!_NvJ+I z>V)DudiGc!kj37ArHH1y{fe)&V?ju*X(B;vqUq=qegs=k_2t?dxeVAiM8w`DWEg`W*vwWzQcsvM95j-Xjtd!B71} 
z?h<;w>e}MgH69j()1^!qN2z2-eD*@FuP|zC$I|`Ct&U);@Mz zLxW^}+AG4kPG%980j)=M!`;TRw>)(j*+2243{&a*|t&t)T*vp~_ zn-J8#HW57k~SWBbBf*E6gOZ3h$qqU@8r{TJ(f}c z@<)(zEk7L8PY&u<^Aeqq^LmwLgo@80==?>RnH+p%;<7R@`8sVg8N*)x0qAj{3JBDf z)`F$^5Oqfm+2EH}Wwr{(C%YmkLMVx9MS`R|v~5gy=}8w=vI3Lm5R<%a_{l@y%zGvF zegj35V(8v0oJ=ZYB1*TfVUdsl)s%h|lRJU-3?+z+3g5f<0A-oEdYBdBmwZjncNIXX zfg*dl9*Rc-mmq*Rw}}Vz`5h)y5CH5X=(@B?wwi=+lH?L2$Fp-YVY0iO#vf|qjHm2Z8jEUSUAi)CF#yQud)F4r64RFe|T;p<{XP zeCNGGhg^0uEnl2wW~U0>6a=IwoE}YBJ*3K`ohh_Hif-k_lf9SBp0C}>%EgH{vzBtA z0o`CPsiRdtX8{H4&3L=w!JeYaKx$TC0d=o3w3{1)(v-+jQ679|XTM=D2%B!xK!34h- zPH(R0ltI-G8+ANb!E7U)7r_d4%2hEwyJzk_p*o!ojaTJiWOI_(L{QVf$^ zpK}o{o|khur=2*OWAl`5YpJu5)u%IeUgI7&@Aq*J;H*L_^X|of&%jKCXs1faw``yZwf{Hi!WBp6{H3!eea_;=i%64eT=InE z4pZ{ea~uA(Bh2qaU*5TWY%ku|TkNCm$9f~;kROg}S+N(2n4>@1fy%~(JM0;s9EqV~ zjjC2e)W*2O*r_x+eoM^zy;7V^8<4Fz%=sHH@Fa3+Mkgq+`cNmotC^eb9nt-p>22<$ zoqs;x0VM?0JkD|?CI8`IUC4MtA=YGYn=>bTMgR3mJF5QU-K-9@%e*U(l{ zcz5MaNMs@74HFe^lTZCG)Ux4Uy+Cj2dp^Csq7-GmiJeJe_!|#Mcc0tofZL_<>Rg`h z+vJ#K-RPR{|Ef=(?hF34+iQP4yf~1iBHORpD>S%wRcyNX0y%sVqxMM!z4!P)HO&b+ zu8aKLM7v5y97Dk~%}L?DRxWGA2$`cPrrcjqN)wo17m{j5-5+-U>?%AYCepocar1FwN!J3#MOItT-r% z8C@t@U3Qg!Z}z=OlE3521DzYE!)|}H|GQ)Hx3A0rm`g5G2Fe^E@j7?%fKUG1%>a7# z@5A7%zv}gU{>lCO_n%~CPDARixuwiFb|siZ!m7YL7xtu4SGsp!jW@D0!CApa-19r$p??b=>@rW1C3Q z<~7-(^V^s%u36C)UV@dQgckD#Z$Iv2nB{TqCiFVyeJLj9m~Ut61i8mx(|5dJyL@{l%ekBhF@oKspbn;Ph;dEz{UWxoG)WQjWZbd%wY10`fdXkY`=1}f(fRq&)M7} zPIw!dUf{>l_{Get?#7Dk3uB1b4ia)B4FseY!8X<(4wfyT%#kNEi;QsP_DKS zC4IVtwaXw)2c4FMGrtRw=@_8Wse_L=Iq&b$d(Fqn+dBx%o&7(Jt&17mT|p{Vd;=#V ztG4vDn}}=@BwD$>R|c+9Rnt6xuMyFFfyFkSxY>9N7s+*Eek?2N0_Jyk=@biSqcdRf z=I5BL;tyRfV9kD$xj*tGlh>?rC!2|dWP~?$hVC;Kb(nr=Q@=eCENZ?qF&Dv0sXLD* zUS=jeuSkUIjbqJDsVL!vC7qD{E%$Q1TQiE$X54CFF!T3R19V!Lm*donNLS5=YW0}l z+hPCs#g(ZCmxs1>$Lw1L=(_XUw(Iou|0TM!xhCXn>S2LZMd~~SeWyqHm-?KvvM!2+ zS}#bEg{9qGOmK^vy5Vb5FgUjbdB(}0VjDU+OOv;aj{wrv?!^^|tdHLQqwxl3)g|#- zX} z9|JE8WUEFBjp+9{`ic)(36$%j#Nc58N}CoM&*6>5zaNMcCpd51QE-;yD*g6B?ZUzD zh5DZzN7~cn5qy)E>0 zW4a+RrV_Z}twNCvohzNJ8Q;Mj=l)gSF0^{O-77&NBiL?o@EArl$) z7d5KG{f=~G?THB;`1PP6n_b_?EQv~DHGt?+F$K!L-wCPIIOX_J>Q;k6aFWx zqe-WP{pNSLK$1R!!JpteeC?d-<1k&_#&2De(H5V@PJ5SbFndGjH~sXk+mLtB*W16v zCeV3v!t6yE!xvI)IEMlYS$>!q!j><;Z05c*+|i^dVrJ@YTutyCUWT?Pj?^a-z)0+>xN9dFv}=D z$=#Sf0yf+PNYCWW1Ej~k*L1y=4aA+)*BbO!yFWbrM-q%E}p8X7JTTve( zIo;#H&bGbz6*XzUX8TI-SP_d&_M0@zUINsy*I!}y6X@NBV}iV?LOfuR`4zL4ml(3O z{j*U95YQKR{htzjv-%>+& z{VqFt9dNgo4y@{gjc$gSDn_=n$4LI!)MolAi|DY^120D-gwJ*PvYb5WpHyout@UtO5>^l7-Yabr<`yvdIj$N#=@|OdTOo9W;R{(%Em-gEje{)nKiPsT zWbQ~aTr$TowTomBnw2UtnX_spa!!oI`cbI&kr>K&N|3SzWauOXs3uUKTejiQq)PSr zaGW&?vRZMDQAR8ODIEZB`biXqiqs3lupwE^3d*ip}p#<(-EF-lNS80d~;bWZdoI){^T<8mbVGKaAM3zg$| zSxqWG*lsM_oK``P{*;7^H%r_Z+I{ORkril`EJyVdMD9Pv=R(wul9o z2p36kvHwmtGn{5L$ivDqDI2nW@AuqQ>&-!h?vMi4WP7)+zWm~BPdNP;_q7sOZQK>N zP!ZJ`Vx!2%Je|B?aS?J4$t_nk=+UG#ZuxlvwugrgTk=tqN$%>c@Y3dHkp_2xW5YI!_r(weuW~!NY?S7gjn1*MSLFu;ZSeOkOybwFPV_og%dFc!!*Xo##m!g21~Z(hAZDy}H?7E*J%`R0#hNLmjJmR`p$k6@_e5iF>|~M*%ep_nET60dVlKHFzed~&SL2Q*#f<~ zRDDFiN#8Y>?<%6~^$|})T^U?;m+z`J^wQI<#V56$@u->2Ugby`kh~1Nv(>+IJtj0` z#7?DF2IahU5UMsKHXTDhTIu^v@X(sO-E~}_xDJtH%(&IT%B@UYA!aa@qse%wDBO?H zPps|fcX4KRy!dB9y2Go`MK3pLzbMrYA()k;zPFpRfY*ZcM6tCzM&@3U-e zo*#yZOMY?!N>BF#;+cru`-QW8_fr?=IIsgVN`iE>jK2h~;+y6pO(D=`%xoMF6+Tbp zf3-l(wLSk&VfSEa+(BM|EiSs5teI=+p`2|XlbbUr)%UlADY+OK4lf*i@vIY-tU?qW zs}C~0cX+3_p*M}2@)6vTZEOg+^XBbaa%V5|@$eO-+Wx*cEWddH(xX>z1t>89i!Oad z;zbwtGI&}Gz%T}4osG7ReO!ySN%$=wwsK-;m2)z?ycVU8#v1Qm=4~l?+ zPrg?DRvCKWSa>oaSr1s?^+=$;8^HH$kdAR>elb(z2G2V3%h++pjWyX6&42VOHqtS$ z;^AdW5u!hlc)5GY~FcIVFElqwL6jC4TJz_|2Rgw9p>)}7$OB~?yF)aCvtRed8 
zcY+`sfNJ~fhNKyh97|QT1TmFIkc#Nkn>(*|jUQ;HJ_UXUeWDdIJdvN7{s@wfu0np} zM?4KbzYFQKE&vssrPwYfby>8gG#X72910Sl%rSL5#hD2x3mEELQEKOo;@4o^VR?Hn z`1gqmB!m9i5cTMdGN;5MYi!M=0ej!o)K+CJLlOs23;TOUWmAE%)Dv&E13u$@LQuWi zV5~${z*r~K38t&cF(hC0&9!KEW*fW1s52u>8*ig7|2{0kPwn=W&lEmoi27CvTmu@) zF~?{DS-aM!fdiYrK>wlEh!prLc%B=;-3H#L1E+9`|EN5(MUeW`L zU!rk^5r@IqK>=8JeeBO&pn|4cV7`8ho&WL?<*<`)PpdVE)ZvD4`l#nD^XaINury0F zsFlMANM#vfh28-K>kANvdq13`N?5Edmlj)9uB;dAn_C5Y|50e188ON9Rrku zghuWWb&=D%45t~0ZLgcRntJE!cF#>cR2U3{oLnL$$&guSLVwT&=ZFuL&DtEu_MWcM zT~t+w4fHo~_7eakZ%KeGVl5Kg9M%ASmHdZZcBH@cbc+>7N}Zkq8Hs)){T?U`Rz^m zbuqnCT^(@tSpbM-i5pk(6o@if0Utp^6eKxSBXjrfaoD=n#eOUtyuU{YGB^zkaXJRfUGH0Xzs5&qI`xisl1Cl) zj6oOm@91~h$?a9OipL4`WOB)}l8(^)gE?`EuU@tI;Zg=CsELUe&gfE3)@JIuF(nhU z()lOv<$=sFy5Aq_nqvokI}6v$Fv;%V26hz?=a>#8TbKrgI9SO?NZ$?}Igg*1{X~Zc zy0zvR(HsXntM&eL`XDD#BSJSCq_&UlFx6DK6+!H`8o}EnI_!<(W8uD2)RJJ7I!!Dn%L}`X(mqg2*Sqp^4SeARWM6< zzIU|Oc(qy16n#vr$mzrE{X!OKXEFFTp-VuVemdgT-*yY*DVfX;Bl2nCJ}J2 zNvv8Ebp6F&d?a%yG0t;2HC^Mx{k((Ijx@#h!L^*9KrsX3&?eupb&u}ZX<>L*L+CdM z=!_;-jpnP%>O)+|k#{2WO?kO?MpzN#BfJ(7X$+;32GhAuwOTBr4`a7i42&NfpQoSp*2gmAPpBYO&Oi0;%|iz|1X^Y%UupHtNhfaeY#e5Oo<{{>yN01udjb8mJ& z&=o_b9kYG`^uB8%OYf3en{KsS zjy$j=mC@c_*ncFlCUOYq36SUUwl_tH3g|9Xjj!3P)=y@>+UXDMJ?+snn@M>o^6rZd60maUm^@D04zVcZFX{ram)QA}p7Tup z1xDzXdh_=pd7%&WnkCu9{7>QRr~PJiqaK{N0_>F7O;(@W#LQQoHF-UKwmGaAQUvr# z@(l?pU`xIoS#8ZuXDjt%20!}R!FjmdIz?&%lo`gm%W&PI|8>Fn9k6XHV|2H{F%=xiA0PRPw6rls~%;tFWDFmWxF%iRZhMn@&QS$c)!(vFB+0PZ@S`|h#fNE)Dm}F}- zMVt5g)4Pap7*PKsJtQpnG~_YoTa7BJF$FHkX`?-EIIz>4(MAWe56EA4j2QPR~bnvyf>ZEGoira=#KvhI8-QUsWZYt`^jw8~dLp^KiXM|4w zwm<#es>{7%ovvW=9bxEkevg=gE_xlF|9?cVd*6!)$zh?ys_59?*nrC#<-4(K_evqV zz0;!A8FQo5Bjo0S^X*Qk4_hp6*^ryq`QR*Q_mCx;EC9YkwC?<)sn}yS&9l2uicV54 zQFToL?qn+F_tunkTs1VM=#vMRBmug3fAII^{VxI0D+}~)5%ag%tP1%qJi>bBo86?Q zFF_K}*CA+!G0o>0|-UyFLsVnn*5olsrOv$C1X2 zjA)qS`YYvuAzDG`kkA-rrw`7+VGeq|9$5dLu0eP&feb-@!pYU%BvPjLkp54W>XJp}$I(9y-spaUAh0G(a90DAH= z*(la|BD5A;z=W!0MEub~E|LCL^Cc%pSs_4=A$RbxvNx2~XU}*WMSr@5yl2D(EDZNLQ0KqgdAK|M zq3l+8g5kY1z31ANBF+oTLq{vGH3eJPS?vE#rY6czQBG#AJ=48mhv9bQ*Xfm#=)>vI zc$mn4IP>#ey`M{764PnP>(4hlbk32lo1u7P%pt+0)4+nUXpyY`4V%~?cVRQk+?eUp z`0>$K5R@;$NVR)iKoM`nguE!4(Q-qic!bMjh^)*#Zq&Ctr%f_J-k^MP z9CnN0hZ_v_|Am4nhl%>_mQhSC~q~&wY;S01P1dHRuB^S zzU<)X?VY%Bgw%-NA}Dt~fKT)M*?D>|GV2~C0g!Y>U5Ax<68+C8?HzeHS(O)Jf%`a~ zR2@-tk+%KOa(*<&I0T)+*_UKa%kCt7&JW3Uy1{%%tn#bw~)iG5?lGT z;?-~dzG!0@9(^r3#aunSt*5LHm8v7M(l|yH91)MnFQ=`(VUICGE=|$$_NV^#oEkq#)A=V_M`MamBQp-F0?WUp9vEPt=9oUG>1|GA-goOhFQZ z0%ZmQwqgc4yU*~vTP$ewUb&wVTO$F)vn zxap2y$vudYZryzA`};^73$j2?zXRnkQZe51Ybwm}k)|g?fa7k&rq%wca&q!(tX=F` ztne@O{_rwh6A^T@EF386JyaoAq;i-kqK|Do!NiO~?+=^5v^OV1%6F|CMgwDw`IDh0vjKXvfeh8#Aik18lYU* z$lS0nu1jty>IKNPP#y4d$dJjWI_UvKA+6I57gct6K;6_Pc_4Hv+mnuOPSu?u8LT-?LovX(2mnUw= zCnA}3Pp6n*PB!?Y{b8t$iQb;FN|U{eSC+~I4nej1wlyx$jeamIRA1(REP|3EEvk0@ zlhdf#6W%r3EzjOE70WVZ6h;VQ{-d{Q$-6_lA=GOjNALJ&wZ0cLXp1?|JoKfq^Y+IrQaLky0U?z#Q|@qPcRm0H2S4bP*6&A3BK_5c1QlS0UszNhD$OyXXM$LU z8UrSsQ9t}iYA8}>ZDgq>xmGNwFBA6r(gu>5i^?vLhWNT>J($jO(#-b-rvAbXhfTDQ z^s(n$8J7`1LcLzG=52c4$BAyW;|8aZtZO-;Z}M=Z%lTG(gm31|CPk=-MwD~rr-(iO z{9j;(U}YR-s!Q1&c##5m7YUM7I9?up0j3&qDH^b}sFzGwLqnZW#Y0 zsn~#e`M$yxaoU{gNU6R+O6Q=)Fw5EL5WoZ02+G(fTToA%c*;*X*@sS4=9s*u{CCo=~34G88GhTBGbgVxlw3H2WBeC4jR&PLBsY$HcBCd;M}M<402zxZcj) zz)z*VzxGoF!_85Y6>*q$mimI2VvWSDt_@y*zU*DN`y-nJ^yUOIXt^Mhr6z6^;&W(u zRvp9-(nw|HZD{hcfH#$Z#QqajVv-FNNk;8s>u9$*$ay;li8U3MaNftbGPb}=5*>?E zTavmXVj0I-whrg4UY`r(d1j#lO|YQvkJ&kKnFDCLxs3X<3+>1flKiMWC^B$$}2z5Rq2M% z?(k9LapGJu<`DT`{p&xznkAr=5q*7<02ED;Uo86P^>rN*&}8Gw{1?cD0651+YodYB zG-OnADVR5s4m{U}c4y=Vv_`c$RbPJuLN88Jo@?k@aLb0tN7NdvT@AYjB~d^M(?}*h 
zSb`rFvX9rjkE&FLS3rX;B_xn2*$IT`omK1}81hpmfm&?3G+#67}#xVp!y*2*1_u z^-Y@k*b@0c09^P5$sd7!z3RRtKQT=67w}#oc#qUhpYr;dWhP46DFnn2Lt3dfq)fmC z)yO=H3=&jl0za6eG%|sA;5V!HR}7aOUfq#?g3AaWV?q}>2bro90>VRl>-D00+wmU; zri=CK1Iyzne*1l=F9qJFu-3E(hHm7C_K*F}KF}Jv?L}M z23~!CeUecoe6)FQ6|H4+N#?=5pY1DmmeyVz5r<#meU(zZj79}rA2G+Hk4Gm7>k!~} zqK}4k@>B&LoND8wdZFdiUr^YLELT3-ANbzTOldz1>htz1SP(DTyE>1mlZ=jjx8*SE z1=u7<4sdaHCD6Ve?E_t*+0F@5KIcjy+6hD^j0QKfwCoITFl?^AlARv^tlbT;B1`LU zWp$?_TI4idwMQ7I4mX)5F^r?`WldUKkDKg476*5u2KtxATlyykmkiE8h&a(O-FP(2 z{}0dD@l~9c2q%_Z7r+Zj z(MnkiH8e4e(R{^fwnK6s`d)}|A`$PZFqM{ca-vkzaOvf$e>6pIEdIF`j0*a8Q;s3@ z-D-0UYV0utJ}5PP{5R8(SNJK}2h7@x9LV3XilB{tt<(Nx>nCPfqjReKnBoN4A)sMTuSZ4=`m4zMJZ%il1P zxsHmoOibLwi)xyl$Q z_T4@fRARt+Miel`qkuJi+(1)gPd2t1TB8uT^8^fN;ALXJ3pvtTdsY9_1jI}K8+{tg zbR$3T)xV%={{Q|$|CAk%r+NZnoVVSn>ZRk%= + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml new file mode 100644 index 0000000000..a8c859d8b3 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml new file mode 100644 index 0000000000..c7b4b2e4a1 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml new file mode 100644 index 0000000000..a8bb4b2f64 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml new file mode 100644 index 0000000000..1627ed98c0 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml new file mode 100644 index 0000000000..b327a544f2 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml @@ -0,0 +1,55 @@ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml index 089acb572b..ec215e63ba 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml @@ -1,44 +1,237 @@ - - + + + + + + + + + - + + + + + + -