From 889e5cbc0943958fe111f4ec373a2301832d4dd1 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 25 Jul 2024 15:52:54 -0700 Subject: [PATCH 01/75] Enable SPIR-V compiler optimization (#4402) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4402 Call the SPIR-V compiler with the `-O` flag, which enables optimizations when compiling GLSL to SPIR-V. The `-Os` flag (which tries to minimize SPIR-V size) was tested as well, but resulted in (very) slightly worse performance. Reviewed By: jorgep31415 Differential Revision: D60193514 fbshipit-source-id: 2dfb999fb1951a63a990773ab563a1a3a3c304b0 --- backends/vulkan/runtime/gen_vulkan_spv.py | 44 +++++++++++++++-------- backends/vulkan/targets.bzl | 29 +++++++++------ 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index c9e3aaa31e..c734ed395e 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -482,6 +482,7 @@ def __init__( src_dir_paths: Union[str, List[str]], env: Dict[Any, Any], glslc_path: Optional[str], + glslc_flags: str = "", ) -> None: if isinstance(src_dir_paths, str): self.src_dir_paths = [src_dir_paths] @@ -490,6 +491,7 @@ def __init__( self.env = env self.glslc_path = glslc_path + self.glslc_flags = glslc_flags self.glsl_src_files: Dict[str, str] = {} self.template_yaml_files: List[str] = [] @@ -668,19 +670,23 @@ def process_shader(shader_paths_pair): if self.glslc_path is not None: spv_out_path = os.path.join(output_dir, f"{shader_name}.spv") - cmd = [ - self.glslc_path, - "-fshader-stage=compute", - glsl_out_path, - "-o", - spv_out_path, - "--target-env=vulkan1.1", - "-Werror", - ] + [ - arg - for src_dir_path in self.src_dir_paths - for arg in ["-I", src_dir_path] - ] + cmd = ( + [ + self.glslc_path, + "-fshader-stage=compute", + glsl_out_path, + "-o", + spv_out_path, + "--target-env=vulkan1.1", + "-Werror", + ] + + [ + arg + for src_dir_path in self.src_dir_paths + for arg in ["-I", src_dir_path] + ] + + self.glslc_flags.split() + ) subprocess.check_call(cmd) @@ -966,6 +972,8 @@ def main(argv: List[str]) -> int: parser.add_argument("-c", "--glslc-path", required=True, help="") parser.add_argument("-t", "--tmp-dir-path", required=True, help="/tmp") parser.add_argument("-o", "--output-path", required=True, help="") + parser.add_argument("--optimize_size", action="store_true", help="") + parser.add_argument("--optimize", action="store_true", help="") parser.add_argument( "--env", metavar="KEY=VALUE", nargs="*", help="Set a number of key-value pairs" ) @@ -984,7 +992,15 @@ def main(argv: List[str]) -> int: if not os.path.exists(options.tmp_dir_path): os.makedirs(options.tmp_dir_path) - shader_generator = SPVGenerator(options.glsl_paths, env, options.glslc_path) + glslc_flags = "" + if options.optimize_size: + glslc_flags += "-Os" + elif options.optimize: + glslc_flags += "-O" + + shader_generator = SPVGenerator( + options.glsl_paths, env, options.glslc_path, glslc_flags + ) output_spv_files = shader_generator.generateSPV(options.tmp_dir_path) genCppFiles( diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl index 981552f17a..e8b232098b 100644 --- a/backends/vulkan/targets.bzl +++ b/backends/vulkan/targets.bzl @@ -1,12 +1,15 @@ +load("@fbcode_macros//build_defs:native_rules.bzl", "buck_genrule") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def get_vulkan_compiler_flags(): return ["-Wno-missing-prototypes", 
"-Wno-global-constructors"] def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False): - gen_vulkan_spv_target = "//executorch/backends/vulkan:gen_vulkan_spv_bin" - glslc_path = "//caffe2/fb/vulkan/dotslash:glslc" + gen_vulkan_spv_target = "//xplat/executorch/backends/vulkan:gen_vulkan_spv_bin" + glslc_path = "//xplat/caffe2/fb/vulkan/dotslash:glslc" + if is_fbcode: + gen_vulkan_spv_target = "//executorch/backends/vulkan:gen_vulkan_spv_bin" glslc_path = "//caffe2/fb/vulkan/tools:glslc" glsl_paths = [] @@ -15,21 +18,25 @@ def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False): for target, subpath in spv_filegroups.items(): glsl_paths.append("$(location {})/{}".format(target, subpath)) - genrule_cmd = [ - "$(exe {})".format(gen_vulkan_spv_target), - "--glsl-paths {}".format(" ".join(glsl_paths)), - "--output-path $OUT", - "--glslc-path=$(exe {})".format(glslc_path), - "--tmp-dir-path=$OUT", - ] + genrule_cmd = ( + "$(exe {}) ".format(gen_vulkan_spv_target) + + "--glsl-paths {} ".format(" ".join(glsl_paths)) + + "--output-path $OUT " + + "--glslc-path=$(exe {}) ".format(glslc_path) + + "--tmp-dir-path=$OUT " + + select({ + "DEFAULT": "", + "ovr_config//os:android": "--optimize", + }) + ) genrule_name = "gen_{}_cpp".format(name) - runtime.genrule( + buck_genrule( name = genrule_name, outs = { "{}.cpp".format(name): ["spv.cpp"], }, - cmd = " ".join(genrule_cmd), + cmd = genrule_cmd, default_outs = ["."], labels = ["uses_dotslash"], ) From faeeca8ec9040ae2db23973139c1b5f71ea51d4c Mon Sep 17 00:00:00 2001 From: Wei Lu Date: Thu, 25 Jul 2024 21:23:45 -0700 Subject: [PATCH 02/75] remove unused tensors from VK model's graph (#4427) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4427 We implemented [operators fusion](https://github.com/pytorch/executorch/pull/3769?fbclid=IwZXh0bgNhZW0CMTEAAR3kYya0wRrkupmV86OpPZZ9_QhqLYEmNrKcJk5Jj_4VSO_WqvFsbWNigTs_aem_gQeSu2zvazf_hpy3RsIXhg) (`conv+bn`) which fused `conv` and `bn`'s weights and biases, but the old parameters are not deleted. Hence we saw that VK model's size is nearly twice large as CPU's. As regards mobilenet_v2, before this diff CPU vs VK: 14M vs 22M. After this diff, both of them have 14M. 
Reviewed By: SS-JIA Differential Revision: D60257047 fbshipit-source-id: ca9e0f38d53187edff9dba45fdeffa619fde51a7 --- backends/vulkan/serialization/vulkan_graph_builder.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py index 477e54a2d7..da40f0a720 100644 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ b/backends/vulkan/serialization/vulkan_graph_builder.py @@ -262,6 +262,9 @@ def get_or_create_value_for(self, arg: _Argument): raise RuntimeError(f"Cannot create value for arg of type {type(arg)}") def process_placeholder_node(self, node: Node) -> None: + # ignores any tensors that don't get used in any ops + if len(node.users) == 0: + return None ids = self.create_node_value(node) if not self.is_param_node(node): if isinstance(ids, int): From 11407f05edf6e5304dc3199c9b4bad345387d5a2 Mon Sep 17 00:00:00 2001 From: Avik Chaudhuri Date: Fri, 26 Jul 2024 01:39:05 -0700 Subject: [PATCH 03/75] immutable accessors in graph signature (#4428) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4428 X-link: https://github.com/pytorch/pytorch/pull/131807 bypass-github-export-checks Test failures either unrelated or due to cross-dependencies between repos bypass-github-executorch-ci-checks bypass-github-pytorch-ci-checks Reviewed By: ydwu4 Differential Revision: D60253955 fbshipit-source-id: eb6eb65bf17fd7e20287881a297d9eac2cbee691 --- exir/passes/constant_prop_pass.py | 6 +++--- exir/program/_program.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py index 354a1b071f..6ab3abbd7b 100644 --- a/exir/passes/constant_prop_pass.py +++ b/exir/passes/constant_prop_pass.py @@ -212,11 +212,11 @@ def erase_constant_node( ) -> None: # Remove corresponding tensor from param/constants dict. 
signature = exported_program.graph_signature - if name := signature.inputs_to_parameters.pop(node.name, None): + if name := signature.inputs_to_parameters.get(node.name, None): exported_program.state_dict.pop(name, None) - elif name := signature.inputs_to_lifted_tensor_constants.pop(node.name, None): + elif name := signature.inputs_to_lifted_tensor_constants.get(node.name, None): exported_program.constants.pop(name, None) - elif name := signature.inputs_to_buffers.pop(node.name, None): + elif name := signature.inputs_to_buffers.get(node.name, None): exported_program.constants.pop(name, None) exported_program.state_dict.pop(name, None) diff --git a/exir/program/_program.py b/exir/program/_program.py index 6fb9eca46f..fd6253a8aa 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -225,7 +225,7 @@ def lift_constant_tensor_pass(ep): return ep graph_signature = ep.graph_signature - buffers = graph_signature.buffers + buffers = list(graph_signature.buffers) fake_mode = list(ep.graph.nodes)[0].meta["val"].fake_mode first_user_input = None From 5d3ec1323183aa1bcbba8026986d2aca3fab88d3 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 26 Jul 2024 10:11:33 -0700 Subject: [PATCH 04/75] Revert D60253955: immutable accessors in graph signature Differential Revision: D60253955 Original commit changeset: eb6eb65bf17f Original Phabricator Diff: D60253955 fbshipit-source-id: f203ef791da6f7efa40bf51a6e905eba65cb6b47 --- exir/passes/constant_prop_pass.py | 6 +++--- exir/program/_program.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py index 6ab3abbd7b..354a1b071f 100644 --- a/exir/passes/constant_prop_pass.py +++ b/exir/passes/constant_prop_pass.py @@ -212,11 +212,11 @@ def erase_constant_node( ) -> None: # Remove corresponding tensor from param/constants dict. 
signature = exported_program.graph_signature - if name := signature.inputs_to_parameters.get(node.name, None): + if name := signature.inputs_to_parameters.pop(node.name, None): exported_program.state_dict.pop(name, None) - elif name := signature.inputs_to_lifted_tensor_constants.get(node.name, None): + elif name := signature.inputs_to_lifted_tensor_constants.pop(node.name, None): exported_program.constants.pop(name, None) - elif name := signature.inputs_to_buffers.get(node.name, None): + elif name := signature.inputs_to_buffers.pop(node.name, None): exported_program.constants.pop(name, None) exported_program.state_dict.pop(name, None) diff --git a/exir/program/_program.py b/exir/program/_program.py index fd6253a8aa..6fb9eca46f 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -225,7 +225,7 @@ def lift_constant_tensor_pass(ep): return ep graph_signature = ep.graph_signature - buffers = list(graph_signature.buffers) + buffers = graph_signature.buffers fake_mode = list(ep.graph.nodes)[0].meta["val"].fake_mode first_user_input = None From 91298923a0076c1b41059efb6dad2876426e4b03 Mon Sep 17 00:00:00 2001 From: Avik Chaudhuri Date: Fri, 26 Jul 2024 12:45:15 -0700 Subject: [PATCH 05/75] immutable accessors in graph signature (#4433) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4433 splitting ET part of D60253955 Reviewed By: guangy10, zhxchen17 Differential Revision: D60295940 fbshipit-source-id: 4ad9a661a50db9b9e9bccbc13b232416d7264a49 --- exir/passes/constant_prop_pass.py | 6 +++--- exir/program/_program.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py index 354a1b071f..6ab3abbd7b 100644 --- a/exir/passes/constant_prop_pass.py +++ b/exir/passes/constant_prop_pass.py @@ -212,11 +212,11 @@ def erase_constant_node( ) -> None: # Remove corresponding tensor from param/constants dict. signature = exported_program.graph_signature - if name := signature.inputs_to_parameters.pop(node.name, None): + if name := signature.inputs_to_parameters.get(node.name, None): exported_program.state_dict.pop(name, None) - elif name := signature.inputs_to_lifted_tensor_constants.pop(node.name, None): + elif name := signature.inputs_to_lifted_tensor_constants.get(node.name, None): exported_program.constants.pop(name, None) - elif name := signature.inputs_to_buffers.pop(node.name, None): + elif name := signature.inputs_to_buffers.get(node.name, None): exported_program.constants.pop(name, None) exported_program.state_dict.pop(name, None) diff --git a/exir/program/_program.py b/exir/program/_program.py index 6fb9eca46f..fd6253a8aa 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -225,7 +225,7 @@ def lift_constant_tensor_pass(ep): return ep graph_signature = ep.graph_signature - buffers = graph_signature.buffers + buffers = list(graph_signature.buffers) fake_mode = list(ep.graph.nodes)[0].meta["val"].fake_mode first_user_input = None From 5a20a49517c5c05a71692d6d6885735a2cd30bb1 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Fri, 26 Jul 2024 13:52:34 -0700 Subject: [PATCH 06/75] Fix numpy and pandas versions. 
 (#4430)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4430

Numpy has to match the coremltools requirements, and Pandas depends on Numpy: https://github.com/apple/coremltools/blob/main/reqs/build.pip

Reviewed By: kirklandsign

Differential Revision: D60265982

fbshipit-source-id: c84dd319c19fb48dc6d4ad3ffc8accd1fdc9b840
---
 .ci/docker/requirements-ci.txt | 6 +++++-
 pyproject.toml                 | 7 +++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 3a0cd57ddb..c33cc533c0 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -1,5 +1,7 @@
 mpmath==1.3.0
-numpy==1.25.2
+numpy==1.21.3; python_version == '3.10'
+numpy==1.23.2; python_version == '3.11'
+numpy; python_version >= '3.12'
 PyYAML==6.0.1
 ruamel.yaml==0.17.32
 sympy==1.12
@@ -8,6 +10,8 @@ tomli==2.0.1
 torchsr==1.0.4
 transformers==4.38.0
 zstd==1.5.5.1
+pandas==2.0.3; python_version == '3.10'
+pandas; python_version >= '3.11'
 pytest==7.2.0
 pytest-cov==4.1.0
 expecttest==0.1.6
diff --git a/pyproject.toml b/pyproject.toml
index b23091cc5f..e83fe2bc2a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,9 +55,12 @@ dependencies=[
   "flatbuffers",
   "hypothesis",
   "mpmath==1.3.0",
-  "numpy>=1.25.2",
+  "numpy==1.21.3; python_version == '3.10'",
+  "numpy==1.23.2; python_version == '3.11'",
+  "numpy; python_version >= '3.12'",
   "packaging",
-  "pandas",
+  "pandas==2.0.3; python_version == '3.10'",
+  "pandas; python_version >= '3.11'",
   "parameterized",
   "pytest",
   "pytest-xdist",

From 1e4603d2e8264d61a006a1a27258214c15d465ce Mon Sep 17 00:00:00 2001
From: Gyanendra Sinha
Date: Mon, 29 Jul 2024 00:24:58 -0700
Subject: [PATCH 07/75] FileDataLoader fails to read the file when size >
 INT32_MAX (#4435)

Summary:
On macOS, the `read` function will fail with an `EINVAL` error if the size parameter exceeds `INT32_MAX`. This update addresses the issue by adding a check to ensure that the read size does not surpass `INT32_MAX`.

On Linux, the maximum permissible read size is 2,147,479,552 bytes (< `INT32_MAX`), so attempting to read beyond this limit is inconsequential.

Pull Request resolved: https://github.com/pytorch/executorch/pull/4435

Test Plan:
Exporting llama3 with `python -m examples.models.llama2.export_llama --checkpoint examples/models/llama-2-7B/consolidated.00.pth --params examples/models/llama-2-7B/params.json --coreml --disable_dynamic_shape -kv`

Without fix: fails with `invalid argument` error.

With fix: succeeds.

Reviewed By: kirklandsign

Differential Revision: D60321719

Pulled By: shoumikhin

fbshipit-source-id: fca265c6c1edc628b38a5044693ec7bbe0c0b43a
---
 extension/data_loader/file_data_loader.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/extension/data_loader/file_data_loader.cpp b/extension/data_loader/file_data_loader.cpp
index 651bc713db..7b041fef00 100644
--- a/extension/data_loader/file_data_loader.cpp
+++ b/extension/data_loader/file_data_loader.cpp
@@ -8,9 +8,11 @@

 #include <executorch/extension/data_loader/file_data_loader.h>

+#include <algorithm>
 #include <cerrno>
 #include <cstddef>
 #include <cstring>
+#include <limits>

 #include <fcntl.h>
 #include <sys/stat.h>
@@ -189,7 +191,12 @@ Result<FreeableBuffer> FileDataLoader::load(
   size_t needed = size;
   uint8_t* buf = reinterpret_cast<uint8_t*>(aligned_buffer);
   while (needed > 0) {
-    ssize_t nread = ::read(fd_, buf, needed);
+    // Reads on macos will fail with EINVAL if size > INT32_MAX.
+    ssize_t nread = ::read(
+        fd_,
+        buf,
+        std::min(
+            needed, static_cast<size_t>(std::numeric_limits<int32_t>::max())));
     if (nread < 0 && errno == EINTR) {
       // Interrupted by a signal; zero bytes read.
      continue;

From dd88708719b488db6fd89c9b9846a207ea22f001 Mon Sep 17 00:00:00 2001
From: Scott Wolchok
Date: Mon, 29 Jul 2024 12:15:06 -0700
Subject: [PATCH 08/75] Hoist numel out of loop condition in op_embedding
 (#4146)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4146

This seems to save a single instruction in the inner loop (on x86, but I expect other architectures to be similar).
ghstack-source-id: 235302150

Reviewed By: tarun292

Differential Revision: D59335729

fbshipit-source-id: cf22669ffd8b127e60d863e4bc7858f994d8b1ce
---
 kernels/portable/cpu/op_embedding.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernels/portable/cpu/op_embedding.cpp b/kernels/portable/cpu/op_embedding.cpp
index 19e915ebc0..ffa43da739 100644
--- a/kernels/portable/cpu/op_embedding.cpp
+++ b/kernels/portable/cpu/op_embedding.cpp
@@ -37,7 +37,8 @@ void embedding_kernel(
   char* out_data = out.mutable_data_ptr<char>();
   const CTYPE* indices_ptr = indices.const_data_ptr<CTYPE>();
   ssize_t weight_height = weight.size(0);
-  for (int i = 0; i < indices.numel(); i++) {
+  const auto indices_numel = indices.numel();
+  for (int i = 0; i < indices_numel; i++) {
     // Ensure index is larger than 0 and smaller than weight.size(0)
     ET_KERNEL_CHECK_MSG(
         ctx,

From e087ac83fcfa5e051b6ab812ed969c3768b63c4b Mon Sep 17 00:00:00 2001
From: winskuo-quic
Date: Mon, 29 Jul 2024 15:03:31 -0700
Subject: [PATCH 09/75] Qualcomm AI Engine Direct - Fix UT example script hang
 when an exception happens (#4355)

Summary:
- Fix UT example script hang when an exception happens during execution: while the main process is waiting for the child process to return a message, the child process exits without the exception being properly caught.
- Remove the RemoveRedundancy pass from the quantizer to resolve memory format issues while quantizing.
- Prevent constants from being dequantized twice in the AnnotateQuantAttrs pass

Pull Request resolved: https://github.com/pytorch/executorch/pull/4355

Reviewed By: kirklandsign

Differential Revision: D60177584

Pulled By: cccclai

fbshipit-source-id: fc4d277b8eef05bd42c4eae2b9aa67236f53cc32
---
 .../qualcomm/passes/annotate_quant_attrs.py   |   7 +-
 .../passes/recompose_pixel_unshuffle.py       |   8 +-
 backends/qualcomm/quantizer/quantizer.py      |   2 -
 backends/qualcomm/tests/test_qnn_delegate.py  | 116 +++++++++++++-----
 examples/qualcomm/llama2/llama.py             |   9 +-
 examples/qualcomm/oss_scripts/dino_v2.py      |  60 +++++----
 examples/qualcomm/oss_scripts/esrgan.py       |  98 ++++++++-------
 examples/qualcomm/oss_scripts/fbnet.py        |  58 +++++----
 .../oss_scripts/gMLP_image_classification.py  |  59 +++++----
 examples/qualcomm/oss_scripts/squeezenet.py   |  62 ++++++----
 examples/qualcomm/oss_scripts/ssd300_vgg16.py |  98 ++++++++-------
 examples/qualcomm/scripts/deeplab_v3.py       |  52 ++++----
 examples/qualcomm/scripts/edsr.py             |  84 +++++++------
 examples/qualcomm/scripts/inception_v3.py     |  62 ++++++----
 examples/qualcomm/scripts/inception_v4.py     |  62 ++++++----
 .../qualcomm/scripts/mobilebert_fine_tune.py  |  82 +++++++------
 examples/qualcomm/scripts/mobilenet_v2.py     |  62 ++++++----
 examples/qualcomm/scripts/mobilenet_v3.py     |  62 ++++++----
 examples/qualcomm/scripts/torchvision_vit.py  |  56 +++++----
 examples/qualcomm/scripts/utils.py            |   2 +-
 20 files changed, 652 insertions(+), 449 deletions(-)

diff --git a/backends/qualcomm/passes/annotate_quant_attrs.py b/backends/qualcomm/passes/annotate_quant_attrs.py
index 199d26b026..0dc39d2a4d 100644
--- a/backends/qualcomm/passes/annotate_quant_attrs.py
+++ b/backends/qualcomm/passes/annotate_quant_attrs.py
@@ -94,9 +94,11 @@ def _dequant_fold_params(self, n, quant_attrs, param):
     def _annotate_quant_attrs(
         self, graph_module: torch.fx.GraphModule
     ) -> torch.fx.GraphModule:
+        # Keep track of const params that has been dequant, so it does not get
+        # dequant multiple times if the const param has more than 1 user
+        visited_const_param = set()
         for n in graph_module.graph.nodes:
             self._annotate_requant(n)
-
         # With fold_quant enabled, check if the input of dq op is quantized param.
param = None if n.target in dq_ops: @@ -106,7 +108,8 @@ def _annotate_quant_attrs( quant_attrs = get_quant_attrs(self.edge_program, n) self._annotate_source_nodes(n, quant_attrs) - if param is not None: + if param is not None and n.args[0] not in visited_const_param: + visited_const_param.add(n.args[0]) self._dequant_fold_params(n, quant_attrs, param) return graph_module diff --git a/backends/qualcomm/passes/recompose_pixel_unshuffle.py b/backends/qualcomm/passes/recompose_pixel_unshuffle.py index cadc310bbb..a47f3d119a 100644 --- a/backends/qualcomm/passes/recompose_pixel_unshuffle.py +++ b/backends/qualcomm/passes/recompose_pixel_unshuffle.py @@ -35,7 +35,13 @@ def call(self, graph_module: torch.fx.GraphModule): for node in graph.nodes: if node.op == "call_function" and node.target == self.reshape_target: with graph.inserting_after(node): - premute_node = node.args[0] + + # Clone op still exists between permute and reshape_target during quantization, + # so we need to check for args[0].args[0] to get permute node + if self.quantization_capture: + premute_node = node.args[0].args[0] + else: + premute_node = node.args[0] if any( [ len(node.args[1]) != 4, diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index 91e31b62e4..d51e016473 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -12,7 +12,6 @@ RecomposePixelUnshuffle, ) from executorch.backends.qualcomm.passes.reduce_dynamic_range import ReduceDynamicRange -from executorch.backends.qualcomm.passes.remove_redundancy import RemoveRedundancy from executorch.backends.qualcomm.passes.replace_inf_buffer import ReplaceInfBuffer from executorch.backends.transforms.decompose_sdpa import ( DecomposeScaledDotProductAttention, @@ -182,7 +181,6 @@ def set_per_channel_linear_quant(self, enable: bool) -> None: self._update_per_channel_weight_quant_ops(linear_ops, enable) def transform_for_annotation(self, model: GraphModule) -> GraphModule: - model = RemoveRedundancy()(model).graph_module model = ReduceDynamicRange()(model).graph_module model = RecomposePixelUnshuffle(quantization_capture=True)(model).graph_module model = DecomposeScaledDotProductAttention()(model).graph_module diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 508a027da6..f9d05131bb 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -1581,8 +1581,11 @@ def test_fbnet(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 90) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 90) def test_gMLP(self): if not self.required_envs([self.image_dataset]): @@ -1614,8 +1617,11 @@ def test_gMLP(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 90) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 90) def test_ssd300_vgg16(self): if not self.required_envs([self.pretrained_weight, self.oss_repo]): @@ -1649,7 +1655,10 @@ def test_ssd300_vgg16(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["mAP"], 0.70) + if "Error" in msg: + 
self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["mAP"], 0.70) def test_dino_v2(self): if not self.required_envs([self.image_dataset]): @@ -1680,8 +1689,11 @@ def test_dino_v2(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 70) - self.assertGreaterEqual(msg["top_5"], 85) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 70) + self.assertGreaterEqual(msg["top_5"], 85) def test_esrgan(self): if not self.required_envs(): @@ -1714,8 +1726,11 @@ def test_esrgan(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["PSNR"], 24) - self.assertGreaterEqual(msg["SSIM"], 0.8) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["PSNR"], 24) + self.assertGreaterEqual(msg["SSIM"], 0.8) def test_squeezenet(self): if not self.required_envs([self.image_dataset]): @@ -1747,8 +1762,11 @@ def test_squeezenet(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 40) - self.assertGreaterEqual(msg["top_5"], 70) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 40) + self.assertGreaterEqual(msg["top_5"], 70) class TestExampleScript(TestQNN): @@ -1794,8 +1812,11 @@ def test_mobilenet_v2(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 80) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 80) def test_mobilenet_v3(self): if not self.required_envs([self.image_dataset]): @@ -1829,8 +1850,11 @@ def test_mobilenet_v3(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 80) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 80) def test_inception_v3(self): if not self.required_envs([self.image_dataset]): @@ -1864,8 +1888,11 @@ def test_inception_v3(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 80) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 80) def test_inception_v4(self): if not self.required_envs([self.image_dataset]): @@ -1899,8 +1926,11 @@ def test_inception_v4(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 60) - self.assertGreaterEqual(msg["top_5"], 80) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 80) def test_vit(self): if not self.required_envs([self.image_dataset]): @@ -1934,8 +1964,11 @@ def test_vit(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["top_1"], 70) - self.assertGreaterEqual(msg["top_5"], 90) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 70) + self.assertGreaterEqual(msg["top_5"], 90) def test_edsr(self): if not self.required_envs(): @@ -1968,8 +2001,11 @@ def test_edsr(self): conn = listener.accept() 
p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["PSNR"], 25) - self.assertGreaterEqual(msg["SSIM"], 0.8) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["PSNR"], 25) + self.assertGreaterEqual(msg["SSIM"], 0.8) def test_deeplab_v3(self): if not self.required_envs(): @@ -2002,9 +2038,12 @@ def test_deeplab_v3(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - self.assertGreaterEqual(msg["PA"], 0.85) - self.assertGreaterEqual(msg["MPA"], 0.70) - self.assertGreaterEqual(msg["MIoU"], 0.55) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["PA"], 0.85) + self.assertGreaterEqual(msg["MPA"], 0.70) + self.assertGreaterEqual(msg["MIoU"], 0.55) def test_stories_single_llama(self): if not self.required_envs(): @@ -2049,8 +2088,11 @@ def test_stories_single_llama(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - model_out = msg["result"][0] - self.assertTrue(model_out.startswith(golden_start_with)) + if "Error" in msg: + self.fail(msg["Error"]) + else: + model_out = msg["result"][0] + self.assertTrue(model_out.startswith(golden_start_with)) def test_mobilebert(self): if not self.required_envs([self.pretrained_weight]): @@ -2085,9 +2127,12 @@ def test_mobilebert(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - cpu, htp = msg["CPU"], msg["HTP"] - for k, v in cpu.items(): - self.assertLessEqual(abs(v[0] - htp[k][0]), 2) + if "Error" in msg: + self.fail(msg["Error"]) + else: + cpu, htp = msg["CPU"], msg["HTP"] + for k, v in cpu.items(): + self.assertLessEqual(abs(v[0] - htp[k][0]), 2) @unittest.skip("will be enabled after TODOs got resolved") def test_ptq_mobilebert(self): @@ -2127,9 +2172,12 @@ def test_ptq_mobilebert(self): conn = listener.accept() p.communicate() msg = json.loads(conn.recv()) - cpu, htp = msg["CPU"], msg["HTP"] - for k, v in cpu.items(): - self.assertLessEqual(abs(v[0] - htp[k][0]), 5) + if "Error" in msg: + self.fail(msg["Error"]) + else: + cpu, htp = msg["CPU"], msg["HTP"] + for k, v in cpu.items(): + self.assertLessEqual(abs(v[0] - htp[k][0]), 5) def test_export_example(self): if not self.required_envs([self.model_name]): diff --git a/examples/qualcomm/llama2/llama.py b/examples/qualcomm/llama2/llama.py index 79cf5606d6..6e0f3f4399 100644 --- a/examples/qualcomm/llama2/llama.py +++ b/examples/qualcomm/llama2/llama.py @@ -586,4 +586,11 @@ def post_process(): if args.compile_only: exit(f"Finish compile_only and save to {args.artifact}") - inference(args) + try: + inference(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/dino_v2.py b/examples/qualcomm/oss_scripts/dino_v2.py index e4d4c6af25..03249b63d8 100644 --- a/examples/qualcomm/oss_scripts/dino_v2.py +++ b/examples/qualcomm/oss_scripts/dino_v2.py @@ -67,31 +67,7 @@ def get_instance(): return model.eval() -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="Path for storing generated artifacts by this example. Default ./dino_v2", - default="./dino_v2", - type=str, - ) - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. 
--dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -170,3 +146,37 @@ def get_instance(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="Path for storing generated artifacts by this example. Default ./dino_v2", + default="./dino_v2", + type=str, + ) + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/esrgan.py b/examples/qualcomm/oss_scripts/esrgan.py index 50dc59cf0c..e4e609e152 100644 --- a/examples/qualcomm/oss_scripts/esrgan.py +++ b/examples/qualcomm/oss_scripts/esrgan.py @@ -40,50 +40,7 @@ def get_instance(repo: str): return model.model.eval() -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. Default ./esrgan", - default="./esrgan", - type=str, - ) - - parser.add_argument( - "-r", - "--hr_ref_dir", - help="Path to the high resolution images", - default="", - type=str, - ) - - parser.add_argument( - "-l", - "--lr_dir", - help="Path to the low resolution image inputs", - default="", - type=str, - ) - - parser.add_argument( - "-d", - "--default_dataset", - help="If specified, download and use B100 dataset by torchSR API", - action="store_true", - default=False, - ) - - parser.add_argument( - "--oss_repo", - help="Path to cloned https://github.com/ai-forever/Real-ESRGAN", - type=str, - required=True, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -174,3 +131,56 @@ def post_process(): else: print(f"Average of PSNR is: {avg_PSNR}") print(f"Average of SSIM is: {avg_SSIM}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. 
Default ./esrgan", + default="./esrgan", + type=str, + ) + + parser.add_argument( + "-r", + "--hr_ref_dir", + help="Path to the high resolution images", + default="", + type=str, + ) + + parser.add_argument( + "-l", + "--lr_dir", + help="Path to the low resolution image inputs", + default="", + type=str, + ) + + parser.add_argument( + "-d", + "--default_dataset", + help="If specified, download and use B100 dataset by torchSR API", + action="store_true", + default=False, + ) + + parser.add_argument( + "--oss_repo", + help="Path to cloned https://github.com/ai-forever/Real-ESRGAN", + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/fbnet.py b/examples/qualcomm/oss_scripts/fbnet.py index d62c4a78b1..fe07ab83d2 100755 --- a/examples/qualcomm/oss_scripts/fbnet.py +++ b/examples/qualcomm/oss_scripts/fbnet.py @@ -23,30 +23,7 @@ ) -if __name__ == "__main__": - parser = setup_common_args_and_variables() - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. Default ./fbnet", - default="./fbnet", - type=str, - ) - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. --dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - args = parser.parse_args() - +def main(args): if not args.compile_only and args.device is None: raise RuntimeError( "device serial is required if not compile only. " @@ -126,3 +103,36 @@ def post_process(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./fbnet", + default="./fbnet", + type=str, + ) + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/gMLP_image_classification.py b/examples/qualcomm/oss_scripts/gMLP_image_classification.py index 3d98f55a7d..e9b9b91507 100644 --- a/examples/qualcomm/oss_scripts/gMLP_image_classification.py +++ b/examples/qualcomm/oss_scripts/gMLP_image_classification.py @@ -59,30 +59,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. Default ./gMLP_image_classification", - default="./gMLP_image_classification", - type=str, - ) - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. 
--dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - args = parser.parse_args() +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -161,3 +138,37 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./gMLP_image_classification", + default="./gMLP_image_classification", + type=str, + ) + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/squeezenet.py b/examples/qualcomm/oss_scripts/squeezenet.py index 53edb98b91..bc000c6938 100644 --- a/examples/qualcomm/oss_scripts/squeezenet.py +++ b/examples/qualcomm/oss_scripts/squeezenet.py @@ -57,32 +57,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. --dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. " - "Default ./squeezenet", - default="./squeezenet", - type=str, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -158,3 +133,38 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./squeezenet", + default="./squeezenet", + type=str, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/ssd300_vgg16.py b/examples/qualcomm/oss_scripts/ssd300_vgg16.py index cd4eb8764f..8fdb896e09 100644 --- a/examples/qualcomm/oss_scripts/ssd300_vgg16.py +++ b/examples/qualcomm/oss_scripts/ssd300_vgg16.py @@ -119,50 +119,7 @@ def SSD300VGG16(pretrained_weight_model): return model.eval() -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. 
Default ./ssd300_vgg16", - default="./ssd300_vgg16", - type=str, - ) - - parser.add_argument( - "-d", - "--download", - help="If specified, download VOCSegmentation dataset by torchvision API", - action="store_true", - default=False, - ) - - parser.add_argument( - "--oss_repo", - help=( - "Repository that contains model backbone and score calculation." - "e.g., --M ./a-PyTorch-Tutorial-to-Object-Detection" - "Please clone the repository from https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection" - ), - type=str, - required=True, - ) - - parser.add_argument( - "-p", - "--pretrained_weight", - help=( - "Location of model pretrained weight." - "e.g., -p ./checkpoint_ssd300.pth.tar" - "Pretrained model can be found in the link https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection, under the Training Section" - ), - type=str, - required=True, - ) - - args = parser.parse_args() - +def main(args): sys.path.insert(0, args.oss_repo) skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) @@ -279,3 +236,56 @@ def post_process(): pp.pprint(APs) adb.pull(output_path=args.artifact, callback=post_process) + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./ssd300_vgg16", + default="./ssd300_vgg16", + type=str, + ) + + parser.add_argument( + "-d", + "--download", + help="If specified, download VOCSegmentation dataset by torchvision API", + action="store_true", + default=False, + ) + + parser.add_argument( + "--oss_repo", + help=( + "Repository that contains model backbone and score calculation." + "e.g., --M ./a-PyTorch-Tutorial-to-Object-Detection" + "Please clone the repository from https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-p", + "--pretrained_weight", + help=( + "Location of model pretrained weight." + "e.g., -p ./checkpoint_ssd300.pth.tar" + "Pretrained model can be found in the link https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection, under the Training Section" + ), + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/deeplab_v3.py b/examples/qualcomm/scripts/deeplab_v3.py index ff1f53c180..d870380e35 100755 --- a/examples/qualcomm/scripts/deeplab_v3.py +++ b/examples/qualcomm/scripts/deeplab_v3.py @@ -61,27 +61,7 @@ def get_dataset(data_size, dataset_dir, download): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. Default ./deeplab_v3", - default="./deeplab_v3", - type=str, - ) - - parser.add_argument( - "-d", - "--download", - help="If specified, download VOCSegmentation dataset by torchvision API", - action="store_true", - default=False, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. 
@@ -196,3 +176,33 @@ def post_process(): print(f"MPA : {mpa}%") print(f"MIoU : {miou}%") print(f"CIoU : \n{json.dumps(cls_iou, indent=2)}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./deeplab_v3", + default="./deeplab_v3", + type=str, + ) + + parser.add_argument( + "-d", + "--download", + help="If specified, download VOCSegmentation dataset by torchvision API", + action="store_true", + default=False, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/edsr.py b/examples/qualcomm/scripts/edsr.py index 54cc8bff19..f602ecc1af 100755 --- a/examples/qualcomm/scripts/edsr.py +++ b/examples/qualcomm/scripts/edsr.py @@ -91,43 +91,7 @@ def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str return SrDataset(hr_dir, lr_dir) -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. Default ./edsr", - default="./edsr", - type=str, - ) - - parser.add_argument( - "-r", - "--hr_ref_dir", - help="Path to the high resolution images", - default="", - type=str, - ) - - parser.add_argument( - "-l", - "--lr_dir", - help="Path to the low resolution image inputs", - default="", - type=str, - ) - - parser.add_argument( - "-d", - "--default_dataset", - help="If specified, download and use B100 dataset by torchSR API", - action="store_true", - default=False, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -223,3 +187,49 @@ def post_process(): else: print(f"Average of PNSR is: {avg_PSNR}") print(f"Average of SSIM is: {avg_SSIM}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./edsr", + default="./edsr", + type=str, + ) + + parser.add_argument( + "-r", + "--hr_ref_dir", + help="Path to the high resolution images", + default="", + type=str, + ) + + parser.add_argument( + "-l", + "--lr_dir", + help="Path to the low resolution image inputs", + default="", + type=str, + ) + + parser.add_argument( + "-d", + "--default_dataset", + help="If specified, download and use B100 dataset by torchSR API", + action="store_true", + default=False, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/inception_v3.py b/examples/qualcomm/scripts/inception_v3.py index 94aa618c72..90eb8cf206 100755 --- a/examples/qualcomm/scripts/inception_v3.py +++ b/examples/qualcomm/scripts/inception_v3.py @@ -58,32 +58,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. 
--dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. " - "Default ./inception_v3", - default="./inception_v3", - type=str, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -159,3 +134,38 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./inception_v3", + default="./inception_v3", + type=str, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/inception_v4.py b/examples/qualcomm/scripts/inception_v4.py index e457fef0f7..84b20e6e20 100755 --- a/examples/qualcomm/scripts/inception_v4.py +++ b/examples/qualcomm/scripts/inception_v4.py @@ -57,32 +57,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. --dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. " - "Default ./inception_v4", - default="./inception_v4", - type=str, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -158,3 +133,38 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. 
" + "Default ./inception_v4", + default="./inception_v4", + type=str, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index 85aafe7cae..8972ca202f 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -220,42 +220,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): return model.eval(), dataloader_val, labels -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. Default ./mobilebert_fine_tune", - default="./mobilebert_fine_tune", - type=str, - ) - - parser.add_argument( - "-p", - "--pretrained_weight", - help="Location of pretrained weight", - default=None, - type=str, - ) - - parser.add_argument( - "-F", - "--use_fp16", - help="If specified, will run in fp16 precision and discard ptq setting", - action="store_true", - default=False, - ) - - parser.add_argument( - "-P", - "--ptq", - help="If specified, will do PTQ quantization. default is 8bits activation and 8bits weight. Support 8a8w, 16a16w and 16a4w.", - default="8a8w", - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -353,3 +318,48 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): print(f"\n[{target[0]}]") for k, v in target[1].items(): print(f"{k}: {v[0]}/{v[1]}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./mobilebert_fine_tune", + default="./mobilebert_fine_tune", + type=str, + ) + + parser.add_argument( + "-p", + "--pretrained_weight", + help="Location of pretrained weight", + default=None, + type=str, + ) + + parser.add_argument( + "-F", + "--use_fp16", + help="If specified, will run in fp16 precision and discard ptq setting", + action="store_true", + default=False, + ) + + parser.add_argument( + "-P", + "--ptq", + help="If specified, will do PTQ quantization. default is 8bits activation and 8bits weight. Support 8a8w, 16a16w and 16a4w.", + default="8a8w", + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/mobilenet_v2.py b/examples/qualcomm/scripts/mobilenet_v2.py index f642e0172c..3ebdcd5d05 100755 --- a/examples/qualcomm/scripts/mobilenet_v2.py +++ b/examples/qualcomm/scripts/mobilenet_v2.py @@ -58,32 +58,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. --dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. 
" - "Default ./mobilenet_v2", - default="./mobilenet_v2", - type=str, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -159,3 +134,38 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./mobilenet_v2", + default="./mobilenet_v2", + type=str, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/mobilenet_v3.py b/examples/qualcomm/scripts/mobilenet_v3.py index d15827160a..18fd7c849a 100644 --- a/examples/qualcomm/scripts/mobilenet_v3.py +++ b/examples/qualcomm/scripts/mobilenet_v3.py @@ -57,32 +57,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. --dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. " - "Default ./mobilenet_v3", - default="./mobilenet_v3", - type=str, - ) - - args = parser.parse_args() - +def main(args): skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) # ensure the working directory exist. @@ -157,3 +132,38 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./mobilenet_v3", + default="./mobilenet_v3", + type=str, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/torchvision_vit.py b/examples/qualcomm/scripts/torchvision_vit.py index cd5463c8a2..cfdbe5d075 100755 --- a/examples/qualcomm/scripts/torchvision_vit.py +++ b/examples/qualcomm/scripts/torchvision_vit.py @@ -56,29 +56,7 @@ def get_data_loader(): return inputs, targets, input_list -if __name__ == "__main__": - parser = setup_common_args_and_variables() - parser.add_argument( - "-d", - "--dataset", - help=( - "path to the validation folder of ImageNet dataset. " - "e.g. 
--dataset imagenet-mini/val " - "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" - ), - type=str, - required=True, - ) - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts by this example. " "Default ./vit", - default="./vit", - type=str, - ) - - args = parser.parse_args() - +def main(args): # ensure the working directory exist. os.makedirs(args.artifact, exist_ok=True) @@ -140,3 +118,35 @@ def get_data_loader(): else: for i, k in enumerate(k_val): print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " "Default ./vit", + default="./vit", + type=str, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/scripts/utils.py b/examples/qualcomm/scripts/utils.py index d803932179..1e4b1c6968 100755 --- a/examples/qualcomm/scripts/utils.py +++ b/examples/qualcomm/scripts/utils.py @@ -187,6 +187,7 @@ def build_executorch_binary( quantizer = QnnQuantizer() quantizer.add_custom_quant_annotations(custom_annotations) quantizer.set_per_channel_linear_quant(per_channel_linear) + quantizer.set_per_channel_conv_quant(True) if quant_dtype == QuantDtype.use_8a8w: pass # default setting @@ -214,7 +215,6 @@ def build_executorch_binary( for data in dataset: annotated_model(*data) quantized_model = convert_pt2e(annotated_model) - edge_prog = capture_program(quantized_model, inputs) else: edge_prog = capture_program(model, inputs) From f695f8e8de71ce028e0414f7ed8fde416a64b822 Mon Sep 17 00:00:00 2001 From: Matthias Cremon Date: Mon, 29 Jul 2024 15:05:34 -0700 Subject: [PATCH 10/75] Support qmatmul with different dims tensors (#4438) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4438 MobileBERT exposes an issue in our kernel, where tensors have compatible (for PyTorch) but different batch dimensions. This diff changes the meta kernel to support that (the kernel can already do it). Reviewed By: dulinriley Differential Revision: D60314979 fbshipit-source-id: a0cde9d328098992787c353611ece64223d6c739 --- backends/cadence/aot/ops_registrations.py | 46 ++++++++++++----------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index c877a7149d..adcf086873 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
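# Editorial shape sketch for the relaxed batch handling in the hunk below,
# using hypothetical sizes: X of shape (6, 4, 5) and Y of shape (2, 3, 5, 7)
# have batch dims (6,) vs. (2, 3). Their products match (6 == 2 * 3), so the
# matmul is accepted and the output takes the longer batch shape: (2, 3, 4, 7).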
+from math import prod
 from typing import Optional, Tuple
 
 import torch
@@ -186,28 +187,29 @@ def quantized_matmul_meta(
     X_size = list(X.size())
     Y_size = list(Y.size())
 
-    assert len(X_size) == len(
-        Y_size
-    ), "quantized matmul not supported for tensors of different dimensions"
-
-    if len(X_size) == 3:
-        assert (
-            X_size[0] == Y_size[0]
-        ), "quantized matmul only supported for batch dimension of same size"
-        if transposed:
-            assert X_size[2] == Y_size[2], "matrices cannot be multiplied"
-            out_size = X_size[:2] + [Y_size[1]]
-        else:
-            assert X_size[2] == Y_size[1], "matrices cannot be multiplied"
-            out_size = X_size[:2] + [Y_size[2]]
-    elif len(X_size) == 2:
-        if transposed:
-            assert X_size[1] == Y_size[1], "matrices cannot be multiplied"
-            out_size = [X_size[0], Y_size[0]]
-        else:
-            assert X_size[1] == Y_size[0], "matrices cannot be multiplied"
-            out_size = [X_size[0], Y_size[1]]
+    # Get the batch dimensions for both tensors
+    X_batch_dims = X_size[:-2]
+    Y_batch_dims = Y_size[:-2]
+
+    # If they don't match, check that they're compatible
+    if X_batch_dims != Y_batch_dims:
+        assert prod(X_batch_dims) == prod(
+            Y_batch_dims
+        ), f"Batch dimensions of X and Y do not match: {X_batch_dims} vs {Y_batch_dims}"
+
+    # Get the matmul output size
+    if transposed:
+        assert X_size[-1] == Y_size[-1], "matrices cannot be multiplied"
+        mat_size = [X_size[-2], Y_size[-2]]
     else:
-        raise AssertionError("quantized matmul only supported for 2D or 3D tensors")
+        assert X_size[-1] == Y_size[-2], "matrices cannot be multiplied"
+        mat_size = [X_size[-2], Y_size[-1]]
+
+    # Combine the larger batch dimensions with the matmul output size
+    out_size = (
+        X_batch_dims + mat_size
+        if len(X_batch_dims) > len(Y_batch_dims)
+        else Y_batch_dims + mat_size
+    )
 
     return X.new_empty(out_size, dtype=X.dtype)
 

From e6684f7662fe067673e0a3c36066f1245127d95d Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Mon, 29 Jul 2024 16:26:30 -0700
Subject: [PATCH 11/75] Use linux.24xlarge for llava test (#4446)

Summary:
Attempt to fix the OOM error after https://github.com/pytorch/executorch/pull/4430

Pull Request resolved: https://github.com/pytorch/executorch/pull/4446

Reviewed By: shoumikhin

Differential Revision: D60418070

Pulled By: huydhn

fbshipit-source-id: 7e119ce52645bfd452064b674e5b8896df3642a0
---
 .github/workflows/pull.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 36099ca651..bbbb976385 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -193,7 +193,7 @@ jobs:
     strategy:
       fail-fast: false
     with:
-      runner: linux.12xlarge
+      runner: linux.24xlarge
       docker-image: executorch-ubuntu-22.04-clang12
       submodules: 'true'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}

From 711ecec40feafea8b74383d46dae342b39936e65 Mon Sep 17 00:00:00 2001
From: Yidi Wu
Date: Mon, 29 Jul 2024 17:05:44 -0700
Subject: [PATCH 12/75] fix zero arg export in training_ir and constant tensor handling (#4382)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4382

Prepare for the re-land of D60006710.

Previously, some buffers were not correctly identified; D60006710 fixes that but causes test failures, so this PR patches the affected test to avoid them.
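A minimal sketch of the behavior being tested (the module and attribute names
here are hypothetical, not taken from this diff):

    import torch
    from torch.export import export
    from torch._export.utils import is_lifted_tensor_constant

    class AddConst(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # A plain tensor attribute (neither a Parameter nor a registered
            # buffer) is lifted as a constant placeholder at export time.
            self.const = torch.ones(2, 2)

        def forward(self, x):
            return x + self.const

    ep = export(AddConst(), (torch.randn(2, 2),))
    lifted = [
        n.name
        for n in ep.graph.nodes
        if n.op == "placeholder" and is_lifted_tensor_constant(ep, n)
    ]
    print(lifted)  # one lifted constant placeholder for self.const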
Reviewed By: BoyuanFeng Differential Revision: D60137883 fbshipit-source-id: d919e25525347a86afc6a895d2b0eb94d161b5ad --- exir/backend/test/test_partitioner.py | 28 ++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/exir/backend/test/test_partitioner.py b/exir/backend/test/test_partitioner.py index d492c291f3..3ee6202ae8 100644 --- a/exir/backend/test/test_partitioner.py +++ b/exir/backend/test/test_partitioner.py @@ -36,7 +36,7 @@ ) from executorch.extension.pytree import tree_flatten from torch._export import capture_pre_autograd_graph -from torch._export.utils import is_buffer, is_param +from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param from torch.export import export from torch.fx.passes.operator_support import any_chain @@ -235,7 +235,11 @@ def partition( self.assertEqual( len(owning_program.state_dict) + len(owning_program.constants), 3 ) - self.assertEqual(len(owning_program.graph_signature.buffers), 2) + self.assertEqual( + len(owning_program.graph_signature.buffers) + + len(owning_program.graph_signature.lifted_tensor_constants), + 2, + ) self.assertEqual(len(owning_program.graph_signature.parameters), 1) # Check Lowered Module Exported Program does not have any constant data @@ -290,6 +294,7 @@ def partition( if node.op == "placeholder" and ( is_param(edge_exported_program, node) or is_buffer(edge_exported_program, node) + or is_lifted_tensor_constant(edge_exported_program, node) ): delegation_tag = "tag0" node.meta["delegation_tag"] = delegation_tag @@ -324,7 +329,11 @@ def partition( ) delegated_ep = lower_module.original_module self.assertEqual(len(delegated_ep.state_dict) + len(delegated_ep.constants), 3) - self.assertEqual(len(delegated_ep.graph_signature.buffers), 2) + self.assertEqual( + len(delegated_ep.graph_signature.buffers) + + len(delegated_ep.graph_signature.lifted_tensor_constants), + 2, + ) self.assertEqual(len(delegated_ep.graph_signature.parameters), 1) # check exported program is still runnable @@ -380,7 +389,11 @@ def partition( self.assertEqual( len(owning_program.state_dict) + len(owning_program.constants), 2 ) - self.assertEqual(len(owning_program.graph_signature.buffers), 2) + self.assertEqual( + len(owning_program.graph_signature.buffers) + + len(owning_program.graph_signature.lifted_tensor_constants), + 2, + ) self.assertEqual(len(owning_program.graph_signature.parameters), 0) # Check Lowered Module Exported Program does not own any buffers @@ -503,6 +516,7 @@ def partition( if node.op == "placeholder" and ( is_param(edge_exported_program, node) or is_buffer(edge_exported_program, node) + or is_lifted_tensor_constant(edge_exported_program, node) ): delegation_tag = "tag0" node.meta["delegation_tag"] = delegation_tag @@ -519,9 +533,9 @@ def partition( with self.assertRaises(RuntimeError) as error: _ = edge.to_backend(PartitionerTagData()) - self.assertEqual( - "constant data node (b_const) is tagged with (tag0) but has user (aten_sub_tensor) which has tag (None)", - str(error.exception), + self.assertTrue( + "is tagged with (tag0) but has user (aten_sub_tensor) which has tag (None)" + in str(error.exception), ) def test_not_delegate_mutable_buffers(self) -> None: From 7f6a3416c4e183a57f023969004afec74a4ea480 Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Mon, 29 Jul 2024 18:15:47 -0700 Subject: [PATCH 13/75] Remove redundant generate_*_compile_spec funcs (#3869) Summary: tidying up a redundant wrapper function. 
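Call sites now build the spec directly. A representative sketch, with the
target configuration copied from the aot_arm_compiler diff below:

    from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder

    compile_spec = (
        ArmCompileSpecBuilder()
        .ethosu_compile_spec("ethos-u55-128")
        .set_permute_memory_format(True)
        .set_quantize_io(True)
        .build()
    )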
Pull Request resolved: https://github.com/pytorch/executorch/pull/3869

Reviewed By: mergennachin

Differential Revision: D58301772

Pulled By: digantdesai

fbshipit-source-id: cd906e2aa307408e7e2f877ace6544e91cd972fc
---
 backends/arm/arm_backend.py      | 37 --------------------------------
 examples/arm/aot_arm_compiler.py | 14 ++++++------
 2 files changed, 8 insertions(+), 43 deletions(-)

diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index 8ef5a79d3f..f187191fee 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -166,43 +166,6 @@ def get_intermediate_path(compile_spec: List[CompileSpec]) -> str:
     return None
 
 
-def generate_ethosu_compile_spec(
-    config: str,
-    permute_memory_to_nhwc: Optional[bool] = None,
-    quantize_io: Optional[bool] = None,
-    system_config: Optional[str] = None,
-    memory_mode: Optional[str] = None,
-    extra_flags: Optional[str] = None,
-    config_ini: Optional[str] = "Arm/vela.ini",
-) -> List[CompileSpec]:
-    return (
-        ArmCompileSpecBuilder()
-        .ethosu_compile_spec(
-            config,
-            system_config=system_config,
-            memory_mode=memory_mode,
-            extra_flags=extra_flags,
-            config_ini=config_ini,
-        )
-        .set_permute_memory_format(permute_memory_to_nhwc)
-        .set_quantize_io(quantize_io)
-        .build()
-    )
-
-
-def generate_tosa_compile_spec(
-    permute_memory_to_nhwc: Optional[bool] = None,
-    output_path: Optional[str] = None,
-) -> List[CompileSpec]:
-    return (
-        ArmCompileSpecBuilder()
-        .tosa_compile_spec()
-        .set_permute_memory_format(permute_memory_to_nhwc)
-        .dump_intermediate_artifacts_to(output_path)
-        .build()
-    )
-
-
 @final
 class ArmBackend(BackendDetails):
     @staticmethod
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 916a766f7c..f854a081fa 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -11,7 +11,8 @@ import logging
 
 import torch
-from executorch.backends.arm.arm_backend import generate_ethosu_compile_spec
+
+from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder
 from executorch.backends.arm.arm_partitioner import ArmPartitioner
 from executorch.backends.arm.quantizer.arm_quantizer import (
     ArmQuantizer,
@@ -212,12 +213,13 @@ def forward(self, x):
     if args.delegate is True:
         edge = edge.to_backend(
             ArmPartitioner(
-                generate_ethosu_compile_spec(
-                    "ethos-u55-128",
-                    permute_memory_to_nhwc=args.model_name
-                    in MODEL_NAME_TO_MODEL.keys(),
-                    quantize_io=True,
+                ArmCompileSpecBuilder()
+                .ethosu_compile_spec("ethos-u55-128")
+                .set_permute_memory_format(
+                    args.model_name in MODEL_NAME_TO_MODEL.keys()
                 )
+                .set_quantize_io(True)
+                .build()
             )
         )
     logging.debug(f"Lowered graph:\n{edge.exported_program().graph}")

From da24d185579911df48f031468a3842fee56f9cb2 Mon Sep 17 00:00:00 2001
From: Erik Lundell
Date: Mon, 29 Jul 2024 18:22:09 -0700
Subject: [PATCH 14/75] Add slice op to Arm backend (#4072)

Summary:
Implements the node visitor and tests.

Also implements an io_config in ArmQuantizer as a fallback: the io_config QuantizationConfig is applied to placeholders and outputs that are still missing annotation after all other annotation has been applied. The intended use is unit testing quantization of operations that have no quantization annotators.
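A minimal sketch of the fallback as used by the unit tests in this diff:

    from executorch.backends.arm.quantizer.arm_quantizer import (
        ArmQuantizer,
        get_symmetric_quantization_config,
    )

    # Quantize graph inputs and outputs that no operator annotator touched.
    quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())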
Change-Id: Iae7dc3f1dc2afe23776566f0e9904271cde0892a Pull Request resolved: https://github.com/pytorch/executorch/pull/4072 Reviewed By: manuelcandales Differential Revision: D59259968 Pulled By: digantdesai fbshipit-source-id: 253c4e9e6fd47bfe1fb18847edc33efa2a94f5d4 --- backends/arm/arm_partitioner.py | 1 + backends/arm/operators/__init__.py | 1 + backends/arm/operators/op_slice.py | 55 +++++++++ backends/arm/quantizer/arm_quantizer.py | 38 ++++++ backends/arm/quantizer/arm_quantizer_utils.py | 2 +- backends/arm/test/ops/test_clone.py | 22 ++-- backends/arm/test/ops/test_slice.py | 116 ++++++++++++++++++ backends/arm/test/ops/test_view.py | 22 ++-- 8 files changed, 234 insertions(+), 23 deletions(-) create mode 100644 backends/arm/operators/op_slice.py create mode 100644 backends/arm/test/ops/test_slice.py diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index 54cfafcc9b..56dac5d248 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -47,6 +47,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.avg_pool2d.default, exir_ops.edge.aten.sigmoid.default, exir_ops.edge.aten._softmax.default, + exir_ops.edge.aten.slice_copy.Tensor, exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.clone.default, diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index 79c507816d..e868b584cf 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -19,6 +19,7 @@ op_permute, op_quant, op_sigmoid, + op_slice, op_softmax, op_sub, op_view, diff --git a/backends/arm/operators/op_slice.py b/backends/arm/operators/op_slice.py new file mode 100644 index 0000000000..8d59835ff0 --- /dev/null +++ b/backends/arm/operators/op_slice.py @@ -0,0 +1,55 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class SliceVisitor(NodeVisitor): + target = "aten.slice_copy.Tensor" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + # aten.slice_copy supports slicing in 1d at a time. + # The arguments are dimension of slicing, start index and end index. + assert len(inputs) == 4 + input_node, dim, start, end = inputs + + # Translate and check parameters in Pytorch dim order. + shape = input_node.shape + dim = dim.number + end = (shape[dim] + end.number) % shape[dim] + size = end - start.number + assert size > 0 + assert size <= shape[dim] + + # Convert aten args to Tosa's start and size attributes and in TOSA dim order. 
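+        # Worked example with hypothetical shapes: x[1:3] on dim 0 of a
+        # contiguous (10, 10) tensor gives start=1, end=(10 + 3) % 10 = 3,
+        # size=2, so the TOSA SLICE gets start_attr=[1, 0], size_attr=[2, 10].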
+ attr = ts.TosaSerializerAttribute() + start_attr = [start.number if i == dim else 0 for i in input_node.dim_order] + size_attr = [size if i == dim else shape[i] for i in input_node.dim_order] + attr.SliceAttribute(start_attr, size_attr) + + tosa_graph.addOperator( + TosaOp.Op().SLICE, [input_node.name], [output.name], attr + ) diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index 3e1aceefe1..397ba68565 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -17,8 +17,11 @@ import torch import torch.nn.functional as F + +from executorch.backends.arm.quantizer import arm_quantizer_utils from executorch.backends.arm.quantizer.arm_quantizer_utils import ( convert_scalars_to_attrs, + mark_nodes_as_annotated, propagate_annotation, ) from executorch.backends.arm.quantizer.quantization_annotation import ( @@ -41,6 +44,10 @@ ) from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor from torch.ao.quantization.quantizer import QuantizationSpec, Quantizer +from torch.ao.quantization.quantizer.utils import ( + _annotate_input_qspec_map, + _annotate_output_qspec, +) from torch.fx import GraphModule, Node __all__ = [ @@ -263,6 +270,7 @@ class ArmQuantizer(Quantizer): def __init__(self) -> None: super().__init__() self.global_config: Optional[QuantizationConfig] = None + self.io_config: Optional[QuantizationConfig] = None self.module_type_config: Dict[Callable, Optional[QuantizationConfig]] = {} self.module_name_config: Dict[str, Optional[QuantizationConfig]] = {} @@ -294,6 +302,11 @@ def set_module_name( self.module_name_config[module_name] = quantization_config return self + def set_io(self, quantization_config): + """Set quantization_config for input and output nodes.""" + self.io_config = quantization_config + return self + def transform_for_annotation(self, model: GraphModule) -> GraphModule: """An initial pass for transforming the graph to prepare it for annotation. Currently transforms scalar values to tensor attributes. 
@@ -358,8 +371,33 @@ def _annotate_for_static_quantization_config( self.global_config, _get_not_module_type_or_name_filter(tp_list, module_name_list), ) + + if self.io_config: + self._annotate_io(model, self.io_config) + return model + def _annotate_io( + self, + model: GraphModule, + quantization_config: QuantizationConfig, + ): + for node in model.graph.nodes: + if arm_quantizer_utils.is_annotated(node): + continue + if node.op == "placeholder": + _annotate_output_qspec( + node, + quantization_config.get_output_act_qspec(), + ) + mark_nodes_as_annotated([node]) + if node.op == "output": + parent = node.all_input_nodes[0] + _annotate_input_qspec_map( + node, parent, quantization_config.get_input_act_qspec() + ) + mark_nodes_as_annotated([node]) + def validate(self, model: GraphModule) -> None: pass diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py index ee2844e668..89703f89b0 100644 --- a/backends/arm/quantizer/arm_quantizer_utils.py +++ b/backends/arm/quantizer/arm_quantizer_utils.py @@ -140,7 +140,7 @@ def is_share_obs_or_fq_op(op: Callable) -> bool: torch.ops.aten.adaptive_avg_pool2d.default, torch.ops.aten.view_copy.default, torch.ops.aten.view.default, - torch.ops.aten.slice_copy.Tensor, + torch.ops.aten.slice.Tensor, torch.ops.aten.flatten.using_ints, torch.ops.aten.dropout.default, ] diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index edfaafbcc2..2fc9b338cf 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -8,16 +8,20 @@ # Tests the clone op which copies the data of the input tensor (possibly with new data format) # -import logging import unittest from typing import Tuple import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester -from parameterized import parameterized -logger = logging.getLogger(__name__) +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized class TestSimpleClone(unittest.TestCase): @@ -53,13 +57,14 @@ def _test_clone_tosa_MI_pipeline( def _test_clone_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec(), ) - .quantize() + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.clone.default": 1}) .to_edge() @@ -72,13 +77,14 @@ def _test_clone_tosa_BI_pipeline( def _test_clone_tosa_u55_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, compile_spec=common.get_u55_compile_spec(), ) - .quantize() + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.clone.default": 1}) .to_edge() @@ -91,16 +97,10 @@ def _test_clone_tosa_u55_pipeline( def test_clone_tosa_MI(self, test_tensor: torch.Tensor): self._test_clone_tosa_MI_pipeline(self.Clone(), (test_tensor,)) - # Expected to fail since ArmQuantizer cannot quantize a Clone layer - # TODO MLETROCH-125 @parameterized.expand(Clone.test_parameters) - @unittest.expectedFailure def test_clone_tosa_BI(self, 
test_tensor: torch.Tensor): self._test_clone_tosa_BI_pipeline(self.Clone(), (test_tensor,)) - # Expected to fail since ArmQuantizer cannot quantize a Clone layer - # TODO MLETROCH-125 @parameterized.expand(Clone.test_parameters) - @unittest.expectedFailure def test_clone_u55_BI(self, test_tensor: torch.Tensor): self._test_clone_tosa_u55_pipeline(self.Clone(), (test_tensor,)) diff --git a/backends/arm/test/ops/test_slice.py b/backends/arm/test/ops/test_slice.py new file mode 100644 index 0000000000..a1c1e29cbc --- /dev/null +++ b/backends/arm/test/ops/test_slice.py @@ -0,0 +1,116 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from typing import Tuple + +import torch +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized + + +class TestSimpleSlice(unittest.TestCase): + + class Slice(torch.nn.Module): + + sizes = [(10), (10, 10), (10, 10, 10), ((1, 12, 10, 10))] + test_tensors = [(torch.ones(n),) for n in sizes] + + def forward(self, x: torch.Tensor): + if x.dim() == 1: + return x[3:-3] + elif x.dim() == 2: + return x[1:3, 3:5] + elif x.dim() == 3: + return x[0:7, 0:1, 0:8] + elif x.dim() == 4: + return x[:, 2:5, 3:5, 4:5] + + def _test_slice_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: torch.Tensor + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.slice.Tensor"]) + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_slice_copy"]) + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_slice_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor], permute: bool + ): + + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec( + permute_memory_to_nhwc=permute + ), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check(["torch.ops.aten.slice.Tensor"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_slice_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check(["torch.ops.aten.slice.Tensor"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Slice.test_tensors) + def test_slice_tosa_MI(self, tensor): + self._test_slice_tosa_MI_pipeline(self.Slice(), (tensor,)) + + @parameterized.expand(Slice.test_tensors[:2]) + def test_slice_nchw_tosa_BI(self, test_tensor: torch.Tensor): 
+ self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,), False) + + @parameterized.expand(Slice.test_tensors[2:]) + def test_slice_nhwc_tosa_BI(self, test_tensor: torch.Tensor): + self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,), True) + + # Fails during Vela compilation when trying to use a Tuple as a Named tuple, + # Could be Vela Issue, wait until Regor. + @parameterized.expand(Slice.test_tensors) + @unittest.expectedFailure + def test_slice_u55_BI(self, test_tensor: torch.Tensor): + self._test_slice_u55_BI_pipeline(self.Slice(), (test_tensor,)) diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index 5dcd1fe73f..7eda0d9cc2 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -8,16 +8,20 @@ # Tests the view op which changes the size of a Tensor without changing the underlying data. # -import logging import unittest from typing import Tuple import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester -from parameterized import parameterized -logger = logging.getLogger(__name__) +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized class TestSimpleView(unittest.TestCase): @@ -50,13 +54,14 @@ def _test_view_tosa_MI_pipeline( def _test_view_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec(), ) - .quantize() + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.view.default": 1}) .to_edge() @@ -69,13 +74,14 @@ def _test_view_tosa_BI_pipeline( def _test_view_u55_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, compile_spec=common.get_u55_compile_spec(), ) - .quantize() + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.view.default": 1}) .to_edge() @@ -88,16 +94,10 @@ def _test_view_u55_BI_pipeline( def test_view_tosa_MI(self, test_tensor: torch.Tensor): self._test_view_tosa_MI_pipeline(self.View(), (test_tensor,)) - # Expected to fail since ArmQuantizer cannot quantize a View layer. - # TODO MLETROCH-125 @parameterized.expand(View.test_parameters) - @unittest.expectedFailure def test_view_tosa_BI(self, test_tensor: torch.Tensor): self._test_view_tosa_BI_pipeline(self.View(), (test_tensor,)) - # Expected to fail since ArmQuantizer cannot quantize a View layer. - # TODO MLETROCH-125 @parameterized.expand(View.test_parameters) - @unittest.expectedFailure def test_view_u55_BI(self, test_tensor: torch.Tensor): self._test_view_u55_BI_pipeline(self.View(), (test_tensor,)) From 318a178e365cca0900f3f8c06783b7934726749e Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Mon, 29 Jul 2024 18:34:14 -0700 Subject: [PATCH 15/75] Delete hooks.h (#4448) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4448 There are no more implementations of this, and users have switched to using the PAL. 
Reviewed By: tarun292 Differential Revision: D60408510 fbshipit-source-id: 97efbb64bed64e4d981cb33be355254e9a1eb47e --- runtime/platform/hooks.h | 25 ------------------------- runtime/platform/profiler.cpp | 1 - runtime/platform/targets.bzl | 1 - 3 files changed, 27 deletions(-) delete mode 100644 runtime/platform/hooks.h diff --git a/runtime/platform/hooks.h b/runtime/platform/hooks.h deleted file mode 100644 index 28518ff788..0000000000 --- a/runtime/platform/hooks.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace torch { -namespace executor { - -// The stubs defined in this file are expected to be implemented/provided on -// a per platform basis. e.g. we'll have one for Linux running on x86 and -// another one maybe for a system running a RTOS on an ARM SoC. - -// This is expected to return a 64 bit value that contains the most -// granular time representation available on the system. It could be -// ticks, cycle count or time in microseconds etc. -// TODO(T157580075): delete this file and merge functionality into Platform.hå -uint64_t get_curr_time(void); - -} // namespace executor -} // namespace torch diff --git a/runtime/platform/profiler.cpp b/runtime/platform/profiler.cpp index 92a00bc1c4..ac2e8e187b 100644 --- a/runtime/platform/profiler.cpp +++ b/runtime/platform/profiler.cpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/runtime/platform/targets.bzl b/runtime/platform/targets.bzl index 2dfad34a59..42bb851e2c 100644 --- a/runtime/platform/targets.bzl +++ b/runtime/platform/targets.bzl @@ -99,7 +99,6 @@ def define_common_targets(): "platform.h", "system.h", "types.h", - "hooks.h", ], exported_deps = [ ":compiler", From db1c4d838b021bce225ad382926c1942c3eae425 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 29 Jul 2024 18:45:01 -0700 Subject: [PATCH 16/75] Add an option to turn on/off sdpa_with_kv_cache (#4444) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4444 As titled, need to test both options in export_llava.py. Reviewed By: tarun292, iseeyuan Differential Revision: D60406655 fbshipit-source-id: a423c65c6d134515e7399a8ef14ea54b76b34154 --- .github/workflows/pull.yml | 14 +++++++++++--- examples/models/llava/export_llava.py | 27 +++++++++++++++++++++++++-- examples/models/llava/model.py | 21 ++++++++++++++------- 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index bbbb976385..591a0328b7 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -210,11 +210,19 @@ jobs: bash examples/models/llava/install_requirements.sh # run export_llava.sh - python examples/models/llava/export_llava.py + python examples/models/llava/export_llava.py --use-sdpa-with-kv-cache --pte-name llava_custom_sdpa.pte # verify file exists - if [ ! -f "llava_combined_xnnpack.pte" ]; then - echo "llava_combined_xnnpack.pte not found!" + if [ ! -f "llava_custom_sdpa.pte" ]; then + echo "llava_custom_sdpa.pte not found!" + exit 1 + fi + + python examples/models/llava/export_llava.py --no-use-sdpa-with-kv-cache --pte-name llava.pte + + # verify file exists + if [ ! -f "llava.pte" ]; then + echo "llava.pte not found!" 
exit 1 fi diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 8c19cb977e..f57823a90a 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -4,6 +4,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import logging +from argparse import ArgumentParser, BooleanOptionalAction + import torch from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( XnnpackDynamicallyQuantizedPartitioner, @@ -30,6 +33,9 @@ from torch.export import Dim from torch.nn.attention import SDPBackend +FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" +logging.basicConfig(level=logging.INFO, format=FORMAT) + class LlavaEdgeManager(LLMEdgeManager): def capture_pre_autograd_graph(self) -> "LlavaEdgeManager": @@ -155,7 +161,23 @@ def export_token_embedding(llava, prompt): def main(): - llava_model = LlavaModel() + parser = ArgumentParser() + parser.add_argument( + "--use-sdpa-with-kv-cache", + default=True, + action=BooleanOptionalAction, + help="Use sdpa_with_kv_cache custom op in LLava text model.", + ) + parser.add_argument( + "--pte-name", + default="llava_combined_xnnpack.pte", + help="Name of the exported ExecuTorch program.", + ) + args = parser.parse_args() + logging.info( + f"Exporting Llava model to ExecuTorch with sdpa_with_kv_cache: {args.use_sdpa_with_kv_cache}" + ) + llava_model = LlavaModel(use_sdpa_with_kv_cache_op=args.use_sdpa_with_kv_cache) llava = llava_model.get_eager_model() prompt_before_image, resized, prompt_after_image = ( @@ -193,8 +215,9 @@ def main(): } ).to_executorch() - with open("llava_combined_xnnpack.pte", "wb") as f: + with open(args.pte_name, "wb") as f: executorch_program.write_to_file(f) + logging.info(f"Exported ExecuTorch program to {args.pte_name}") if __name__ == "__main__": diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py index 31270b9042..35831192b4 100644 --- a/examples/models/llava/model.py +++ b/examples/models/llava/model.py @@ -62,8 +62,10 @@ def __init__( llava_model: LlavaMetaForCausalLM, image_processor: CLIPVisionTower, config: PreprocessConfig, + use_sdpa_with_kv_cache_op: bool = True, ): super().__init__() + self.use_sdpa_with_kv_cache_op = use_sdpa_with_kv_cache_op self.config = config self.model_ = llava_model self.text_model_args = ModelArgs( @@ -73,7 +75,7 @@ def __init__( max_batch_size=1, # doesn't work with default batch size 32 ffn_dim_multiplier=1, # TODO: a hack to make rotary embedding happy enable_dynamic_shape=True, # allow parallel prefill - use_sdpa_with_kv_cache_op=True, # use sdpa_with_kv_cache op + use_sdpa_with_kv_cache_op=use_sdpa_with_kv_cache_op, # use sdpa_with_kv_cache op use_hf_rope=True, ) self.embed_tokens = nn.Embedding( @@ -83,7 +85,8 @@ def __init__( ) self.text_model = Transformer(self.text_model_args) # use custom op for SDPA. 
- self.text_model = replace_sdpa_with_custom_op(self.text_model) + if use_sdpa_with_kv_cache_op: + self.text_model = replace_sdpa_with_custom_op(self.text_model) # load state dict self.text_model.load_state_dict( state_dict=self._translate_state_dict_for_text_model(), @@ -273,7 +276,8 @@ def get_conv_mode(model_name: str) -> str: class LlavaModel(EagerModelBase): - def __init__(self): + def __init__(self, use_sdpa_with_kv_cache_op=True): + self.use_sdpa_with_kv_cache_op = use_sdpa_with_kv_cache_op self.model_path = "liuhaotian/llava-v1.5-7b" self.tokenizer, self.model, self.image_processor, context_len = ( load_pretrained_model( @@ -316,7 +320,12 @@ def __init__(self): self.resized_image = None def get_eager_model(self): - model = Llava(self.model, self.image_processor, self.config) + model = Llava( + self.model, + self.image_processor, + self.config, + self.use_sdpa_with_kv_cache_op, + ) model.to(dtype=torch.float32) return model @@ -368,8 +377,6 @@ def _get_image_dynamic_shapes(self): return dynamic_shapes def _get_prompt_dynamic_shapes(self): - dim = torch.export.Dim( - "token_dim", min=1, max=self.model.config.max_position_embeddings - 1 - ) + dim = torch.export.Dim("token_dim", min=2, max=2048) text_model_dynamic_shapes = ({0: 1}, {1: dim}) return text_model_dynamic_shapes From 1e143339d463262694244c56f2fdb698ec28e3c5 Mon Sep 17 00:00:00 2001 From: Matthias Cremon Date: Mon, 29 Jul 2024 19:44:46 -0700 Subject: [PATCH 17/75] Add exportable baby llama example (#4345) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4345 Add a small LLaMa model, based on the babyllama paper. Note that this test case is only one layer by default, and the number of layers can be adjusted in the test. Removed some pyre changes that broke the OSS AoT export, and added some required passes and operators. 
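Scaling the example up only requires changing the model args. A sketch reusing
the helpers from the new script (the layer count here is an arbitrary
illustration, not a tested configuration):

    from executorch.backends.cadence.aot.ops_registrations import *  # noqa

    import torch

    from executorch.backends.cadence.aot.export_example import export_model
    from executorch.examples.models.llama2.llama_transformer import (
        ModelArgs,
        Transformer,
    )

    args = ModelArgs(
        dim=512, vocab_size=512, hidden_dim=1024, n_heads=8, n_layers=4
    )
    model = Transformer(args)
    example_inputs = (torch.randint(0, 10, [1, 64], dtype=torch.int64),)
    export_model(model, example_inputs)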
Reviewed By: dulinriley Differential Revision: D60073137 fbshipit-source-id: 8379296ad0aa4099b09d033b33479165d7c7c5c9 --- backends/cadence/aot/TARGETS | 4 +- backends/cadence/aot/compiler.py | 20 ++-- backends/cadence/aot/functions.yaml | 20 ++++ backends/cadence/aot/passes.py | 103 +++++++++++++++++- backends/cadence/aot/quantizer/TARGETS | 1 - backends/cadence/aot/quantizer/quantizer.py | 25 ++--- .../reference/operators/CMakeLists.txt | 6 +- .../operators/quantized_matmul_out.cpp | 42 +++---- examples/cadence/models/babyllama.py | 42 +++++++ 9 files changed, 212 insertions(+), 51 deletions(-) create mode 100644 examples/cadence/models/babyllama.py diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index bd4ec660a6..79646c1293 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -28,13 +28,13 @@ python_library( "compiler.py", ], deps = [ - "fbsource//third-party/pypi/pyre-extensions:pyre-extensions", ":passes", ":utils", "//caffe2:torch", "//executorch/backends/cadence/aot/quantizer:fusion_pass", "//executorch/backends/cadence/aot/quantizer:quantizer", "//executorch/backends/transforms:decompose_sdpa", + "//executorch/backends/transforms:remove_clone_ops", "//executorch/exir:lib", ], ) @@ -49,5 +49,7 @@ python_library( "//caffe2:torch", "//executorch/exir:pass_base", "//executorch/exir/dialects:lib", + "//executorch/exir/passes:lib", + "//executorch/exir/passes:spec_prop_pass", ], ) diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index 302252c42a..39511ae917 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -11,23 +11,23 @@ import torch from executorch.backends.cadence.aot.passes import ( + InitializePipeline, + RemoveNopExpandOpPass, RemoveZeroSizedCatArgsPass, + ReplaceLogicalNotBooleanWhereWithWherePass, ReplacePT2DequantWithCadenceDequantPass, ReplacePT2QuantWithCadenceQuantPass, ReplaceScalarTensorWithFullPass, ReplaceSqueezeAndUnsqueezeWithViewPass, ) from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion -from executorch.backends.cadence.aot.quantizer.quantizer import ( - CadenceAtenQuantizer, - CadenceQuantizer, -) +from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer from executorch.backends.cadence.aot.utils import model_is_quantized from executorch.backends.transforms.decompose_sdpa import ( DecomposeScaledDotProductAttention, ) +from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge -from pyre_extensions import assert_is_instance from torch._export import capture_pre_autograd_graph from torch.ao.quantization.pt2e.export_utils import model_is_exported from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -63,10 +63,8 @@ def quantize_pt2( converted_model = convert_pt2e(prepared_model) # Get patterns and apply fusion of dq -> op -> q to qop - patterns = [ - assert_is_instance(q, CadenceAtenQuantizer).pattern - for q in quantizer.quantizers - ] + # pyre-ignore[16]: no attribute + patterns = [q.pattern for q in quantizer.quantizers] QuantFusion(patterns)(converted_model) return converted_model @@ -148,8 +146,12 @@ def export_to_cadence( # Run a couple required passes for quant/dequant ops cadence_program_manager = edge_program_manager.transform( [ + InitializePipeline(), RemoveZeroSizedCatArgsPass(), + ReplaceLogicalNotBooleanWhereWithWherePass(), ReplaceScalarTensorWithFullPass(), + 
RemoveCloneOpsTransform(), + RemoveNopExpandOpPass(), ReplaceSqueezeAndUnsqueezeWithViewPass(), ReplacePT2QuantWithCadenceQuantPass(), ReplacePT2DequantWithCadenceDequantPass(), diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index f79d5f870d..dbfe1e3639 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -62,16 +62,31 @@ - arg_meta: null kernel_name: torch::executor::full_out +- op: mean.out + kernels: + - arg_meta: null + kernel_name: torch::executor::mean_dim_out + - op: mul.out kernels: - arg_meta: null kernel_name: torch::executor::mul_out +- op: mul.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::mul_scalar_out + - op: permute_copy.out kernels: - arg_meta: null kernel_name: torch::executor::permute_copy_out +- op: rsqrt.out + kernels: + - arg_meta: null + kernel_name: torch::executor::rsqrt_out + - op: sigmoid.out kernels: - arg_meta: null @@ -134,3 +149,8 @@ kernels: - arg_meta: null kernel_name: impl::reference::quantized_relu_out + +func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_matmul_out diff --git a/backends/cadence/aot/passes.py b/backends/cadence/aot/passes.py index ca8a44f00c..db419bfb5e 100644 --- a/backends/cadence/aot/passes.py +++ b/backends/cadence/aot/passes.py @@ -4,18 +4,19 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import Any, Dict, Tuple +# pyre-strict + +from typing import Any, cast, Dict, Sequence, Tuple import torch from executorch.backends.cadence.aot.utils import get_edge_overload_packet from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue +from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue +from executorch.exir.passes import dead_code_elimination_pass +from executorch.exir.passes.spec_prop_pass import SpecPropPass from torch._subclasses import FakeTensor from torch.utils._pytree import tree_map_only - -# pyre-strict - # Similar to what's done in executorch/exir/pass_base.py Argument = Any # pyre-ignore @@ -173,3 +174,95 @@ def call_operator( init_args[0] = new_args args = tuple(args) return super().call_operator(op, args, kwargs, meta) + + +class RemoveNopExpandOpPass(ExportPass): + """ + For an expand op, if the operator shape matches the expand shape, then the + expand is a nop. + """ + + def call_operator( + self, + op, # pyre-ignore + args: tuple[Argument, ...], + kwargs: dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if get_edge_overload_packet(op) not in { + exir_ops.edge.aten.expand_copy, + exir_ops.edge.aten.expand, + }: + return super().call_operator(op, args, kwargs, meta) + + # Parse the args, and check for nop condition + arg0 = cast(ProxyValue, args[0]) + arg1 = cast(Sequence[int], args[1]) + in_tensor = arg0.to_tensor() + if list(in_tensor.shape) == list(arg1): + return arg0 + + return super().call_operator(op, args, kwargs, meta) + + +class ReplaceLogicalNotBooleanWhereWithWherePass(ExportPass): + """ + A where op with a logical_not and a boolean tensor can be replaced + by a where op with flipped inputs and the initial boolean tensor. 
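+    E.g. where(logical_not(mask), x, y) becomes where(mask, y, x).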
+ """ + + def replace_logical_nop_where_with_where( + self, graph_module: torch.fx.GraphModule + ) -> None: + graph = graph_module.graph + for node in graph.nodes: + # We are only interested in where nodes + if node.target != exir_ops.edge.aten.where.self: + continue + + # If the third arg is not a logical_not, bail. + if node.args[0].target != exir_ops.edge.aten.logical_not.default: + continue + + # Get the third arg node and its input + logical_not_node = node.args[0] + logical_not_input_tensor = ( + logical_not_node.args[0].to_tensor() + if isinstance(logical_not_node.args[0], ProxyValue) + else logical_not_node.args[0] + ) + + # If the logical_not input is not a boolean tensor, bail. + if logical_not_input_tensor.meta["spec"].dtype != torch.bool: + continue + + # Replace the where op with another one, flipping the inputs and using the boolean + # tensor from logical_not. + with graph.inserting_before(node): + linear_node = graph.call_function( + exir_ops.edge.aten.where.self, + args=(logical_not_node.args[0], node.args[2], node.args[1]), + ) + # Replace all the uses + node.replace_all_uses_with(linear_node) + + graph_module.recompile() + graph_module.graph.eliminate_dead_code() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + self.replace_logical_nop_where_with_where(graph_module) + result = super().call(graph_module) + return result + + +class InitializePipeline(ExportPass): + """ + Initialize the Jarvis pipeline. This should invariably be the first pass to + run. + """ + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + dead_code_elimination_pass(graph_module) + result = SpecPropPass()(graph_module) + assert result is not None + return result diff --git a/backends/cadence/aot/quantizer/TARGETS b/backends/cadence/aot/quantizer/TARGETS index 8b3449cd85..6290626216 100644 --- a/backends/cadence/aot/quantizer/TARGETS +++ b/backends/cadence/aot/quantizer/TARGETS @@ -31,7 +31,6 @@ python_library( ], typing = True, deps = [ - "fbsource//third-party/pypi/pyre-extensions:pyre-extensions", ":patterns", ":utils", "//caffe2:torch", diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py index 4cd3c6bfb4..51bace9168 100644 --- a/backends/cadence/aot/quantizer/quantizer.py +++ b/backends/cadence/aot/quantizer/quantizer.py @@ -26,7 +26,6 @@ is_annotated, no_outside_users, ) -from pyre_extensions import assert_is_instance from torch import fx @@ -100,14 +99,11 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: continue for output, *custom_spec in anchors.output: - assert_is_instance(output, fx.Node).meta["quantization_annotation"] = ( - QuantizationAnnotation( - # pyre-ignore[6]: incompatible parameter type - output_qspec=( - custom_spec[0] if custom_spec else output_act_qspec - ), - _annotated=True, - ) + # pyre-ignore[16]: no attribute + output.meta["quantization_annotation"] = QuantizationAnnotation( + # pyre-ignore[6]: incompatible parameter type + output_qspec=(custom_spec[0] if custom_spec else output_act_qspec), + _annotated=True, ) def annotate_inputs( @@ -118,16 +114,17 @@ def annotate_inputs( spec: Optional[QuantizationSpec], ) -> None: for node, idx, *custom_spec in inputs: - _node = assert_is_instance(node, fx.Node) - annotation = _node.meta.get( + # pyre-ignore[16]: no attribute + annotation = node.meta.get( "quantization_annotation", QuantizationAnnotation(_annotated=True), ) - # pyre-ignore[6]: incompatible parameter type - annotation.input_qspec_map[_node.args[idx]] = ( 
+ # pyre-ignore[16]: no attribute + annotation.input_qspec_map[node.args[idx]] = ( custom_spec[0] if custom_spec else spec ) - _node.meta["quantization_annotation"] = annotation + # pyre-ignore[16]: no attribute + node.meta["quantization_annotation"] = annotation annotate_inputs(anchors.inputs, input_act_qspec) annotate_inputs(anchors.weights, weight_qspec) diff --git a/backends/cadence/reference/operators/CMakeLists.txt b/backends/cadence/reference/operators/CMakeLists.txt index c22dc0c997..c81e934850 100644 --- a/backends/cadence/reference/operators/CMakeLists.txt +++ b/backends/cadence/reference/operators/CMakeLists.txt @@ -32,12 +32,15 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mean.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_rsqrt.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp" @@ -60,7 +63,8 @@ target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/.. add_library( custom_ops "quantized_linear_out.cpp" "quantized_conv_out.cpp" "quantized_relu_out.cpp" "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp") + "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" + "quantized_matmul_out.cpp") target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} ${_common_include_directories}) diff --git a/backends/cadence/reference/operators/quantized_matmul_out.cpp b/backends/cadence/reference/operators/quantized_matmul_out.cpp index 95df35caba..49dd222a96 100644 --- a/backends/cadence/reference/operators/quantized_matmul_out.cpp +++ b/backends/cadence/reference/operators/quantized_matmul_out.cpp @@ -13,6 +13,9 @@ namespace impl { namespace reference { namespace native { +using Tensor = exec_aten::Tensor; +using RuntimeContext = torch::executor::RuntimeContext; + // The quantized matmul. The quantized matmul accumulates in a wider register, // whose type is TA. 
template < @@ -50,27 +53,32 @@ __attribute__((noinline)) void qmatmul( } } -template +template void inline _typed_quantized_matmul( const Tensor& X, int64_t X_zero_point, const Tensor& Y, int64_t Y_zero_point, - const c10::optional& bias, + const exec_aten::optional& bias, int64_t out_multiplier, int64_t out_shift, int64_t out_zero_point, bool transposed, Tensor& out) { - ctype* __restrict__ out_data = out.mutable_data_ptr(); - const ctype* __restrict__ X_data = X.const_data_ptr(); - const ctype* __restrict__ Y_data = Y.const_data_ptr(); + size_t batch_size = getLeadingDims(X, X.dim() - 2); + size_t leading_dim = X.size(X.dim() - 2); + size_t out_dim = Y.size(Y.dim() - 1 - transposed); + size_t in_dim = X.size(X.dim() - 1); + + T* __restrict__ out_data = out.mutable_data_ptr(); + const T* __restrict__ X_data = X.const_data_ptr(); + const T* __restrict__ Y_data = Y.const_data_ptr(); for (size_t i = 0; i < batch_size; ++i) { - const ctype* x = X_data + i * leading_dim * in_dim; - const ctype* y = Y_data + i * in_dim * out_dim; - ctype* z = out_data + i * leading_dim * out_dim; + const T* x = X_data + i * leading_dim * in_dim; + const T* y = Y_data + i * in_dim * out_dim; + T* z = out_data + i * leading_dim * out_dim; if (transposed) { - qmatmul( + qmatmul( z, static_cast(out_multiplier), static_cast(out_shift), @@ -83,7 +91,7 @@ void inline _typed_quantized_matmul( in_dim, out_dim); } else { - qmatmul( + qmatmul( z, static_cast(out_multiplier), static_cast(out_shift), @@ -101,24 +109,18 @@ void inline _typed_quantized_matmul( } void quantized_matmul_out( + RuntimeContext& ctx, const Tensor& X, int64_t X_zero_point, const Tensor& Y, int64_t Y_zero_point, - const c10::optional& bias, + const exec_aten::optional& bias, int64_t out_multiplier, int64_t out_shift, int64_t out_zero_point, bool transposed, Tensor& out) { - (void)bias; - - size_t batch_size = getLeadingDims(X, X.dim() - 2); - size_t leading_dim = X.size(X.dim() - 2); - size_t out_dim = Y.size(Y.dim() - 1 - transposed); - size_t in_dim = X.size(X.dim() - 1); - - if (out.ScalarType() == at::ScalarType::Byte) { + if (out.scalar_type() == at::ScalarType::Byte) { _typed_quantized_matmul( X, X_zero_point, @@ -130,7 +132,7 @@ void quantized_matmul_out( out_zero_point, transposed, out); - } else if (out.ScalarType() == at::ScalarType::Char) { + } else if (out.scalar_type() == at::ScalarType::Char) { _typed_quantized_matmul( X, X_zero_point, diff --git a/examples/cadence/models/babyllama.py b/examples/cadence/models/babyllama.py new file mode 100644 index 0000000000..603eb5f3d9 --- /dev/null +++ b/examples/cadence/models/babyllama.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Example script for exporting simple models to flatbuffer + +import logging + +from executorch.backends.cadence.aot.ops_registrations import * # noqa + +import torch + +from executorch.backends.cadence.aot.export_example import export_model + +from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer + + +FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" +logging.basicConfig(level=logging.INFO, format=FORMAT) + + +def main() -> None: + args = ModelArgs( + dim=512, + vocab_size=512, + hidden_dim=1024, + n_heads=8, + # use_kv_cache=True, + n_layers=1, + ) + seq = 64 + b = 1 + model = Transformer(args) + example_inputs = (torch.randint(0, 10, [b, seq], dtype=torch.int64),) + + export_model(model, example_inputs) + + +if __name__ == "__main__": + main() From 3d5a1491a75509856884f01ecc13f3710267e99c Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Mon, 29 Jul 2024 20:43:34 -0700 Subject: [PATCH 18/75] Add FVP tests for linear op (#4393) Summary: The tests ran without modifications. Change-Id: I6bdae84c17b5da47935035b0a46696881c085c44 Pull Request resolved: https://github.com/pytorch/executorch/pull/4393 Reviewed By: cccclai Differential Revision: D60403878 Pulled By: digantdesai fbshipit-source-id: 638e25c960fb94a9bfc5e95379b46a75740c0285 --- backends/arm/test/ops/test_linear.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 0e6747fe27..61117ad7fa 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -155,7 +155,7 @@ def _test_linear_tosa_BI_pipeline( def _test_linear_tosa_u55_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -169,8 +169,12 @@ def _test_linear_tosa_u55_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + @parameterized.expand(test_data_suite_rank1 + test_data_suite_rank4) def test_linear_tosa_MI( self, From c659b9c23ef790f62db96e38d982433e9d9358e9 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Mon, 29 Jul 2024 20:49:00 -0700 Subject: [PATCH 19/75] Add flakyness mark to conv BI test (#4390) Summary: Change-Id: I391003f8480283872fdc0566e489bb9bb3926c6f Pull Request resolved: https://github.com/pytorch/executorch/pull/4390 Reviewed By: cccclai Differential Revision: D60403912 Pulled By: digantdesai fbshipit-source-id: 04043b2c3f03aac03be19b467e70138d082481e6 --- backends/arm/test/ops/test_depthwise_conv.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 7eacbac432..8389b423e5 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -9,6 +9,8 @@ from typing import Tuple +import pytest + import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.ops.test_conv import Conv2d @@ -189,7 +191,9 @@ def _test_dw_conv2d_u55_BI_pipeline( def test_dw_conv2d_tosa_MI(self, test_name, model): self._test_dw_conv2d_tosa_MI_pipeline(model, model.get_inputs()) + # TODO: Investigate flakyness (MLTORCH-307) @parameterized.expand(testsuite) + @pytest.mark.flaky(reruns=3) def test_dw_conv2d_tosa_BI(self, test_name, model): 
self._test_dw_conv2d_tosa_BI_pipeline(model, model.get_inputs()) From 38724d072dcc0285776212a44113390ddd73d3c6 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Mon, 29 Jul 2024 20:51:52 -0700 Subject: [PATCH 20/75] Add test debug features (#4144) Summary: - Functions to get unbuilt default CompileSpec to change it before passing to ArmTester - dump_operator_distribution to print a list with all operators in the graph and the number of times they appear. - dump_dtype_distribution to print a list with the dtype of all placeholders and the number of times they appear. - Cast data in tensor to float and correct shape if tensor dtype is FP32 when dumping Partition artifact. Signed-off-by: Erik Lundell Change-Id: I7196527d060ba182b8ada8e48535d4bb7681ab68 Change-Id: I4678d19a40d5ee6ccab68798fdec7090db0eb8f8 Pull Request resolved: https://github.com/pytorch/executorch/pull/4144 Reviewed By: cccclai Differential Revision: D59568414 Pulled By: digantdesai fbshipit-source-id: 1c5928b6c6d1969ad497e0dd29e2d28fcb441cde --- backends/arm/test/common.py | 29 +++++- backends/arm/test/misc/test_debug_feats.py | 25 +++++ backends/arm/test/runner_utils.py | 16 +++ backends/arm/test/tester/arm_tester.py | 108 +++++++++++++++++++-- 4 files changed, 166 insertions(+), 12 deletions(-) diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index 906164aac3..f85fd1f2da 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -89,17 +89,26 @@ def get_tosa_compile_spec(permute_memory_to_nhwc=True, custom_path=None): """ Default compile spec for TOSA tests. """ + return get_tosa_compile_spec_unbuilt(permute_memory_to_nhwc, custom_path).build() + + +def get_tosa_compile_spec_unbuilt( + permute_memory_to_nhwc=False, custom_path=None +) -> ArmCompileSpecBuilder: + """Get the ArmCompileSpecBuilder for the default TOSA tests, to modify + the compile spec before calling .build() to finalize it. + """ intermediate_path = custom_path or tempfile.mkdtemp(prefix="arm_tosa_") if not os.path.exists(intermediate_path): os.makedirs(intermediate_path, exist_ok=True) - compile_spec = ( + compile_spec_builder = ( ArmCompileSpecBuilder() .tosa_compile_spec() .set_permute_memory_format(permute_memory_to_nhwc) .dump_intermediate_artifacts_to(intermediate_path) - .build() ) - return compile_spec + + return compile_spec_builder def get_u55_compile_spec( @@ -108,7 +117,20 @@ def get_u55_compile_spec( """ Default compile spec for Ethos-U55 tests. """ + return get_u55_compile_spec_unbuilt( + permute_memory_to_nhwc, quantize_io=quantize_io, custom_path=custom_path + ).build() + + +def get_u55_compile_spec_unbuilt( + permute_memory_to_nhwc=False, quantize_io=False, custom_path=None +) -> ArmCompileSpecBuilder: + """Get the ArmCompileSpecBuilder for the default TOSA tests, to modify + the compile spec before calling .build() to finalize it. 
+ """ artifact_path = custom_path or tempfile.mkdtemp(prefix="arm_u55_") + if not os.path.exists(artifact_path): + os.makedirs(artifact_path, exist_ok=True) compile_spec = ( ArmCompileSpecBuilder() .ethosu_compile_spec( @@ -120,6 +142,5 @@ def get_u55_compile_spec( .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) .set_permute_memory_format(permute_memory_to_nhwc) .dump_intermediate_artifacts_to(artifact_path) - .build() ) return compile_spec diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index 9a0702c900..bf2a3aebd2 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -120,3 +120,28 @@ def test_numerical_diff_prints(self): pass # Implicit pass test else: self.fail() + + +class TestDumpOperatorsAndDtypes(unittest.TestCase): + def test_dump_ops_and_dtypes(self): + model = Linear(20, 30) + ( + ArmTester( + model, + example_inputs=model.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .dump_dtype_distribution() + .dump_operator_distribution() + .export() + .dump_dtype_distribution() + .dump_operator_distribution() + .to_edge() + .dump_dtype_distribution() + .dump_operator_distribution() + .partition() + .dump_dtype_distribution() + .dump_operator_distribution() + ) + # Just test that there are no execeptions. diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 19d76e13b4..58c99a9201 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -510,4 +510,20 @@ def dbg_tosa_fb_to_json(tosa_fb: bytes) -> Dict: with open(os.path.join(tmp, "output.json"), "r") as f: json_out = json.load(f) + # Cast float tensors to proper dtype. + try: + for region in json_out["regions"]: + for block in region["blocks"]: + for tensor in block["tensors"]: + if "data" in tensor: + if tensor["type"] == "FP32": + data = np.array(tensor["data"]) + data = data.astype(np.int8) + data = np.frombuffer(data, dtype=np.float32) + data = data.reshape(tensor["shape"]) + tensor["data"] = data + except Exception: + # This is just nice-to-have if it works, don't care if it fails. + pass + return json_out diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 97ab67b3d1..be5ea7dd71 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -4,7 +4,10 @@ # LICENSE file in the root directory of this source tree. 
import logging -from typing import Any, List, Literal, Optional, Tuple + +from collections import Counter +from pprint import pformat +from typing import Any, List, Literal, Optional, Tuple, Union import executorch.backends.xnnpack.test.tester.tester as tester @@ -31,6 +34,7 @@ from executorch.backends.xnnpack.test.tester import Tester from executorch.exir import EdgeCompileConfig from executorch.exir.backend.compile_spec_schema import CompileSpec +from torch.fx import Graph logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -39,7 +43,6 @@ class Partition(tester.Partition): def dump_artifact(self, path_to_dump: Optional[str]): super().dump_artifact(path_to_dump) - from pprint import pformat to_print = None for spec in self.graph_module.lowered_module_0.compile_specs: @@ -55,12 +58,7 @@ def dump_artifact(self, path_to_dump: Optional[str]): to_print = f"\n Vela command stream: \n{to_print}" break assert to_print is not None, "No TOSA nor Vela compile spec found" - - if path_to_dump: - with open(path_to_dump, "a") as fp: - fp.write(to_print) - else: - print(to_print) + _dump_str(to_print, path_to_dump) class Serialize(tester.Serialize): @@ -272,6 +270,66 @@ def run_method_and_compare_outputs( return self + def get_graph(self, stage: str | None = None) -> Graph: + if stage is None: + stage = self.cur + artifact = self.get_artifact(stage) + if self.cur == self.stage_name(tester.ToEdge) or self.cur == self.stage_name( + Partition + ): + graph = artifact.exported_program().graph + elif self.cur == self.stage_name(tester.Export) or self.cur == self.stage_name( + tester.Quantize + ): + graph = artifact.graph + else: + raise RuntimeError( + "Can only get a graph from Quantize, ToEdge, Export, and Partition stages." + ) + + return graph + + def dump_operator_distribution( + self, path_to_dump: Optional[str] = None + ) -> ArmQuantizer: + """Dump a dictionary with {operator: operator count} for the operators in the + graph of the current stage. + + Returns self for daisy-chaining. + """ + graph = self.get_graph(self.cur) + op_dist = _get_operator_distribution(graph) + to_print = self.cur + " operators: " + _format_dict(op_dist) + "\n" + _dump_str(to_print, path_to_dump) + return self + + def dump_dtype_distribution( + self, path_to_dump: Optional[str] = None + ) -> ArmQuantizer: + """Dump a dictionary with {dtype: dtype count} for the dtypes of the nodes in the + graph of the current stage. + + Returns self for daisy-chaining. + """ + graph = self.get_graph(self.cur) + op_dist = _get_dtype_distribution(graph) + to_print = self.cur + " placeholder data types: " + _format_dict(op_dist) + "\n" + _dump_str(to_print, path_to_dump) + return self + + @staticmethod + def _calculate_reference_output( + module: Union[torch.fx.GraphModule, torch.nn.Module], inputs + ) -> torch.Tensor: + """ + Note: I'd prefer to use the base class method here, but since it use the + exported program, I can't. The partitioner stage clears the state_dict + of the exported program, which causes an issue when evaluating the + module. + """ + + return module.forward(*inputs) + def transpose_data_format( self, data: Tuple[torch.Tensor], to: Literal["NHWC", "NCHW"] ): @@ -331,3 +389,37 @@ def _compare_outputs( ) logger.error(f"{atol=}, {rtol=}, {qtol=}") raise e + + +def _get_dtype_distribution(graph: Graph) -> dict: + """Counts the occurences of placeholder data types in a graph. 
+ The result is a dict {'data type':'number of placeholders'} + """ + return Counter( + [ + node.meta["val"].dtype + for node in list(graph.nodes) + if node.op == "placeholder" + ] + ) + + +def _get_operator_distribution(graph: Graph) -> dict[str, int]: + """Counts the occurences of operator names in a graph. + The result is a dict {'operator name':'number of nodes'} + """ + return Counter( + [str(node.target) for node in list(graph.nodes) if node.op == "call_function"] + ) + + +def _dump_str(to_print: str, path_to_dump: Optional[str] = None): + if path_to_dump: + with open(path_to_dump, "a") as fp: + fp.write(to_print) + else: + print(to_print) + + +def _format_dict(to_print: dict) -> str: + return pformat(to_print, compact=True, indent=1) From 3c25aec9cae58de560e0a807efbb1939ab89f2fe Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Mon, 29 Jul 2024 20:57:02 -0700 Subject: [PATCH 21/75] Add docstrings to all unittest.TestCase:s (#4391) Summary: This avoids logging a long default docstring which makes the output of test collection a lot easier to read and thus debug. I also updated the mv2net weight parameter as the current way of calling it is deprecated. This also removes a warning from test collection. Change-Id: I05f000e9ef42ab9d63f234539da3309f69ccbe16 Pull Request resolved: https://github.com/pytorch/executorch/pull/4391 Reviewed By: cccclai Differential Revision: D60404067 Pulled By: digantdesai fbshipit-source-id: 4ad44eb35887278cacd7e2c0ac93114b125e72af --- backends/arm/test/misc/test_debug_feats.py | 4 ++++ backends/arm/test/models/test_mobilenet_v2_arm.py | 3 ++- backends/arm/test/ops/test_add.py | 2 ++ backends/arm/test/ops/test_avg_pool.py | 2 ++ backends/arm/test/ops/test_batch_norm.py | 2 ++ backends/arm/test/ops/test_clone.py | 2 ++ backends/arm/test/ops/test_conv.py | 2 ++ backends/arm/test/ops/test_conv_combos.py | 2 ++ backends/arm/test/ops/test_depthwise_conv.py | 3 +++ backends/arm/test/ops/test_div.py | 2 ++ backends/arm/test/ops/test_full.py | 2 ++ backends/arm/test/ops/test_linear.py | 1 + backends/arm/test/ops/test_mean_dim.py | 2 ++ backends/arm/test/ops/test_softmax.py | 2 ++ backends/arm/test/ops/test_view.py | 2 ++ backends/arm/test/passes/test_tag_io_quant_pass.py | 1 + 16 files changed, 33 insertions(+), 1 deletion(-) diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index bf2a3aebd2..aa9703f9eb 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -41,6 +41,8 @@ def forward(self, x): class TestDumpPartitionedArtifact(unittest.TestCase): + """Tests dumping the partition artifact in ArmTester. 
Both to file and to stdout.""" + def _tosa_MI_pipeline(self, module: torch.nn.Module, dump_file=None): ( ArmTester( @@ -96,6 +98,8 @@ def test_BI_artifact(self): class TestNumericalDiffPrints(unittest.TestCase): + """Tests trigging the exception printout from the ArmTester's run and compare function.""" + def test_numerical_diff_prints(self): model = Linear(20, 30) tester = ( diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index eae5d4358a..248153a518 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -22,8 +22,9 @@ class TestMobileNetV2(unittest.TestCase): + """Tests MobileNetV2.""" - mv2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights) + mv2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT) mv2 = mv2.eval() normalize = transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 622d811822..3bd2b2605c 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -17,6 +17,8 @@ class TestSimpleAdd(unittest.TestCase): + """Tests a single add op, x+x and x+y.""" + class Add(torch.nn.Module): test_parameters = [ (torch.FloatTensor([1, 2, 3, 5, 7]),), diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py index fb2609939f..32a0e5555a 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -28,6 +28,8 @@ class TestAvgPool2d(unittest.TestCase): + """Tests AvgPool2d.""" + class AvgPool2d(torch.nn.Module): def __init__( self, diff --git a/backends/arm/test/ops/test_batch_norm.py b/backends/arm/test/ops/test_batch_norm.py index 0d6f9dea2c..4935e910d6 100644 --- a/backends/arm/test/ops/test_batch_norm.py +++ b/backends/arm/test/ops/test_batch_norm.py @@ -497,6 +497,8 @@ class TestBatchNorm2d(unittest.TestCase): + """Tests BatchNorm2d.""" + class BatchNorm2d(torch.nn.Module): def __init__( self, diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 2fc9b338cf..8386283f24 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -25,6 +25,8 @@ class TestSimpleClone(unittest.TestCase): + """Tests clone.""" + class Clone(torch.nn.Module): sizes = [10, 15, 50, 100] test_parameters = [(torch.ones(n),) for n in sizes] diff --git a/backends/arm/test/ops/test_conv.py b/backends/arm/test/ops/test_conv.py index 614d056072..9ebfe77da2 100644 --- a/backends/arm/test/ops/test_conv.py +++ b/backends/arm/test/ops/test_conv.py @@ -244,6 +244,8 @@ def forward(self, x): class TestConv2D(unittest.TestCase): + """Tests Conv2D, both single ops and multiple Convolutions in series.""" + def _test_conv2d_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 41f76ccbb7..88006df1a0 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -154,6 +154,8 @@ def forward(self, x): class TestConvCombos(unittest.TestCase): + """Tests conv combined with other ops.""" + def _test_conv_combo_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 
8389b423e5..9b3f79e6a1 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -132,6 +132,9 @@ class TestDepthwiseConv2D(unittest.TestCase): + """Tests Conv2D where groups == in_channels and out_channels = K * in_channels. This + is a special case enables depthwise convolution.""" + def _test_dw_conv2d_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index b13581dca1..60a0b8a4cc 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -78,6 +78,8 @@ class TestDiv(unittest.TestCase): + """Tests division""" + class Div(torch.nn.Module): def __init__( self, diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py index 4f01b1c8f9..1be7f59ab8 100644 --- a/backends/arm/test/ops/test_full.py +++ b/backends/arm/test/ops/test_full.py @@ -19,6 +19,8 @@ class TestFull(unittest.TestCase): + """Tests the full op which creates a tensor of a given shape filled with a given value.""" + class Full(torch.nn.Module): # A single full op def forward(self): diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 61117ad7fa..33f62955ec 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -91,6 +91,7 @@ class TestLinear(unittest.TestCase): + """tests the linear operation y = Ax + b""" _edge_compile_config: EdgeCompileConfig = EdgeCompileConfig( _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index 433661e99e..e0db958f74 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -40,6 +40,8 @@ class TestMeanDim(unittest.TestCase): + """Tests MeanDim, called AdaptiveAvgPool2d in Pytorch.""" + class MeanDim(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index b2ef115dad..b3b6230daa 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -28,6 +28,8 @@ class TestSoftmax(unittest.TestCase): + """Tests softmax.""" + class Softmax(torch.nn.Module): def __init__(self, dim: int = -1): super().__init__() diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index 7eda0d9cc2..1f51261bf7 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -25,6 +25,8 @@ class TestSimpleView(unittest.TestCase): + """Tests the view operation.""" + class View(torch.nn.Module): sizes = [10, 15, 50, 100] diff --git a/backends/arm/test/passes/test_tag_io_quant_pass.py b/backends/arm/test/passes/test_tag_io_quant_pass.py index 8757cf99d8..9f292bb7ca 100644 --- a/backends/arm/test/passes/test_tag_io_quant_pass.py +++ b/backends/arm/test/passes/test_tag_io_quant_pass.py @@ -22,6 +22,7 @@ def forward(self, x): class TestTagIOQuantPass(unittest.TestCase): + """Tests the TagIOQuantPass which tags q/dq nodes on model inputs and outputs to not include them in our partitions.""" def _tosa_BI_u55_pipeline(self, module: torch.nn.Module): ( From 28cfabb58e01c81dc1587180597f974a870a6309 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 29 Jul 2024 21:29:25 -0700 Subject: [PATCH 22/75] Fix use_sdpa_with_kv_cache option (#4456) Summary: Pull Request resolved: 
https://github.com/pytorch/executorch/pull/4456

As titled. In `export_llava.py`, `export_text_model()` needs to respect the `use_sdpa_with_kv_cache_op` option.

Reviewed By: cccclai

Differential Revision: D60431561

fbshipit-source-id: 63d49f39339435fb16f0c1c62288fd31c86b3be8
---
 examples/models/llava/export_llava.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py
index f57823a90a..7cf14e07d1 100644
--- a/examples/models/llava/export_llava.py
+++ b/examples/models/llava/export_llava.py
@@ -83,11 +83,14 @@ def forward(self, input_pos, embeddings):
     )
     quant_transform = get_quant_weight_transform(args, dtype_override, False)
     pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
-
+    source_transforms = []
+    if llava.use_sdpa_with_kv_cache_op:
+        source_transforms.append(replace_sdpa_with_custom_op)
+    source_transforms.append(quant_transform)
     manager = (
         text_model_em.set_output_dir("./")
         .to_dtype(dtype_override)
-        .source_transform([replace_sdpa_with_custom_op, quant_transform])
+        .source_transform(source_transforms)
         .capture_pre_autograd_graph()
         .pt2e_quantize(quantizers)
     )

From b7c8378d57b0e18d30ff30197125a89744d16d70 Mon Sep 17 00:00:00 2001
From: Alexey Kozhevnikov
Date: Tue, 30 Jul 2024 09:38:13 -0700
Subject: [PATCH 23/75] nop validation during build (#4449)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4449

Adds validation logic to the "build targets" implementation. The actual validation is a no-op in this diff; the implementation is in the next diffs in the stack.

I need to use late bindings in order to keep the validation logic in a separate crate, because `buck2_build_api` depends on validation, while validation depends on materialization from `buck2_build_api`.

Reviewed By: stepancheg

Differential Revision: D60238806

fbshipit-source-id: e1484731ce099189555bd306c1f93bab91da7de8
---
 shim/third-party/rust/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shim/third-party/rust/Cargo.toml b/shim/third-party/rust/Cargo.toml
index 0b8fa9f08e..718d9ea5a6 100644
--- a/shim/third-party/rust/Cargo.toml
+++ b/shim/third-party/rust/Cargo.toml
@@ -169,7 +169,7 @@ rustyline = "11.0"
 scopeguard = "1.0.0"
 sequence_trie = "0.3.6"
 serde = { version = "1.0.173", features = ["derive", "rc"] }
-serde_json = "1.0.48"
+serde_json = { version = "1.0.48", features = ["raw_value"] }
 sha1 = "0.10"
 sha2 = "0.10"
 shlex = "1.3"

From da7ca6ff22804ae519b8d3f9dd085c885b94e5bc Mon Sep 17 00:00:00 2001
From: Stephen Jia
Date: Tue, 30 Jul 2024 12:10:32 -0700
Subject: [PATCH 24/75] Fix build error (#4464)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4464

Fix an internal build error with BUCK.
Reviewed By: jorgep31415 Differential Revision: D60458756 fbshipit-source-id: a13b3f2de6754dda86ac73eb0f8e24de60a0a98a --- backends/vulkan/tools/gpuinfo/TARGETS | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/vulkan/tools/gpuinfo/TARGETS b/backends/vulkan/tools/gpuinfo/TARGETS index e9dd22e92d..10e3acb4b8 100644 --- a/backends/vulkan/tools/gpuinfo/TARGETS +++ b/backends/vulkan/tools/gpuinfo/TARGETS @@ -23,6 +23,7 @@ buck_filegroup( vulkan_spv_shader_lib( name = "gpuinfo_shader_lib", + is_fbcode = True, spv_filegroups = { ":gpuinfo_shaders": "glsl", }, From ea0c017224bb16c0455e5b8deb700b5638103652 Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 13:31:12 -0700 Subject: [PATCH 25/75] Add 3D Texture Bandwidth metric (#4336) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4336 This diff introduces a profiler that obtains the maximum and minimum bandwidth for reading unique addresses from 3D textures in each of its dimensions, using the following shader, where A is a 3D texture and B is a writeonly buffer. The calculation of the texel position will depend on the dimension that is being benchmarked x : pos = ivec3(offset, 0, 0) y : pos = ivec3(0, offset, 0) z : pos = ivec3(0, 0, offset) void main() { vec4 sum = vec4(0); const uint workgroup_width = local_group_size * niter * ${NUNROLL}; uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; int i = 0; for (; i < niter; ++i) { sum *= texelFetch(A, pos, 0); offset = (offset + local_group_size) & addr_mask; ... ... sum *= texelFetch(A, pos, 0); offset = (offset + local_group_size) & addr_mask; } vec4 zero = vec4(i>>31); B[gl_LocalInvocationID[0]] = sum + zero; } The address mask allows us to control how many unique addresses we are accessing. If the number of unique vectors we want to read is 3, the offset will jump between three unique addresses throughout the iterations, giving us the bandwidth for that specific size of data. If the size of the unique data read is larger than the work group size, then each run will have its own block of data to read, defined by the initial offset calculation, where the offset is obtained through the workgroup ID and the local invocation ID. Finally, we make sure to use the `sum` and `i ` variables so that the compiler's optimizer does not flatten the loops. For a Samsung S22, the bandwidth behaves like this for each of the dimensions. {F1767497386} Comparing the bandwidth for the X dimension to OpenCL, which was obtained through [ArchProbe](https://github.com/microsoft/ArchProbe), we can observe that, although the behavior is the same, Vulkan has an increased bandwidth for most access sizes. {F1767497972} Comparing to the bandwidth for buffers, we can observe that the bandwidth is similar to regular buffers, but still much smaller than UBOs at small access sizes. 
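To make the working-set control concrete, here is a small host-side sketch (an illustration only, not part of this patch) of the mask-as-modulo trick the shader relies on. Note that x % 2^n == x & (2^n - 1) only holds when the number of unique vectors is a power of two, which is why access sizes are swept in powers of two. The names mirror the shader constants; the loop bounds are arbitrary.

#include <cstdint>
#include <cstdio>
#include <set>

int main() {
  const uint32_t nvec = 16;             // unique texels to touch (power of two)
  const uint32_t addr_mask = nvec - 1;  // the mask acts as a cheap modulo
  const uint32_t local_group_size = 64;
  const uint32_t niter = 10;

  std::set<uint32_t> touched;
  for (uint32_t tid = 0; tid < local_group_size; ++tid) {
    uint32_t offset = tid & addr_mask;  // per-thread starting address
    for (uint32_t i = 0; i < niter; ++i) {
      touched.insert(offset);
      offset = (offset + local_group_size) & addr_mask;  // wraps within nvec
    }
  }
  // Every access lands inside the nvec unique addresses, so the benchmark can
  // dial the amount of unique data up or down and observe the bandwidth change.
  std::printf("unique addresses: %zu (at most %u)\n", touched.size(), nvec);
  return 0;
}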
{F1767497707} Reviewed By: jorgep31415 Differential Revision: D59980139 fbshipit-source-id: acc696ef21e6d07cf6f12d3790084faa64377093 --- .../tools/gpuinfo/glsl/tex_bandwidth.glsl | 59 +++++++++ .../tools/gpuinfo/glsl/tex_bandwidth.yaml | 15 +++ backends/vulkan/tools/gpuinfo/src/app.cpp | 112 ++++++++++++++++++ 3 files changed, 186 insertions(+) create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl new file mode 100644 index 0000000000..d848fc0475 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl @@ -0,0 +1,59 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +${layout_declare_sampler(0, "r", "A", DTYPE)} +${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int niter = 1; +layout(constant_id = 4) const int nvec = 1; +layout(constant_id = 5) const int local_group_size = 1; + +void main() { + // The address mask works as a modulo because x % 2^n == x & (2^n - 1). + // This will help us limit address accessing to a specific set of unique + // addresses depending on the access size we want to measure. + const int addr_mask = nvec - 1; + vec4 sum = vec4(0); + + // This is to distribute the accesses to unique addresses across the workgroups, once the + // size of the access excedes the workgroup width. + const uint workgroup_width = local_group_size * niter * ${NUNROLL}; + uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; + + int i = 0; + for (; i < niter; ++i){ + VEC4_T in_texel; + $for j in range(int(NUNROLL)): + $if DIM == 0: + in_texel = texelFetch(A, ivec3(offset, 0, 0), 0); + $elif DIM == 1: + in_texel = texelFetch(A, ivec3(0, offset, 0), 0); + $elif DIM == 2: + in_texel = texelFetch(A, ivec3(0, 0, offset), 0); + + sum *= in_texel; + + // On each unroll, a new unique address will be accessed through the offset, + // limited by the address mask to a specific set of unique addresses + offset = (offset + local_group_size) & addr_mask; + } + + // This is to ensure no compiler optimizations occur + vec4 zero = vec4(i>>31); + + B[gl_LocalInvocationID[0]] = sum + zero; +} diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml new file mode 100644 index 0000000000..84da6938fd --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +tex_bandwidth: + parameter_names_with_default_values: + DTYPE: float + NUNROLL: "16" + generate_variant_forall: + DIM: + - RANGE: [0, 2] + shader_variants: + - NAME: tex_bandwidth diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp index 8facdb5160..92eef84068 100644 --- a/backends/vulkan/tools/gpuinfo/src/app.cpp +++ b/backends/vulkan/tools/gpuinfo/src/app.cpp @@ -22,6 +22,9 @@ class App { uint32_t sm_count_; uint32_t nthread_logic_; uint32_t subgroup_size_; + uint32_t max_tex_width_; + uint32_t max_tex_height_; + uint32_t max_tex_depth_; public: App() { @@ -36,6 +39,9 @@ class App { nthread_logic_ = cl_device.getInfo(); buf_cache_size_ = cl_device.getInfo(); max_shared_mem_size_ = cl_device.getInfo(); + max_tex_width_ = cl_device.getInfo(); + max_tex_height_ = cl_device.getInfo(); + max_tex_depth_ = cl_device.getInfo(); VkPhysicalDeviceSubgroupProperties subgroup_props{}; VkPhysicalDeviceProperties2 props2{}; @@ -54,6 +60,9 @@ class App { std::cout << "Cache Size," << buf_cache_size_ << std::endl; std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl; std::cout << "SubGroup Size," << subgroup_size_ << std::endl; + std::cout << "MaxTexWidth," << max_tex_width_ << std::endl; + std::cout << "MaxTexHeight," << max_tex_height_ << std::endl; + std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl; } void reg_count() { @@ -308,6 +317,15 @@ class App { << std::endl; } + std::vector _whd_to_nchw(std::vector sizes) { + const int64_t W = sizes[0]; + const int64_t H = sizes[1]; + const int64_t D = sizes[2]; + + // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} + return {1, D * 4, H, W}; + } + public: void buf_bandwidth() { std::cout << "\n------ Memory Bandwidth ------" << std::endl; @@ -323,12 +341,105 @@ class App { const uint32_t RANGE = 128 * 1024 * 1024; _bandwidth("UBO", RANGE); } + void shared_mem_bandwidth() { std::cout << "\n------ Shared Bandwidth ------" << std::endl; const uint32_t RANGE = max_shared_mem_size_; _bandwidth("Shared", RANGE); } + void tex_bandwidth() { + for (int dim = 0; dim < 3; dim++) { + std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" + << std::endl; + const uint32_t MAX_SIZE = dim == 0 ? max_tex_width_ + : dim == 1 ? max_tex_height_ + : max_tex_depth_; + + // rgba, float + const uint32_t VEC_WIDTH = 4; + const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); + const uint32_t NVEC = MAX_SIZE; + + const uint32_t RANGE = NVEC * VEC_SIZE; + + // Cache lines flushed + const uint32_t NFLUSH = 4; + // Number of loop unrolls. Changing this value requires an equal change in + // tex_bandwidth.yaml + const uint32_t NUNROLL = 16; + // Number of iterations. Increasing this value reduces noise in exchange + // for higher latency. 
+ const uint32_t NITER = 10; + // Number of memory reads per thread + const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; + // Number of threads needed to read all texells + const uint32_t NTHREAD = NVEC; + // Occupy all threads + const uint32_t local_x = nthread_logic_; + // Ensure that global is a multiple of local, and distribute across all + // SMs + const uint32_t global_x = + (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH; + + auto shader_name = "tex_bandwidth_" + std::to_string(dim); + + std::vector sizes_whd = {MAX_SIZE, 1, 1}; + if (dim == 1) { + sizes_whd = {1, MAX_SIZE, 1}; + } else if (dim == 2) { + sizes_whd = {1, 1, MAX_SIZE}; + } + auto sizes_nchw = _whd_to_nchw(sizes_whd); + + vTensor in_tensor = + api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); + + auto bench = [&](uint32_t access_size, uint32_t dim) { + // Number of texels that fit in this iteration + const uint32_t ntexel_access = access_size / VEC_SIZE; + + StorageBuffer out_buf( + context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {global_x, 1, 1}, + {local_x, 1, 1}, + {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)}, + VK_NULL_HANDLE, + 0, + in_tensor.image(), + out_buf.buffer()); + }); + + const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; + double gbps = SIZE_TRANS * 1e-3 / time; + std::cout << "Texture bandwidth accessing \t" << access_size + << "\tB unique data is \t" << gbps << " \tgbps (\t" << time + << "\tus)" << std::endl; + return gbps; + }; + + double max_bandwidth = 0; + double min_bandwidth = DBL_MAX; + for (uint32_t access_size = VEC_SIZE; access_size < RANGE; + access_size *= 2) { + double gbps = bench(access_size, dim); + max_bandwidth = std::max(gbps, max_bandwidth); + min_bandwidth = std::min(gbps, min_bandwidth); + } + + std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth + << std::endl; + std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth + << std::endl; + } + } + // Warp size is a difficult metric to obtain because the hardware limitations // do not always coincide with the way the SM divides the workload. For // instance, the hardware can have a warp size of 64 threads, but an SM might @@ -492,6 +603,7 @@ int main(int argc, const char** argv) { app.ubo_bandwidth(); app.shared_mem_bandwidth(); app.warp_size(); + app.tex_bandwidth(); return 0; } From 298b625a9fdd7eb1695f7552faa8d9c8a88208ef Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 13:31:12 -0700 Subject: [PATCH 26/75] Add config file support for constants and test control (#4337) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4337 Now that the tool is getting larger, a configuration file for defining which tests to run and which to skip, as well as specifying some values like thresholds and ranges, comes in handy. This diff adds support for a JSON config file with specifications for each test. 
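For reference, a minimal sketch (assumed usage, exercising the same folly APIs as this diff) of how such a config is parsed and queried with folly::dynamic; the inline JSON is a trimmed, hypothetical config with a single test entry:

#include <folly/dynamic.h>
#include <folly/json.h>
#include <iostream>
#include <string>

int main() {
  const std::string json_str = R"({
    "reg_count": {"enabled": true, "threshold": 3, "compensate": 0.1}
  })";
  folly::dynamic config = folly::parseJson(json_str);

  if (config["reg_count"]["enabled"].getBool()) {
    // Integer and floating-point JSON numbers are distinct types in
    // folly::dynamic; asDouble() converts either one to double.
    const double threshold = config["reg_count"]["threshold"].asDouble();
    std::cout << "reg_count.threshold = " << threshold << std::endl;
  }
  return 0;
}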
Reviewed By: jorgep31415 Differential Revision: D60060188 fbshipit-source-id: d6ee9cbff52b3ab13e9a06a42dd54aec002fae11 --- backends/vulkan/tools/gpuinfo/config.json | 43 ++++++ backends/vulkan/tools/gpuinfo/src/app.cpp | 151 +++++++++++++++++----- 2 files changed, 161 insertions(+), 33 deletions(-) create mode 100644 backends/vulkan/tools/gpuinfo/config.json diff --git a/backends/vulkan/tools/gpuinfo/config.json b/backends/vulkan/tools/gpuinfo/config.json new file mode 100644 index 0000000000..1efb9690fe --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/config.json @@ -0,0 +1,43 @@ +{ + "reg_count": { + "enabled": true, + "threshold": 3, + "compensate": 0.1 + }, + "buf_cacheline_size": { + "enabled": true, + "threshold": 10, + "compensate": 0.1 + }, + "buffer_bandwidth": { + "enabled": true, + "range": 134217728, + "nflush": 4, + "nunroll": 16, + "niter": 10 + }, + "ubo_bandwidth": { + "enabled": true, + "range": 134217728, + "nflush": 4, + "nunroll": 16, + "niter": 10 + }, + "shared_mem_bandwidth": { + "enabled": true, + "nflush": 4, + "nunroll": 16, + "niter": 10 + }, + "warp_size": { + "enabled": true, + "threshold": 3, + "compensate": 0.1 + }, + "tex_bandwidth": { + "enabled": true, + "nflush": 4, + "nunroll": 16, + "niter": 10 + } +} diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp index 92eef84068..c33e8a011d 100644 --- a/backends/vulkan/tools/gpuinfo/src/app.cpp +++ b/backends/vulkan/tools/gpuinfo/src/app.cpp @@ -8,6 +8,8 @@ #include #include +#include +#include #include #include "stats.h" @@ -25,6 +27,46 @@ class App { uint32_t max_tex_width_; uint32_t max_tex_height_; uint32_t max_tex_depth_; + folly::dynamic config_; + + std::vector _whd_to_nchw(std::vector sizes) { + const int64_t W = sizes[0]; + const int64_t H = sizes[1]; + const int64_t D = sizes[2]; + + // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} + return {1, D * 4, H, W}; + } + + float _get_config(const std::string& test, const std::string& key) { + if (config_[test].empty()) { + throw std::runtime_error("Missing config for " + test); + } + + if (!config_[test][key].isNumber()) { + throw std::runtime_error( + "Config for " + test + "." + key + " is not a number"); + } + + float value; + if (config_[test][key].isDouble()) { + value = config_[test][key].getDouble(); + } else { + value = config_[test][key].getInt(); + } + + std::cout << "Read value for " << test << "." 
<< key << " = " << value + << std::endl; + return value; + } + + bool _enabled(const std::string& test) { + if (config_.empty() || config_[test].empty() || + !config_[test]["enabled"].isBool()) { + return true; + } + return config_[test]["enabled"].getBool(); + } public: App() { @@ -65,16 +107,32 @@ class App { std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl; } + void load_config(std::string file_path) { + std::ifstream file(file_path); + std::stringstream buffer; + buffer << file.rdbuf(); + const std::string json_str = buffer.str(); + if (json_str.empty()) { + throw std::runtime_error( + "Failed to read config file from " + file_path + "."); + } + config_ = folly::parseJson(json_str); + } + void reg_count() { + if (!_enabled("reg_count")) { + std::cout << "Skipped Register Count" << std::endl; + return; + } + std::cout << std::endl; std::cout << "------ Register Count ------" << std::endl; const uint32_t NREG_MIN = 1; const uint32_t NREG_MAX = 512; const uint32_t NREG_STEP = 1; - // TODO: Make these values configurable - const double COMPENSATE = 0.01; - const double THRESHOLD = 3; + const double COMPENSATE = _get_config("reg_count", "compensate"); + const double THRESHOLD = _get_config("reg_count", "threshold"); const uint32_t NGRP_MIN = 1; const uint32_t NGRP_MAX = 64; @@ -175,12 +233,16 @@ class App { } void buf_cacheline_size() { + if (!_enabled("buf_cacheline_size")) { + std::cout << "Skipped Buffer Cacheline Size" << std::endl; + return; + } + std::cout << std::endl; std::cout << "------ Buffer Cacheline Size ------" << std::endl; - // TODO: Make these values configurable - const double COMPENSATE = 0.01; - const double THRESHOLD = 10; + const double COMPENSATE = _get_config("buf_cacheline_size", "compensate"); + const double THRESHOLD = _get_config("buf_cacheline_size", "threshold"); const uint32_t PITCH = buf_cache_size_ / nthread_logic_; const uint32_t BUF_SIZE = buf_cache_size_; @@ -237,15 +299,23 @@ class App { private: void _bandwidth(std::string memtype, uint32_t range) { - // TODO: Make these values configurable + auto memtype_lower = memtype; + std::transform( + memtype_lower.begin(), + memtype_lower.end(), + memtype_lower.begin(), + [](unsigned char c) { return std::tolower(c); }); + + auto test_name = memtype_lower + "_bandwidth"; + // Cache lines flushed - const uint32_t NFLUSH = 4; + const uint32_t NFLUSH = _get_config(test_name, "nflush"); // Number of loop unrolls. Changing this value requires an equal change in // buf_bandwidth.yaml - const uint32_t NUNROLL = 16; + const uint32_t NUNROLL = _get_config(test_name, "nunroll"); // Number of iterations. Increasing this value reduces noise in exchange for // higher latency. 
- const uint32_t NITER = 10; + const uint32_t NITER = _get_config(test_name, "niter"); // Vector dimensions (vec4) const uint32_t VEC_WIDTH = 4; const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); @@ -273,12 +343,6 @@ class App { context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); vkapi::PipelineBarrier pipeline_barrier{}; - auto memtype_lower = memtype; - std::transform( - memtype_lower.begin(), - memtype_lower.end(), - memtype_lower.begin(), - [](unsigned char c) { return std::tolower(c); }); auto shader_name = "buf_bandwidth_" + memtype_lower; auto time = benchmark_on_gpu(shader_name, 10, [&]() { @@ -317,38 +381,49 @@ class App { << std::endl; } - std::vector _whd_to_nchw(std::vector sizes) { - const int64_t W = sizes[0]; - const int64_t H = sizes[1]; - const int64_t D = sizes[2]; - - // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} - return {1, D * 4, H, W}; - } - public: void buf_bandwidth() { + if (!_enabled("buffer_bandwidth")) { + std::cout << "Skipped Memory Bandwidth" << std::endl; + return; + } + std::cout << "\n------ Memory Bandwidth ------" << std::endl; // Maximum memory space read - 128MB // For regular devices, bandwidth plateaus at less memory than this, so more // is not needed. - const uint32_t RANGE = 128 * 1024 * 1024; + const uint32_t RANGE = _get_config("buffer_bandwidth", "range"); _bandwidth("Buffer", RANGE); } void ubo_bandwidth() { + if (!_enabled("ubo_bandwidth")) { + std::cout << "Skipped UBO Bandwidth" << std::endl; + return; + } + std::cout << "\n------ UBO Bandwidth ------" << std::endl; - const uint32_t RANGE = 128 * 1024 * 1024; + const uint32_t RANGE = _get_config("ubo_bandwidth", "range"); _bandwidth("UBO", RANGE); } void shared_mem_bandwidth() { + if (!_enabled("shared_mem_bandwidth")) { + std::cout << "Skipped Shared Memory Bandwidth" << std::endl; + return; + } + std::cout << "\n------ Shared Bandwidth ------" << std::endl; const uint32_t RANGE = max_shared_mem_size_; _bandwidth("Shared", RANGE); } void tex_bandwidth() { + if (!_enabled("tex_bandwidth")) { + std::cout << "Skipped Texture Bandwidth" << std::endl; + return; + } + for (int dim = 0; dim < 3; dim++) { std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" << std::endl; @@ -364,13 +439,13 @@ class App { const uint32_t RANGE = NVEC * VEC_SIZE; // Cache lines flushed - const uint32_t NFLUSH = 4; + const uint32_t NFLUSH = _get_config("tex_bandwidth", "nflush"); // Number of loop unrolls. Changing this value requires an equal change in // tex_bandwidth.yaml - const uint32_t NUNROLL = 16; + const uint32_t NUNROLL = _get_config("tex_bandwidth", "nunroll"); // Number of iterations. Increasing this value reduces noise in exchange // for higher latency. - const uint32_t NITER = 10; + const uint32_t NITER = _get_config("tex_bandwidth", "niter"); // Number of memory reads per thread const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; // Number of threads needed to read all texells @@ -458,6 +533,11 @@ class App { // In Case 2, like in Adreno, the driver might decide to pack multiple works // together and dispatch them at once. void warp_size(bool verbose = false) { + if (!_enabled("warp_size")) { + std::cout << "Skipped Warp Size" << std::endl; + return; + } + std::cout << "\n------ Warp Size ------" << std::endl; // Method A: Stress test with a kernel that uses complex ALU operations like @@ -467,8 +547,8 @@ class App { // This timing-based method helps us identify physical warp sizes. 
It also // helps with Case 2, when threads of multiple warps are managed by the same // scheduler at the same time. - const double COMPENSATE = 0.01; - const double THRESHOLD = 3; + const double COMPENSATE = _get_config("warp_size", "compensate"); + const double THRESHOLD = _get_config("warp_size", "threshold"); uint32_t NITER; @@ -596,7 +676,12 @@ class App { int main(int argc, const char** argv) { App app; - // TODO: Allow user to skip tests + std::string file_path = "config.json"; + if (argc > 1) { + file_path = argv[1]; + }; + app.load_config(file_path); + app.reg_count(); app.buf_cacheline_size(); app.buf_bandwidth(); From 5867129887571ea6f4c064f5447702e77135a923 Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 13:31:12 -0700 Subject: [PATCH 27/75] Add metric for 3D texture max concurrent cache read (#4421) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4421 This diff introduces a metric to calculate the maximum concurrent cache line accesses for each dimension of a 3D texture. The experiment works by allowing each thread to access a different texel on the texture and slowly increasing the number of threads, until the cache line is no longer able to handle all simultaneous accesses. By detecting a jump in latency, we can define the optimal maximum size that can be accessed concurrently on each dimension. NOTE: ArchProbe uses this information to[ obtain a supposed cache line size for textures](https://fburl.com/98xiou3g). However, it is unclear why they define the cache line size as being the ratio between the larger concurrency value over the lower, times the texel size. It is also unclear how to extend their calculations to three dimensions. TODO: Understand the relationship between concurrency and cache line size, and modify this metric to output the cache line size. For a Samsung S22, the latency graph looks like this: {F1780375117} Reviewed By: copyrightly Differential Revision: D60246121 fbshipit-source-id: c2bac010077bf14e95f70bb6038acbb47a534dde --- backends/vulkan/tools/gpuinfo/config.json | 5 + .../gpuinfo/glsl/tex_cacheline_concurr.glsl | 39 ++++++++ .../gpuinfo/glsl/tex_cacheline_concurr.yaml | 14 +++ backends/vulkan/tools/gpuinfo/src/app.cpp | 98 ++++++++++++++++++- 4 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml diff --git a/backends/vulkan/tools/gpuinfo/config.json b/backends/vulkan/tools/gpuinfo/config.json index 1efb9690fe..7307f29503 100644 --- a/backends/vulkan/tools/gpuinfo/config.json +++ b/backends/vulkan/tools/gpuinfo/config.json @@ -39,5 +39,10 @@ "nflush": 4, "nunroll": 16, "niter": 10 + }, + "tex_cacheline_concurr": { + "enabled": true, + "threshold": 3, + "compensate": 0.1 } } diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl new file mode 100644 index 0000000000..62659c7bb8 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl @@ -0,0 +1,39 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +${layout_declare_sampler(0, "r", "in_tex", DTYPE)} +${layout_declare_buffer(1, "w", "out_buf", DTYPE, "PRECISION", False)} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int niter = 1; + +void main() { + vec4 sum = vec4(0); + int i = 0; + for (; i < niter; ++i){ + $if DIM == 0: + sum += texelFetch(in_tex, ivec3(gl_GlobalInvocationID[0], 0, 0), 0); + $elif DIM == 1: + sum += texelFetch(in_tex, ivec3(0, gl_GlobalInvocationID[0], 0), 0); + $elif DIM == 2: + sum += texelFetch(in_tex, ivec3(0, 0, gl_GlobalInvocationID[0]), 0); + } + + // This is to ensure no compiler optimizations occur + vec4 zero = vec4(i>>31); + + out_buf[0] = sum + zero; +} diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml new file mode 100644 index 0000000000..6b557c9f66 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +tex_cacheline_concurr: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DIM: + - RANGE: [0, 2] + shader_variants: + - NAME: tex_cacheline_concurr diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp index c33e8a011d..2b1621db62 100644 --- a/backends/vulkan/tools/gpuinfo/src/app.cpp +++ b/backends/vulkan/tools/gpuinfo/src/app.cpp @@ -291,12 +291,107 @@ class App { if (stride >= MAX_STRIDE) { std::cout << "Unable to conclude a top level buffer cacheline size." << std::endl; - cacheline_size = MAX_STRIDE; + cacheline_size = MAX_STRIDE * sizeof(float); } std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; } + // Textures are drastically different from buffers in terms of data layout. + // While buffers are a contiguous range of memory, textures are opaque objects + // defined by the vendor and it is possible that nearby points of data are not + // neighboring in memory. Likewise, data points are accessed in + // multi-dimensional patches instead of simple lines. This makes the stride + // method for figuring out the cache line size not applicable. To go around + // this, this experiment runs an increasing amount of threads accessing + // different datapoints in the texture and measures latency. If the cache line + // is big enough to contain all requested data for the amount of threads, + // latency will be low. When there are more threads and hence more data than + // what a single cache line can handle, a second line must be fetched, + // increasing latency in a measurable way. 
+ void tex_cacheline_concurr() { + if (!_enabled("tex_cacheline_concurr")) { + std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl; + return; + } + + const uint32_t TEXEL_WIDTH = 4; + const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH; + + const double COMPENSATE = + _get_config("tex_cacheline_concurr", "compensate"); + const double THRESHOLD = _get_config("tex_cacheline_concurr", "threshold"); + + for (int dim = 0; dim < 3; ++dim) { + std::cout << std::endl; + std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim + << ") ------" << std::endl; + + uint32_t NITER; + + const uint32_t IMG_OTHER_EDGE = dim == 0 ? max_tex_width_ + : dim == 1 ? max_tex_height_ + : max_tex_depth_; + + const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_OTHER_EDGE); + + auto bench = [&](uint32_t nthread) { + std::vector sizes_whd = { + max_tex_width_, max_tex_height_, max_tex_depth_}; + + auto sizes_nchw = _whd_to_nchw(sizes_whd); + + vTensor in_tensor = + api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); + + StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); + + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim); + + auto time = benchmark_on_gpu(shader_name, 100, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {nthread, 1, 1}, + {nthread, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + in_tensor.image(), + out_buf.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1); }); + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t nthread = 1; + for (; nthread <= MAX_NTHREAD; ++nthread) { + double time = bench(nthread); + std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time + << std::endl; + + if (dj.push(time)) { + auto max_concurrency = nthread - 1; + std::cout << "TextureCachelineConcurrencyDim" << dim << " (B)," + << max_concurrency * TEXEL_SIZE << std::endl; + break; + } + } + if (nthread >= MAX_NTHREAD) { + std::cout + << "Unable to conclude an optimal texture cacheline concurrency for dim " + << dim << std::endl; + }; + } + + // TODO: Use concurrency information to obtain the cache line size for + // textures as done in https://fburl.com/98xiou3g + } + private: void _bandwidth(std::string memtype, uint32_t range) { auto memtype_lower = memtype; @@ -689,6 +784,7 @@ int main(int argc, const char** argv) { app.shared_mem_bandwidth(); app.warp_size(); app.tex_bandwidth(); + app.tex_cacheline_concurr(); return 0; } From e03181d1ff078d8b534e53aeab7ed4cce77ea7e3 Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 13:31:12 -0700 Subject: [PATCH 28/75] Refactor and class split (#4432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4432 Big classes are scary ☹️ This diff subdivides the tests into categories, places them as functions inside the gpuinfo namespace, instead of as part of the App class, and the App class is now only for persisting device information and configuration. 
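To show the effect of the split, here is a sketch of what the refactored entry point plausibly looks like after this diff. The exact list of calls is an assumption (only architecture.h and a truncated buffers.h are visible below), but each test is now a free function in the gpuinfo namespace that takes the shared App:

#include <string>

#include "app.h"
#include "architecture.h"

int main(int argc, const char** argv) {
  gpuinfo::App app;  // queries and prints device properties once

  std::string file_path = "config.json";
  if (argc > 1) {
    file_path = argv[1];
  }
  app.load_config(file_path);

  // Free functions consume the persisted device info and config.
  gpuinfo::reg_count(app);
  gpuinfo::warp_size(app);
  // Hypothetical: the buffer and texture tests follow the same pattern,
  // e.g. gpuinfo::buf_bandwidth(app); gpuinfo::tex_bandwidth(app);
  return 0;
}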
Reviewed By: jorgep31415 Differential Revision: D60290882 fbshipit-source-id: b57f6e824be33320c01eebc5d5b72cbd2ad4c0cf --- backends/vulkan/tools/gpuinfo/config.json | 2 +- backends/vulkan/tools/gpuinfo/include/app.h | 114 +++ .../tools/gpuinfo/include/architecture.h | 285 +++++++ .../vulkan/tools/gpuinfo/include/buffers.h | 203 +++++ .../vulkan/tools/gpuinfo/include/textures.h | 207 +++++ backends/vulkan/tools/gpuinfo/include/utils.h | 9 + backends/vulkan/tools/gpuinfo/src/app.cpp | 790 ------------------ backends/vulkan/tools/gpuinfo/src/main.cpp | 40 + 8 files changed, 859 insertions(+), 791 deletions(-) create mode 100644 backends/vulkan/tools/gpuinfo/include/app.h create mode 100644 backends/vulkan/tools/gpuinfo/include/architecture.h create mode 100644 backends/vulkan/tools/gpuinfo/include/buffers.h create mode 100644 backends/vulkan/tools/gpuinfo/include/textures.h delete mode 100644 backends/vulkan/tools/gpuinfo/src/app.cpp create mode 100644 backends/vulkan/tools/gpuinfo/src/main.cpp diff --git a/backends/vulkan/tools/gpuinfo/config.json b/backends/vulkan/tools/gpuinfo/config.json index 7307f29503..afb5cbc6c5 100644 --- a/backends/vulkan/tools/gpuinfo/config.json +++ b/backends/vulkan/tools/gpuinfo/config.json @@ -23,7 +23,7 @@ "nunroll": 16, "niter": 10 }, - "shared_mem_bandwidth": { + "shared_bandwidth": { "enabled": true, "nflush": 4, "nunroll": 16, diff --git a/backends/vulkan/tools/gpuinfo/include/app.h b/backends/vulkan/tools/gpuinfo/include/app.h new file mode 100644 index 0000000000..a46e9e6b9a --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/app.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include "utils.h" + +namespace gpuinfo { + +class App { + private: + folly::dynamic config_; + + public: + size_t buf_cache_size; + uint32_t max_shared_mem_size; + uint32_t sm_count; + uint32_t nthread_logic; + uint32_t subgroup_size; + uint32_t max_tex_width; + uint32_t max_tex_height; + uint32_t max_tex_depth; + + App() { + context()->initialize_querypool(); + + std::cout << context()->adapter_ptr()->stringize() << std::endl + << std::endl; + + auto cl_device = get_cl_device(); + + sm_count = cl_device.getInfo(); + nthread_logic = cl_device.getInfo(); + buf_cache_size = cl_device.getInfo(); + max_shared_mem_size = cl_device.getInfo(); + max_tex_width = cl_device.getInfo(); + max_tex_height = cl_device.getInfo(); + max_tex_depth = cl_device.getInfo(); + + VkPhysicalDeviceSubgroupProperties subgroup_props{}; + VkPhysicalDeviceProperties2 props2{}; + + props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + props2.pNext = &subgroup_props; + subgroup_props.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; + vkGetPhysicalDeviceProperties2( + context()->adapter_ptr()->physical_handle(), &props2); + subgroup_size = subgroup_props.subgroupSize; + + std::cout << std::endl; + std::cout << "SM count," << sm_count << std::endl; + std::cout << "Logic Thread Count," << nthread_logic << std::endl; + std::cout << "Cache Size," << buf_cache_size << std::endl; + std::cout << "Shared Memory Size," << max_shared_mem_size << std::endl; + std::cout << "SubGroup Size," << subgroup_size << std::endl; + std::cout << "MaxTexWidth," << max_tex_width << std::endl; + std::cout << "MaxTexHeight," << max_tex_height << std::endl; + std::cout << "MaxTexDepth," << max_tex_depth << std::endl; + } + + float get_config(const std::string& test, const std::string& key) const { + if (config_[test].empty()) { + throw std::runtime_error("Missing config for " + test); + } + + if (!config_[test][key].isNumber()) { + throw std::runtime_error( + "Config for " + test + "." + key + " is not a number"); + } + + float value; + if (config_[test][key].isDouble()) { + value = config_[test][key].getDouble(); + } else { + value = config_[test][key].getInt(); + } + + std::cout << "Read value for " << test << "." << key << " = " << value + << std::endl; + return value; + } + + bool enabled(const std::string& test) const { + if (config_.empty() || config_[test].empty() || + !config_[test]["enabled"].isBool()) { + return true; + } + return config_[test]["enabled"].getBool(); + } + + void load_config(std::string file_path) { + std::ifstream file(file_path); + std::stringstream buffer; + buffer << file.rdbuf(); + const std::string json_str = buffer.str(); + if (json_str.empty()) { + throw std::runtime_error( + "Failed to read config file from " + file_path + "."); + } + config_ = folly::parseJson(json_str); + } +}; +} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h new file mode 100644 index 0000000000..0d312ee87c --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -0,0 +1,285 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include "app.h" +#include "stats.h" +#include "utils.h" + +using namespace vkapi; + +namespace gpuinfo { + +void reg_count(const App& app) { + if (!app.enabled("reg_count")) { + std::cout << "Skipped Register Count" << std::endl; + return; + } + + std::cout << std::endl; + std::cout << "------ Register Count ------" << std::endl; + const uint32_t NREG_MIN = 1; + const uint32_t NREG_MAX = 512; + const uint32_t NREG_STEP = 1; + + const double COMPENSATE = app.get_config("reg_count", "compensate"); + const double THRESHOLD = app.get_config("reg_count", "threshold"); + + const uint32_t NGRP_MIN = 1; + const uint32_t NGRP_MAX = 64; + const uint32_t NGRP_STEP = 1; + + uint32_t NITER; + + auto bench = [&](uint32_t ngrp, uint32_t nreg) { + StorageBuffer buffer(context(), vkapi::kFloat, 1); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "reg_count_" + std::to_string(nreg); + + auto time = benchmark_on_gpu(shader_name, 30, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {1, ngrp, 1}, + {1, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + buffer.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); + + uint32_t nreg_max; + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t nreg = NREG_MIN; + for (; nreg <= NREG_MAX; nreg += NREG_STEP) { + double time = bench(1, nreg); + std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << "\tus" + << std::endl; + if (dj.push(time)) { + nreg -= NREG_STEP; + nreg_max = nreg; + break; + } + } + if (nreg >= NREG_MAX) { + std::cout << "Unable to conclude a maximal register count" << std::endl; + nreg_max = NREG_STEP; + } else { + std::cout << nreg_max << " registers are available at most" << std::endl; + } + + auto find_ngrp_by_nreg = [&](const uint32_t nreg) { + DtJumpFinder<3> dj(COMPENSATE, THRESHOLD); + for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { + auto time = bench(ngrp, nreg); + std::cout << "Testing occupation (nreg=\t" << nreg << "\t); ngrp=\t" + << ngrp << "\t, time=\t" << time << "\tus" << std::endl; + + if (dj.push(time)) { + ngrp -= NGRP_STEP; + std::cout << "Using " << nreg << " registers can have " << ngrp + << " concurrent single-thread workgroups" << std::endl; + return ngrp; + } + } + std::cout + << "Unable to conclude a maximum number of concurrent single-thread workgroups when " + << nreg << " registers are occupied" << std::endl; + return (uint32_t)1; + }; + + uint32_t ngrp_full, ngrp_half; + ngrp_full = find_ngrp_by_nreg(nreg_max); + ngrp_half = find_ngrp_by_nreg(nreg_max / 2); + + std::string reg_ty; + + if (ngrp_full * 1.5 < ngrp_half) { + std::cout << "All physical threads in an sm share " << nreg_max + << " registers" << std::endl; + reg_ty = "Pooled"; + + } else { + std::cout << "Each physical thread has " << nreg_max << " registers" + << std::endl; + reg_ty = "Dedicated"; + } + + std::cout << std::endl << std::endl; + std::cout << "MaxRegisters," << nreg_max << std::endl; + std::cout << "ConcurrentWorkgroupsFullReg," << ngrp_full << std::endl; + std::cout << "ConcurrentWorkgroupsHalfReg," << ngrp_half << std::endl; + std::cout << "RegisterType," << reg_ty << std::endl; +} + +// Warp size is a difficult metric to obtain because the hardware limitations +// do not always coincide with the way the SM divides the workload. 
+
+// Warp size is a difficult metric to obtain because the hardware limitations
+// do not always coincide with the way the SM divides the workload. For
+// instance, the hardware can have a warp size of 64 threads, but an SM might
+// be able to simulate concurrency of 128 threads with a single scheduler.
+
+// Because of this, it is important to measure the warp size in different
+// ways that can evidence both the physical limitations of the hardware and
+// the actual behavior of the driver.
+
+// Additionally, the SM can behave in two different ways when the assigned
+// workload is smaller than the warp size.
+
+// In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty
+// threads and maintain a uniform workload.
+
+// In Case 2, like in Adreno, the driver might decide to pack multiple works
+// together and dispatch them at once.
+void warp_size(const App& app, const bool verbose = false) {
+  if (!app.enabled("warp_size")) {
+    std::cout << "Skipped Warp Size" << std::endl;
+    return;
+  }
+
+  std::cout << "\n------ Warp Size ------" << std::endl;
+
+  // Method A: Stress test with a kernel that uses complex ALU operations like
+  // integer division to avoid latency hiding. Increase the number of threads
+  // until a jump in latency is detected.
+
+  // This timing-based method helps us identify physical warp sizes. It also
+  // helps with Case 2, when threads of multiple warps are managed by the same
+  // scheduler at the same time.
+  const double COMPENSATE = app.get_config("warp_size", "compensate");
+  const double THRESHOLD = app.get_config("warp_size", "threshold");
+
+  uint32_t NITER;
+
+  auto bench = [&](uint32_t nthread) {
+    StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic);
+    vkapi::PipelineBarrier pipeline_barrier{};
+
+    auto shader_name = "warp_size_physical";
+
+    auto time = benchmark_on_gpu(shader_name, 10, [&]() {
+      context()->submit_compute_job(
+          VK_KERNEL_FROM_STR(shader_name),
+          pipeline_barrier,
+          // Large number of work groups selected to potentially saturate all
+          // ALUs and thus have a better baseline for comparison.
+          {nthread, 1024, 1},
+          {nthread, 1, 1},
+          {SV(NITER)},
+          VK_NULL_HANDLE,
+          0,
+          out_buf.buffer());
+    });
+
+    return time;
+  };
+
+  ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+  uint32_t warp_size = app.subgroup_size;
+  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+
+  // We increase the number of threads until we hit a jump in the data.
+  uint32_t nthread = 1;
+  for (; nthread <= app.nthread_logic; ++nthread) {
+    double time = bench(nthread);
+    std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)"
+              << std::endl;
+    if (dj.push(time)) {
+      warp_size = nthread - 1;
+      break;
+    }
+  }
+  if (nthread >= app.nthread_logic) {
+    std::cout
+        << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size"
+        << std::endl;
+  }
+
+  // Method B: Let all the threads in a warp race and atomically fetch-add
+  // a counter, then store the counter values to the output buffer in the
+  // scheduling order of these threads. If all the order numbers follow an
+  // ascending order, then the threads are likely executing within a warp.
+  // Threads in different warps are not managed by the same scheduler, so they
+  // would race for the same ID out of order, unaware of each other.
+
+  // This method evidences the actual driver behavior when running
+  // concurrency, regardless of the physical limitations of the hardware.
+
+  // Likewise, this method helps us identify warp sizes when the SM
+  // sub-divides its ALUs into independent groups, like the three execution
+  // engines in a Mali G76 core. It helps warp-probing in Case 1 because it
+  // doesn't depend on kernel timing, so the extra wait time doesn't lead to
+  // inaccuracy.
+  auto bench_sm = [&](uint32_t nthread) {
+    StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic);
+    vkapi::PipelineBarrier pipeline_barrier{};
+
+    auto shader_name = "warp_size_scheduler";
+
+    benchmark_on_gpu(shader_name, 1, [&]() {
+      context()->submit_compute_job(
+          VK_KERNEL_FROM_STR(shader_name),
+          pipeline_barrier,
+          {nthread, 1, 1},
+          {nthread, 1, 1},
+          {},
+          VK_NULL_HANDLE,
+          0,
+          out_buf.buffer());
+    });
+
+    std::vector<int32_t> data(app.nthread_logic);
+    copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes());
+
+    if (verbose) {
+      std::stringstream ss;
+      for (auto j = 0; j < nthread; ++j) {
+        ss << data[j] << " ";
+      }
+      std::cout << ss.str() << std::endl;
+    }
+
+    // Check up to which point the data is in ascending order.
+    int32_t last = -1;
+    int32_t j = 0;
+    for (; j < nthread; ++j) {
+      if (last >= data[j]) {
+        break;
+      }
+      last = data[j];
+    }
+
+    return j;
+  };
+
+  // Test increasing sizes until the data is no longer in ascending order.
+  uint32_t warp_size_scheduler = warp_size;
+  int i = 1;
+  for (; i <= app.nthread_logic; ++i) {
+    uint32_t nascend = bench_sm(i);
+    if (nascend != i) {
+      warp_size_scheduler = nascend;
+      break;
+    }
+  }
+  if (i > app.nthread_logic) {
+    std::cout << "Unable to conclude an SM Warp Size." << std::endl;
+  }
+
+  std::cout << "PhysicalWarpSize," << warp_size << std::endl;
+  std::cout << "SMWarpSize," << warp_size_scheduler << std::endl;
+}
+}; // namespace gpuinfo
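The probes in architecture.h above, and in buffers.h/textures.h below, all share one detection pattern. A condensed sketch, with `P_MIN`/`P_MAX`/`P_STEP`, `bench`, and `estimate` standing in for the per-test names:

```cpp
// 1) Calibrate NITER so a single bench() call runs long enough (>= 1000 us
//    here) to be timed reliably.
ensure_min_niter(1000, NITER, [&]() { return bench(P_MIN); });

// 2) Sweep the parameter, feeding each latency into a jump finder; the last
//    value before the first detected latency jump becomes the estimate.
uint32_t estimate = P_MAX;
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
for (uint32_t p = P_MIN; p <= P_MAX; p += P_STEP) {
  double time = bench(p);
  if (dj.push(time)) { // returns true once a discontinuity in the timings appears
    estimate = p - P_STEP;
    break;
  }
}
```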
diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h
new file mode 100644
index 0000000000..8cb0da49ca
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/include/buffers.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include "app.h"
+#include "stats.h"
+#include "utils.h"
+
+using namespace vkapi;
+
+namespace gpuinfo {
+
+void buf_cacheline_size(const App& app) {
+  if (!app.enabled("buf_cacheline_size")) {
+    std::cout << "Skipped Buffer Cacheline Size" << std::endl;
+    return;
+  }
+
+  std::cout << std::endl;
+  std::cout << "------ Buffer Cacheline Size ------" << std::endl;
+
+  const double COMPENSATE = app.get_config("buf_cacheline_size", "compensate");
+  const double THRESHOLD = app.get_config("buf_cacheline_size", "threshold");
+
+  const uint32_t PITCH = app.buf_cache_size / app.nthread_logic;
+  const uint32_t BUF_SIZE = app.buf_cache_size;
+  const uint32_t MAX_STRIDE = PITCH;
+
+  uint32_t NITER;
+
+  auto bench = [&](int stride) {
+    StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE);
+    StorageBuffer out_buf(context(), vkapi::kFloat, 1);
+    vkapi::PipelineBarrier pipeline_barrier{};
+
+    auto shader_name = "buf_cacheline_size";
+
+    auto time = benchmark_on_gpu(shader_name, 100, [&]() {
+      context()->submit_compute_job(
+          VK_KERNEL_FROM_STR(shader_name),
+          pipeline_barrier,
+          {app.nthread_logic, 1, 1},
+          {app.nthread_logic, 1, 1},
+          {SV(NITER), SV(stride), SV(PITCH)},
+          VK_NULL_HANDLE,
+          0,
+          in_buf.buffer(),
+          out_buf.buffer());
+    });
+    return time;
+  };
+
+  ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+  uint32_t cacheline_size;
+
+  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+  uint32_t stride = 1;
+  for (; stride <= MAX_STRIDE; ++stride) {
+    double time = bench(stride);
+    std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time
+              << std::endl;
+
+    if (dj.push(time)) {
+      cacheline_size = stride * sizeof(float);
+      break;
+    }
+  }
+  if (stride >= MAX_STRIDE) {
+    std::cout << "Unable to conclude a top level buffer cacheline size."
+              << std::endl;
+    cacheline_size = MAX_STRIDE * sizeof(float);
+  }
+
+  std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
+}
+
+void _bandwidth(
+    const App& app,
+    const std::string memtype,
+    const uint32_t range) {
+  auto memtype_lower = memtype;
+  std::transform(
+      memtype_lower.begin(),
+      memtype_lower.end(),
+      memtype_lower.begin(),
+      [](unsigned char c) { return std::tolower(c); });
+
+  auto test_name = memtype_lower + "_bandwidth";
+
+  // Cache lines flushed
+  const uint32_t NFLUSH = app.get_config(test_name, "nflush");
+  // Number of loop unrolls. Changing this value requires an equal change in
+  // buf_bandwidth.yaml
+  const uint32_t NUNROLL = app.get_config(test_name, "nunroll");
+  // Number of iterations. Increasing this value reduces noise in exchange for
+  // higher latency.
+  const uint32_t NITER = app.get_config(test_name, "niter");
+  // Vector dimensions (vec4)
+  const uint32_t VEC_WIDTH = 4;
+  const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
+  // Number of vectors that fit in the selected memory space
+  const uint32_t NVEC = range / VEC_SIZE;
+  // Number of memory reads per thread
+  const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
+  // Number of threads needed to read all vectors. For shared memory, the
+  // thread count is not divided by the per-thread workload because of the
+  // limited memory size.
+  const uint32_t NTHREAD = memtype == "Shared" ?
NVEC : NVEC / NREAD_PER_THREAD; + // Occupy all threads + const uint32_t local_x = app.nthread_logic; + // Ensure that global is a multiple of local, and distribute across all SMs + const uint32_t global_x = + (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH; + + auto bench = [&](uint32_t access_size) { + // Number of vectors that fit in this iteration + const uint32_t nvec_access = access_size / VEC_SIZE; + + StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); + StorageBuffer out_buf( + context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "buf_bandwidth_" + memtype_lower; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {global_x, 1, 1}, + {local_x, 1, 1}, + {SV(NITER), SV(nvec_access), SV(local_x)}, + VK_NULL_HANDLE, + 0, + in_buf.buffer(), + out_buf.buffer()); + }); + + const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; + auto gbps = SIZE_TRANS * 1e-3 / time; + std::cout << memtype << " bandwidth accessing \t" << access_size + << "\tB unique data is \t" << gbps << " \tgbps (\t" << time + << "\tus)" << std::endl; + return gbps; + }; + + double max_bandwidth = 0; + double min_bandwidth = DBL_MAX; + for (uint32_t access_size = VEC_SIZE; access_size < range; access_size *= 2) { + double gbps = bench(access_size); + max_bandwidth = std::max(gbps, max_bandwidth); + min_bandwidth = std::min(gbps, min_bandwidth); + } + + std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth + << std::endl; + std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth + << std::endl; +} + +void buf_bandwidth(const App& app) { + if (!app.enabled("buffer_bandwidth")) { + std::cout << "Skipped Memory Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ Memory Bandwidth ------" << std::endl; + // Maximum memory space read - 128MB + // For regular devices, bandwidth plateaus at less memory than this, so more + // is not needed. + const uint32_t RANGE = app.get_config("buffer_bandwidth", "range"); + _bandwidth(app, "Buffer", RANGE); +} + +void ubo_bandwidth(const App& app) { + if (!app.enabled("ubo_bandwidth")) { + std::cout << "Skipped UBO Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ UBO Bandwidth ------" << std::endl; + const uint32_t RANGE = app.get_config("ubo_bandwidth", "range"); + _bandwidth(app, "UBO", RANGE); +} + +void shared_mem_bandwidth(const App& app) { + if (!app.enabled("shared_bandwidth")) { + std::cout << "Skipped Shared Memory Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ Shared Bandwidth ------" << std::endl; + const uint32_t RANGE = app.max_shared_mem_size; + _bandwidth(app, "Shared", RANGE); +} +} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h new file mode 100644 index 0000000000..bb8a3371a9 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -0,0 +1,207 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include "app.h" +#include "stats.h" +#include "utils.h" + +namespace gpuinfo { + +// Textures are drastically different from buffers in terms of data layout. 
+// While buffers are a contiguous range of memory, textures are opaque objects
+// defined by the vendor and it is possible that nearby points of data are not
+// neighboring in memory. Likewise, data points are accessed in
+// multi-dimensional patches instead of simple lines. This makes the stride
+// method for figuring out the cache line size not applicable. To go around
+// this, this experiment runs an increasing number of threads accessing
+// different datapoints in the texture and measures latency. If the cache line
+// is big enough to contain all requested data for the number of threads,
+// latency will be low. When there are more threads and hence more data than
+// what a single cache line can handle, a second line must be fetched,
+// increasing latency in a measurable way.
+void tex_cacheline_concurr(const App& app) {
+  if (!app.enabled("tex_cacheline_concurr")) {
+    std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl;
+    return;
+  }
+
+  const uint32_t TEXEL_WIDTH = 4;
+  const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH;
+
+  const double COMPENSATE =
+      app.get_config("tex_cacheline_concurr", "compensate");
+  const double THRESHOLD = app.get_config("tex_cacheline_concurr", "threshold");
+
+  for (int dim = 0; dim < 3; ++dim) {
+    std::cout << std::endl;
+    std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim
+              << ") ------" << std::endl;
+
+    uint32_t NITER;
+
+    const uint32_t IMG_OTHER_EDGE = dim == 0 ? app.max_tex_width
+        : dim == 1 ? app.max_tex_height
+                   : app.max_tex_depth;
+
+    const uint32_t MAX_NTHREAD = std::min(app.nthread_logic, IMG_OTHER_EDGE);
+
+    auto bench = [&](uint32_t nthread) {
+      std::vector<int64_t> sizes_whd = {
+          app.max_tex_width, app.max_tex_height, app.max_tex_depth};
+
+      auto sizes_nchw = whd_to_nchw(sizes_whd);
+
+      vTensor in_tensor =
+          api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
+
+      StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH);
+
+      vkapi::PipelineBarrier pipeline_barrier{};
+
+      auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim);
+
+      auto time = benchmark_on_gpu(shader_name, 100, [&]() {
+        context()->submit_compute_job(
+            VK_KERNEL_FROM_STR(shader_name),
+            pipeline_barrier,
+            {nthread, 1, 1},
+            {nthread, 1, 1},
+            {SV(NITER)},
+            VK_NULL_HANDLE,
+            0,
+            in_tensor.image(),
+            out_buf.buffer());
+      });
+      return time;
+    };
+
+    ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+    uint32_t nthread = 1;
+    for (; nthread <= MAX_NTHREAD; ++nthread) {
+      double time = bench(nthread);
+      std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time
+                << std::endl;
+
+      if (dj.push(time)) {
+        auto max_concurrency = nthread - 1;
+        std::cout << "TextureCachelineConcurrencyDim" << dim << " (B),"
+                  << max_concurrency * TEXEL_SIZE << std::endl;
+        break;
+      }
+    }
+    if (nthread >= MAX_NTHREAD) {
+      std::cout
+          << "Unable to conclude an optimal texture cacheline concurrency for dim "
+          << dim << std::endl;
+    };
+  }
+
+  // TODO: Use concurrency information to obtain the cache line size for
+  // textures as done in https://fburl.com/98xiou3g
+}
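To make the CSV output above concrete, a worked reading with illustrative numbers: if for `dim = 0` the latency jump is detected at `nthread = 17`, then:

```cpp
// max_concurrency = nthread - 1 = 16 threads served by one cache line, and
// with TEXEL_SIZE = 16 B the tool prints:
//   TextureCachelineConcurrencyDim0 (B),256
uint32_t max_concurrency = 17 - 1;        // 16
uint32_t reported = max_concurrency * 16; // 256 B
```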
+
+void tex_bandwidth(const App& app) {
+  if (!app.enabled("tex_bandwidth")) {
+    std::cout << "Skipped Texture Bandwidth" << std::endl;
+    return;
+  }
+
+  for (int dim = 0; dim < 3; dim++) {
+    std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------"
+              << std::endl;
+    const uint32_t MAX_SIZE = dim == 0 ? app.max_tex_width
+        : dim == 1 ? app.max_tex_height
+                   : app.max_tex_depth;
+
+    // rgba, float
+    const uint32_t VEC_WIDTH = 4;
+    const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
+    const uint32_t NVEC = MAX_SIZE;
+
+    const uint32_t RANGE = NVEC * VEC_SIZE;
+
+    // Cache lines flushed
+    const uint32_t NFLUSH = app.get_config("tex_bandwidth", "nflush");
+    // Number of loop unrolls. Changing this value requires an equal change in
+    // tex_bandwidth.yaml
+    const uint32_t NUNROLL = app.get_config("tex_bandwidth", "nunroll");
+    // Number of iterations. Increasing this value reduces noise in exchange
+    // for higher latency.
+    const uint32_t NITER = app.get_config("tex_bandwidth", "niter");
+    // Number of memory reads per thread
+    const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
+    // Number of threads needed to read all texels
+    const uint32_t NTHREAD = NVEC;
+    // Occupy all threads
+    const uint32_t local_x = app.nthread_logic;
+    // Ensure that global is a multiple of local, and distribute across all
+    // SMs
+    const uint32_t global_x =
+        (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH;
+
+    auto shader_name = "tex_bandwidth_" + std::to_string(dim);
+
+    std::vector<int64_t> sizes_whd = {MAX_SIZE, 1, 1};
+    if (dim == 1) {
+      sizes_whd = {1, MAX_SIZE, 1};
+    } else if (dim == 2) {
+      sizes_whd = {1, 1, MAX_SIZE};
+    }
+    auto sizes_nchw = whd_to_nchw(sizes_whd);
+
+    vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
+
+    auto bench = [&](uint32_t access_size, uint32_t dim) {
+      // Number of texels that fit in this iteration
+      const uint32_t ntexel_access = access_size / VEC_SIZE;
+
+      StorageBuffer out_buf(
+          context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
+      vkapi::PipelineBarrier pipeline_barrier{};
+
+      auto time = benchmark_on_gpu(shader_name, 10, [&]() {
+        context()->submit_compute_job(
+            VK_KERNEL_FROM_STR(shader_name),
+            pipeline_barrier,
+            {global_x, 1, 1},
+            {local_x, 1, 1},
+            {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)},
+            VK_NULL_HANDLE,
+            0,
+            in_tensor.image(),
+            out_buf.buffer());
+      });
+
+      const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
+      double gbps = SIZE_TRANS * 1e-3 / time;
+      std::cout << "Texture bandwidth accessing \t" << access_size
+                << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
+                << "\tus)" << std::endl;
+      return gbps;
+    };
+
+    double max_bandwidth = 0;
+    double min_bandwidth = DBL_MAX;
+    for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
+         access_size *= 2) {
+      double gbps = bench(access_size, dim);
+      max_bandwidth = std::max(gbps, max_bandwidth);
+      min_bandwidth = std::min(gbps, min_bandwidth);
+    }
+
+    std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth
+              << std::endl;
+    std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth
+              << std::endl;
+  }
+}
+} // namespace gpuinfo
diff --git a/backends/vulkan/tools/gpuinfo/include/utils.h b/backends/vulkan/tools/gpuinfo/include/utils.h
index 231fb32c5a..887cb443ef 100644
--- a/backends/vulkan/tools/gpuinfo/include/utils.h
+++ b/backends/vulkan/tools/gpuinfo/include/utils.h
@@ -54,6 +54,15 @@ void ensure_min_niter(
   }
 }
 
+std::vector<int64_t> whd_to_nchw(std::vector<int64_t> sizes) {
+  const int64_t W = sizes[0];
+  const int64_t H = sizes[1];
+  const int64_t D = sizes[2];
+
+  // Channels-packed: {W, H, D} = {W, H, (C / 4) * N}
+  return {1, D * 4, H, W};
+}
+
 cl_platform_id get_cl_platform_id() {
   cl_uint nplatform_id;
   clGetPlatformIDs(0, nullptr, &nplatform_id);
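A quick example of the `whd_to_nchw` helper added to utils.h above; the values follow directly from the channels-packed formula in its comment:

```cpp
// A {W=3, H=2, D=5} texture extent maps to NCHW {1, D * 4, H, W}, i.e.
// {1, 20, 2, 3}, since each texel packs 4 channels: D = (C / 4) * N.
std::vector<int64_t> nchw = whd_to_nchw({3, 2, 5}); // -> {1, 20, 2, 3}
```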
diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp
deleted file mode 100644
index 2b1621db62..0000000000
--- a/backends/vulkan/tools/gpuinfo/src/app.cpp
+++ /dev/null
@@ -1,790 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include
-#include
-#include
-#include
-#include
-
-#include "stats.h"
-#include "utils.h"
-
-using namespace vkapi;
-
-class App {
- private:
-  size_t buf_cache_size_;
-  uint32_t max_shared_mem_size_;
-  uint32_t sm_count_;
-  uint32_t nthread_logic_;
-  uint32_t subgroup_size_;
-  uint32_t max_tex_width_;
-  uint32_t max_tex_height_;
-  uint32_t max_tex_depth_;
-  folly::dynamic config_;
-
-  std::vector<int64_t> _whd_to_nchw(std::vector<int64_t> sizes) {
-    const int64_t W = sizes[0];
-    const int64_t H = sizes[1];
-    const int64_t D = sizes[2];
-
-    // Channels-packed: {W, H, D} = {W, H, (C / 4) * N}
-    return {1, D * 4, H, W};
-  }
-
-  float _get_config(const std::string& test, const std::string& key) {
-    if (config_[test].empty()) {
-      throw std::runtime_error("Missing config for " + test);
-    }
-
-    if (!config_[test][key].isNumber()) {
-      throw std::runtime_error(
-          "Config for " + test + "." + key + " is not a number");
-    }
-
-    float value;
-    if (config_[test][key].isDouble()) {
-      value = config_[test][key].getDouble();
-    } else {
-      value = config_[test][key].getInt();
-    }
-
-    std::cout << "Read value for " << test << "." << key << " = " << value
-              << std::endl;
-    return value;
-  }
-
-  bool _enabled(const std::string& test) {
-    if (config_.empty() || config_[test].empty() ||
-        !config_[test]["enabled"].isBool()) {
-      return true;
-    }
-    return config_[test]["enabled"].getBool();
-  }
-
- public:
-  App() {
-    context()->initialize_querypool();
-
-    std::cout << context()->adapter_ptr()->stringize() << std::endl
-              << std::endl;
-
-    auto cl_device = get_cl_device();
-
-    sm_count_ = cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
-    nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
-    buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
-    max_shared_mem_size_ = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
-    max_tex_width_ = cl_device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
-    max_tex_height_ = cl_device.getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
-    max_tex_depth_ = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_DEPTH>();
-
-    VkPhysicalDeviceSubgroupProperties subgroup_props{};
-    VkPhysicalDeviceProperties2 props2{};
-
-    props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
-    props2.pNext = &subgroup_props;
-    subgroup_props.sType =
-        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
-    vkGetPhysicalDeviceProperties2(
-        context()->adapter_ptr()->physical_handle(), &props2);
-    subgroup_size_ = subgroup_props.subgroupSize;
-
-    std::cout << std::endl;
-    std::cout << "SM count," << sm_count_ << std::endl;
-    std::cout << "Logic Thread Count," << nthread_logic_ << std::endl;
-    std::cout << "Cache Size," << buf_cache_size_ << std::endl;
-    std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl;
-    std::cout << "SubGroup Size," << subgroup_size_ << std::endl;
-    std::cout << "MaxTexWidth," << max_tex_width_ << std::endl;
-    std::cout << "MaxTexHeight," << max_tex_height_ << std::endl;
-    std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl;
-  }
-
-  void load_config(std::string file_path) {
-    std::ifstream file(file_path);
-    std::stringstream buffer;
-    buffer << file.rdbuf();
-    const std::string json_str = buffer.str();
-    if (json_str.empty()) {
-      throw std::runtime_error(
-          "Failed to read config file from " + file_path + ".");
-    }
-    config_ = folly::parseJson(json_str);
-  }
-
-  void reg_count() {
-    if (!_enabled("reg_count")) {
- std::cout << "Skipped Register Count" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Register Count ------" << std::endl; - const uint32_t NREG_MIN = 1; - const uint32_t NREG_MAX = 512; - const uint32_t NREG_STEP = 1; - - const double COMPENSATE = _get_config("reg_count", "compensate"); - const double THRESHOLD = _get_config("reg_count", "threshold"); - - const uint32_t NGRP_MIN = 1; - const uint32_t NGRP_MAX = 64; - const uint32_t NGRP_STEP = 1; - - uint32_t NITER; - - auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StorageBuffer buffer(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "reg_count_" + std::to_string(nreg); - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {1, ngrp, 1}, - {1, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - buffer.buffer()); - }); - return time; - }; - - std::cout << "Calculating NITER..." << std::endl; - ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); - std::cout << "NITER," << NITER << std::endl; - - uint32_t nreg_max; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nreg = NREG_MIN; - for (; nreg <= NREG_MAX; nreg += NREG_STEP) { - double time = bench(1, nreg); - std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time - << std::endl; - if (dj.push(time)) { - nreg -= NREG_STEP; - nreg_max = nreg; - break; - } - } - if (nreg >= NREG_MAX) { - std::cout << "Unable to conclude a maximal register count" << std::endl; - nreg_max = NREG_STEP; - } else { - std::cout << nreg_max << " registers are available at most" << std::endl; - } - - auto find_ngrp_by_nreg = [&](const uint32_t nreg) { - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { - auto time = bench(ngrp, nreg); - std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp - << ", time=" << time << " us" << std::endl; - - if (dj.push(time)) { - ngrp -= NGRP_STEP; - std::cout << "Using " << nreg << " registers can have " << ngrp - << " concurrent single-thread workgroups" << std::endl; - return ngrp; - } - } - std::cout - << "Unable to conclude a maximum number of concurrent single-thread workgroups when " - << nreg << " registers are occupied" << std::endl; - return (uint32_t)1; - }; - - uint32_t ngrp_full, ngrp_half; - ngrp_full = find_ngrp_by_nreg(nreg_max); - ngrp_half = find_ngrp_by_nreg(nreg_max / 2); - - std::string reg_ty; - - if (ngrp_full * 1.5 < ngrp_half) { - std::cout << "All physical threads in an sm share " << nreg_max - << " registers" << std::endl; - reg_ty = "Pooled"; - - } else { - std::cout << "Each physical thread has " << nreg_max << " registers" - << std::endl; - reg_ty = "Dedicated"; - } - - std::cout << std::endl << std::endl; - std::cout << "NITER," << NITER << std::endl; - std::cout << "Max registers," << nreg_max << std::endl; - std::cout << "Concurrent full single thread workgroups," << ngrp_full - << std::endl; - std::cout << "Concurrent half single thread workgroups," << ngrp_half - << std::endl; - std::cout << "Register type," << reg_ty << std::endl; - } - - void buf_cacheline_size() { - if (!_enabled("buf_cacheline_size")) { - std::cout << "Skipped Buffer Cacheline Size" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Buffer Cacheline Size ------" << std::endl; - - const double COMPENSATE = _get_config("buf_cacheline_size", "compensate"); - const double 
THRESHOLD = _get_config("buf_cacheline_size", "threshold"); - - const uint32_t PITCH = buf_cache_size_ / nthread_logic_; - const uint32_t BUF_SIZE = buf_cache_size_; - const uint32_t MAX_STRIDE = PITCH; - - uint32_t NITER; - - auto bench = [&](int stride) { - StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StorageBuffer out_buf(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_cacheline_size"; - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread_logic_, 1, 1}, - {nthread_logic_, 1, 1}, - {SV(NITER), SV(stride), SV(PITCH)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t cacheline_size; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t stride = 1; - for (; stride <= MAX_STRIDE; ++stride) { - double time = bench(stride); - std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - cacheline_size = stride * sizeof(float); - break; - } - } - if (stride >= MAX_STRIDE) { - std::cout << "Unable to conclude a top level buffer cacheline size." - << std::endl; - cacheline_size = MAX_STRIDE * sizeof(float); - } - - std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; - } - - // Textures are drastically different from buffers in terms of data layout. - // While buffers are a contiguous range of memory, textures are opaque objects - // defined by the vendor and it is possible that nearby points of data are not - // neighboring in memory. Likewise, data points are accessed in - // multi-dimensional patches instead of simple lines. This makes the stride - // method for figuring out the cache line size not applicable. To go around - // this, this experiment runs an increasing amount of threads accessing - // different datapoints in the texture and measures latency. If the cache line - // is big enough to contain all requested data for the amount of threads, - // latency will be low. When there are more threads and hence more data than - // what a single cache line can handle, a second line must be fetched, - // increasing latency in a measurable way. - void tex_cacheline_concurr() { - if (!_enabled("tex_cacheline_concurr")) { - std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl; - return; - } - - const uint32_t TEXEL_WIDTH = 4; - const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH; - - const double COMPENSATE = - _get_config("tex_cacheline_concurr", "compensate"); - const double THRESHOLD = _get_config("tex_cacheline_concurr", "threshold"); - - for (int dim = 0; dim < 3; ++dim) { - std::cout << std::endl; - std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim - << ") ------" << std::endl; - - uint32_t NITER; - - const uint32_t IMG_OTHER_EDGE = dim == 0 ? max_tex_width_ - : dim == 1 ? 
max_tex_height_
-                                       : max_tex_depth_;
-
-      const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_OTHER_EDGE);
-
-      auto bench = [&](uint32_t nthread) {
-        std::vector<int64_t> sizes_whd = {
-            max_tex_width_, max_tex_height_, max_tex_depth_};
-
-        auto sizes_nchw = _whd_to_nchw(sizes_whd);
-
-        vTensor in_tensor =
-            api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
-
-        StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH);
-
-        vkapi::PipelineBarrier pipeline_barrier{};
-
-        auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim);
-
-        auto time = benchmark_on_gpu(shader_name, 100, [&]() {
-          context()->submit_compute_job(
-              VK_KERNEL_FROM_STR(shader_name),
-              pipeline_barrier,
-              {nthread, 1, 1},
-              {nthread, 1, 1},
-              {SV(NITER)},
-              VK_NULL_HANDLE,
-              0,
-              in_tensor.image(),
-              out_buf.buffer());
-        });
-        return time;
-      };
-
-      ensure_min_niter(1000, NITER, [&]() { return bench(1); });
-
-      DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
-      uint32_t nthread = 1;
-      for (; nthread <= MAX_NTHREAD; ++nthread) {
-        double time = bench(nthread);
-        std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time
-                  << std::endl;
-
-        if (dj.push(time)) {
-          auto max_concurrency = nthread - 1;
-          std::cout << "TextureCachelineConcurrencyDim" << dim << " (B),"
-                    << max_concurrency * TEXEL_SIZE << std::endl;
-          break;
-        }
-      }
-      if (nthread >= MAX_NTHREAD) {
-        std::cout
-            << "Unable to conclude an optimal texture cacheline concurrency for dim "
-            << dim << std::endl;
-      };
-    }
-
-    // TODO: Use concurrency information to obtain the cache line size for
-    // textures as done in https://fburl.com/98xiou3g
-  }
-
- private:
-  void _bandwidth(std::string memtype, uint32_t range) {
-    auto memtype_lower = memtype;
-    std::transform(
-        memtype_lower.begin(),
-        memtype_lower.end(),
-        memtype_lower.begin(),
-        [](unsigned char c) { return std::tolower(c); });
-
-    auto test_name = memtype_lower + "_bandwidth";
-
-    // Cache lines flushed
-    const uint32_t NFLUSH = _get_config(test_name, "nflush");
-    // Number of loop unrolls. Changing this value requires an equal change in
-    // buf_bandwidth.yaml
-    const uint32_t NUNROLL = _get_config(test_name, "nunroll");
-    // Number of iterations. Increasing this value reduces noise in exchange for
-    // higher latency.
-    const uint32_t NITER = _get_config(test_name, "niter");
-    // Vector dimensions (vec4)
-    const uint32_t VEC_WIDTH = 4;
-    const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
-    // Number of vectors that fit in the selected memory space
-    const uint32_t NVEC = range / VEC_SIZE;
-    // Number of memory reads per thread
-    const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
-    // Number of threads needed to read all vectors. The thread count is not
-    // divided by the per-thread workload for shared memory because of the
-    // limited memory size.
-    const uint32_t NTHREAD =
-        memtype == "Shared" ?
NVEC : NVEC / NREAD_PER_THREAD; - // Occupy all threads - const uint32_t local_x = nthread_logic_; - // Ensure that global is a multiple of local, and distribute across all SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH; - - auto bench = [&](uint32_t access_size) { - // Number of vectors that fit in this iteration - const uint32_t nvec_access = access_size / VEC_SIZE; - - StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StorageBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_bandwidth_" + memtype_lower; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), SV(nvec_access), SV(local_x)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - auto gbps = SIZE_TRANS * 1e-3 / time; - std::cout << memtype << " bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < range; - access_size *= 2) { - double gbps = bench(access_size); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth - << std::endl; - std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth - << std::endl; - } - - public: - void buf_bandwidth() { - if (!_enabled("buffer_bandwidth")) { - std::cout << "Skipped Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Memory Bandwidth ------" << std::endl; - // Maximum memory space read - 128MB - // For regular devices, bandwidth plateaus at less memory than this, so more - // is not needed. - const uint32_t RANGE = _get_config("buffer_bandwidth", "range"); - _bandwidth("Buffer", RANGE); - } - - void ubo_bandwidth() { - if (!_enabled("ubo_bandwidth")) { - std::cout << "Skipped UBO Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ UBO Bandwidth ------" << std::endl; - const uint32_t RANGE = _get_config("ubo_bandwidth", "range"); - _bandwidth("UBO", RANGE); - } - - void shared_mem_bandwidth() { - if (!_enabled("shared_mem_bandwidth")) { - std::cout << "Skipped Shared Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Shared Bandwidth ------" << std::endl; - const uint32_t RANGE = max_shared_mem_size_; - _bandwidth("Shared", RANGE); - } - - void tex_bandwidth() { - if (!_enabled("tex_bandwidth")) { - std::cout << "Skipped Texture Bandwidth" << std::endl; - return; - } - - for (int dim = 0; dim < 3; dim++) { - std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" - << std::endl; - const uint32_t MAX_SIZE = dim == 0 ? max_tex_width_ - : dim == 1 ? max_tex_height_ - : max_tex_depth_; - - // rgba, float - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - const uint32_t NVEC = MAX_SIZE; - - const uint32_t RANGE = NVEC * VEC_SIZE; - - // Cache lines flushed - const uint32_t NFLUSH = _get_config("tex_bandwidth", "nflush"); - // Number of loop unrolls. 
Changing this value requires an equal change in
-      // tex_bandwidth.yaml
-      const uint32_t NUNROLL = _get_config("tex_bandwidth", "nunroll");
-      // Number of iterations. Increasing this value reduces noise in exchange
-      // for higher latency.
-      const uint32_t NITER = _get_config("tex_bandwidth", "niter");
-      // Number of memory reads per thread
-      const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
-      // Number of threads needed to read all texels
-      const uint32_t NTHREAD = NVEC;
-      // Occupy all threads
-      const uint32_t local_x = nthread_logic_;
-      // Ensure that global is a multiple of local, and distribute across all
-      // SMs
-      const uint32_t global_x =
-          (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH;
-
-      auto shader_name = "tex_bandwidth_" + std::to_string(dim);
-
-      std::vector<int64_t> sizes_whd = {MAX_SIZE, 1, 1};
-      if (dim == 1) {
-        sizes_whd = {1, MAX_SIZE, 1};
-      } else if (dim == 2) {
-        sizes_whd = {1, 1, MAX_SIZE};
-      }
-      auto sizes_nchw = _whd_to_nchw(sizes_whd);
-
-      vTensor in_tensor =
-          api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
-
-      auto bench = [&](uint32_t access_size, uint32_t dim) {
-        // Number of texels that fit in this iteration
-        const uint32_t ntexel_access = access_size / VEC_SIZE;
-
-        StorageBuffer out_buf(
-            context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_);
-        vkapi::PipelineBarrier pipeline_barrier{};
-
-        auto time = benchmark_on_gpu(shader_name, 10, [&]() {
-          context()->submit_compute_job(
-              VK_KERNEL_FROM_STR(shader_name),
-              pipeline_barrier,
-              {global_x, 1, 1},
-              {local_x, 1, 1},
-              {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)},
-              VK_NULL_HANDLE,
-              0,
-              in_tensor.image(),
-              out_buf.buffer());
-        });
-
-        const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
-        double gbps = SIZE_TRANS * 1e-3 / time;
-        std::cout << "Texture bandwidth accessing \t" << access_size
-                  << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
-                  << "\tus)" << std::endl;
-        return gbps;
-      };
-
-      double max_bandwidth = 0;
-      double min_bandwidth = DBL_MAX;
-      for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
-           access_size *= 2) {
-        double gbps = bench(access_size, dim);
-        max_bandwidth = std::max(gbps, max_bandwidth);
-        min_bandwidth = std::min(gbps, min_bandwidth);
-      }
-
-      std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth
-                << std::endl;
-      std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth
-                << std::endl;
-    }
-  }
-
-  // Warp size is a difficult metric to obtain because the hardware limitations
-  // do not always coincide with the way the SM divides the workload. For
-  // instance, the hardware can have a warp size of 64 threads, but an SM might
-  // be able to simulate concurrency of 128 threads with a single scheduler.
-
-  // Because of this, it is important to measure the warp size in different
-  // ways that can evidence both the physical limitations of the hardware and
-  // the actual behavior of the driver.
-
-  // Additionally, the SM can behave in two different ways when the assigned
-  // workload is smaller than the warp size.
-
-  // In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty
-  // threads and maintain a uniform workload.
-
-  // In Case 2, like in Adreno, the driver might decide to pack multiple works
-  // together and dispatch them at once.
- void warp_size(bool verbose = false) { - if (!_enabled("warp_size")) { - std::cout << "Skipped Warp Size" << std::endl; - return; - } - - std::cout << "\n------ Warp Size ------" << std::endl; - - // Method A: Stress test with a kernel that uses complex ALU operations like - // integer division to avoid latency hiding. Increase the number of threads - // until a jump in latency is detected. - - // This timing-based method helps us identify physical warp sizes. It also - // helps with Case 2, when threads of multiple warps are managed by the same - // scheduler at the same time. - const double COMPENSATE = _get_config("warp_size", "compensate"); - const double THRESHOLD = _get_config("warp_size", "threshold"); - - uint32_t NITER; - - auto bench = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_physical"; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - // Large number of work groups selected to potentially saturate all - // ALUs and thus have a better baseline for comparison. - {nthread, 1024, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t warp_size = subgroup_size_; - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - - // We increase the number of threads until we hit a jump in the data. - uint32_t nthread = 1; - for (; nthread <= nthread_logic_; ++nthread) { - double time = bench(nthread); - std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)" - << std::endl; - if (dj.push(time)) { - warp_size = nthread - 1; - break; - } - } - if (nthread >= nthread_logic_) { - std::cout - << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size" - << std::endl; - } - - // Method B: Let all the threads in a warp race and atomically fetch-add - // a counter, then store the counter values to the output buffer in the - // scheduling order of these threads. If all the order numbers follow an - // ascending order, then the threads are likely executing within a warp. - // Threads in different warps are not managed by the same scheduler, so they - // would race for a same ID out of order, unaware of each other. - - // This method evidences the actual driver behavior when running - // concurrency, regardless of the physical limitations of the hardware. - - // Likewise, this method helps us identify warp sizes when the SM - // sub-divides its ALUs into independent groups, like the three execution - // engines in a Mali G76 core. It helps warp-probing in Case 1 because it - // doesn't depend on kernel timing, so the extra wait time doesn't lead to - // inaccuracy. 
-    auto bench_sm = [&](uint32_t nthread) {
-      StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_);
-      vkapi::PipelineBarrier pipeline_barrier{};
-
-      auto shader_name = "warp_size_scheduler";
-
-      benchmark_on_gpu(shader_name, 1, [&]() {
-        context()->submit_compute_job(
-            VK_KERNEL_FROM_STR(shader_name),
-            pipeline_barrier,
-            {nthread, 1, 1},
-            {nthread, 1, 1},
-            {},
-            VK_NULL_HANDLE,
-            0,
-            out_buf.buffer());
-      });
-
-      std::vector<int32_t> data(nthread_logic_);
-      copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes());
-
-      if (verbose) {
-        std::stringstream ss;
-        for (auto j = 0; j < nthread; ++j) {
-          ss << data[j] << " ";
-        }
-        std::cout << ss.str() << std::endl;
-      }
-
-      // Check up to which point the data is in ascending order.
-      int32_t last = -1;
-      int32_t j = 0;
-      for (; j < nthread; ++j) {
-        if (last >= data[j]) {
-          break;
-        }
-        last = data[j];
-      }
-
-      return j;
-    };
-
-    // Test increasing sizes until the data is no longer in ascending order.
-    uint32_t warp_size_scheduler = warp_size;
-    int i = 1;
-    for (; i <= nthread_logic_; ++i) {
-      uint32_t nascend = bench_sm(i);
-      if (nascend != i) {
-        warp_size_scheduler = nascend;
-        break;
-      }
-    }
-    if (i > nthread_logic_) {
-      std::cout << "Unable to conclude an SM Warp Size." << std::endl;
-    }
-
-    std::cout << "PhysicalWarpSize," << warp_size << std::endl;
-    std::cout << "SMWarpSize," << warp_size_scheduler << std::endl;
-  }
-};
-
-int main(int argc, const char** argv) {
-  App app;
-
-  std::string file_path = "config.json";
-  if (argc > 1) {
-    file_path = argv[1];
-  };
-  app.load_config(file_path);
-
-  app.reg_count();
-  app.buf_cacheline_size();
-  app.buf_bandwidth();
-  app.ubo_bandwidth();
-  app.shared_mem_bandwidth();
-  app.warp_size();
-  app.tex_bandwidth();
-  app.tex_cacheline_concurr();
-
-  return 0;
-}
diff --git a/backends/vulkan/tools/gpuinfo/src/main.cpp b/backends/vulkan/tools/gpuinfo/src/main.cpp
new file mode 100644
index 0000000000..f0e29aaf1a
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/src/main.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +#include "app.h" +#include "architecture.h" +#include "buffers.h" +#include "textures.h" + +using namespace vkapi; + +int main(int argc, const char** argv) { + gpuinfo::App app; + + std::string file_path = "config.json"; + if (argc > 1) { + file_path = argv[1]; + }; + app.load_config(file_path); + + // Architecture + gpuinfo::reg_count(app); + gpuinfo::warp_size(app); + + // Buffers + gpuinfo::buf_cacheline_size(app); + gpuinfo::buf_bandwidth(app); + gpuinfo::ubo_bandwidth(app); + gpuinfo::shared_mem_bandwidth(app); + + // Textures + gpuinfo::tex_bandwidth(app); + gpuinfo::tex_cacheline_concurr(app); + + return 0; +} From 1727aa18c205da4d829c425bdece0b61b6179a6a Mon Sep 17 00:00:00 2001 From: Chen Lai Date: Tue, 30 Jul 2024 14:35:50 -0700 Subject: [PATCH 29/75] fix eval llama (#4469) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4469 Previously the refactor moves files from `examples/...` to `extensions/...`, however llama eval was not covered by CI, fix it here before: ``` (executorch) chenlai@chenlai-mbp executorch % python -m examples.models.llama2.eval_llama -c /Users/chenlai/Documents/stories110M/stories110M/stories110M.pt -p /Users/chenlai/Documents/stories110M/stories110M/params.json -t /Users/chenlai/Documents/stories110M/stories110M/tokenizer.model -d fp32 --max_seq_len 127 --limit 5 /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable. warn("The installed version of bitsandbytes was compiled without GPU support. " 'NoneType' object has no attribute 'cadam32bit_grad_fp32' /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:106: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. impl_abstract("quantized_decomposed::embedding_byte.out") /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:153: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. impl_abstract("quantized_decomposed::embedding_byte.dtype_out") /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:228: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. impl_abstract("quantized_decomposed::embedding_4bit.out") /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:281: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. 
impl_abstract("quantized_decomposed::embedding_4bit.dtype_out") Traceback (most recent call last): File "/opt/homebrew/anaconda3/envs/executorch/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/opt/homebrew/anaconda3/envs/executorch/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/Users/chenlai/executorch/examples/models/llama2/eval_llama.py", line 13, in from .eval_llama_lib import build_args_parser, eval_llama File "/Users/chenlai/executorch/examples/models/llama2/eval_llama_lib.py", line 19, in from executorch.extension.llm.export import LLMEdgeManager ImportError: cannot import name 'LLMEdgeManager' from 'executorch.extension.llm.export' (/opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/extension/llm/export/__init__.py) (executorch) chenlai@chenlai-mbp executorch % (executorch) chenlai@chenlai-mbp executorch % ``` after ``` (executorch) chenlai@chenlai-mbp executorch % python -m examples.models.llama2.eval_llama -c /Users/chenlai/Documents/stories110M/stories110M/stories110M.pt -p /Users/chenlai/Documents/stories110M/stories110M/params.json -t /Users/chenlai/Documents/stories110M/stories110M/tokenizer.model -d fp32 --max_seq_len 127 --limit 5 /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable. warn("The installed version of bitsandbytes was compiled without GPU support. " 'NoneType' object has no attribute 'cadam32bit_grad_fp32' /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:106: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. impl_abstract("quantized_decomposed::embedding_byte.out") /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:153: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. impl_abstract("quantized_decomposed::embedding_byte.dtype_out") /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:228: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. impl_abstract("quantized_decomposed::embedding_4bit.out") /opt/homebrew/anaconda3/envs/executorch/lib/python3.10/site-packages/executorch/exir/passes/_quant_patterns_and_replacements.py:281: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. 
impl_abstract("quantized_decomposed::embedding_4bit.dtype_out") 2024-07-30:12:36:04,260 INFO [tokenizer.py:33] #words: 32000 - BOS ID: 1 - EOS ID: 2 2024-07-30:12:36:04,260 INFO [export_llama_lib.py:419] Applying quantizers: [] 2024-07-30:12:36:04,260 INFO [export_llama_lib.py:594] Loading model with checkpoint=/Users/chenlai/Documents/stories110M/stories110M/stories110M.pt, params=/Users/chenlai/Documents/stories110M/stories110M/params.json, use_kv_cache=False, weight_type=WeightType.LLAMA /Users/chenlai/executorch/examples/models/llama2/model.py:99: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True) 2024-07-30:12:36:04,315 INFO [export_llama_lib.py:616] Loaded model with dtype=torch.float32 2024-07-30:12:36:04,395 INFO [huggingface.py:162] Using device 'cpu' 2024-07-30:12:36:27,262 WARNING [task.py:763] [Task: wikitext] metric word_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity 2024-07-30:12:36:27,262 WARNING [task.py:775] [Task: wikitext] metric word_perplexity is defined, but higher_is_better is not. using default higher_is_better=False 2024-07-30:12:36:27,262 WARNING [task.py:763] [Task: wikitext] metric byte_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity 2024-07-30:12:36:27,262 WARNING [task.py:775] [Task: wikitext] metric byte_perplexity is defined, but higher_is_better is not. using default higher_is_better=False 2024-07-30:12:36:27,262 WARNING [task.py:763] [Task: wikitext] metric bits_per_byte is defined, but aggregation is not. using default aggregation=bits_per_byte 2024-07-30:12:36:27,262 WARNING [task.py:775] [Task: wikitext] metric bits_per_byte is defined, but higher_is_better is not. using default higher_is_better=False Repo card metadata block was not found. Setting CardData to empty. 2024-07-30:12:36:29,494 WARNING [repocard.py:107] Repo card metadata block was not found. Setting CardData to empty. 2024-07-30:12:36:30,401 INFO [task.py:395] Building contexts for wikitext on rank 0... 
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 718.57it/s] 2024-07-30:12:36:30,410 INFO [evaluator.py:362] Running loglikelihood_rolling requests 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:14<00:00, 2.91s/it] wikitext: {'word_perplexity,none': 10885.215324239069, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 6.144013518032613, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 2.6191813902741017, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'} ``` ghstack-source-id: 235865354 exported-using-ghexport Reviewed By: larryliu0820 Differential Revision: D60466386 fbshipit-source-id: 0032af8b3269f107469fe142382dfacb06751808 --- extension/llm/export/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/extension/llm/export/__init__.py b/extension/llm/export/__init__.py index e69de29bb2..7b17c223c3 100644 --- a/extension/llm/export/__init__.py +++ b/extension/llm/export/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from .builder import LLMEdgeManager + +__all__ = [ + "LLMEdgeManager", +] From 1ec3444707649dc78e2ce49805e39283a4a4bb1a Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 30 Jul 2024 14:49:57 -0700 Subject: [PATCH 30/75] Migrate sampler to extension/llm (#4460) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4460 Move sampler code to extension/llm so that it can be reused by llava runner. 
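For callers, only the include path changes. A rough sketch of downstream usage (the constructor and `sample` arguments below are illustrative placeholders, not an exact quote of the header):

```cpp
#include <executorch/extension/llm/sampler/sampler.h>
// was: #include <executorch/examples/models/llama2/sampler/sampler.h>

// Hypothetical call site: pick the next token from a float logits array of
// length vocab_size. All numeric values here are placeholders.
torch::executor::Sampler sampler(
    /*vocab_size=*/32000,
    /*temperature=*/0.8f,
    /*topp=*/0.9f,
    /*rng_seed=*/42);
int32_t next_token = sampler.sample(logits); // logits: float* of size vocab_size
```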
Reviewed By: helunwencser

Differential Revision: D60458803

fbshipit-source-id: ef8c4d7d3fed4f0777e5ba9cd403da8320efef5a
---
 examples/models/llama2/runner/runner.h | 2 +-
 examples/models/llama2/runner/targets.bzl | 2 +-
 {examples/models/llama2 => extension/llm}/sampler/TARGETS | 0
 {examples/models/llama2 => extension/llm}/sampler/sampler.cpp | 2 +-
 {examples/models/llama2 => extension/llm}/sampler/sampler.h | 0
 {examples/models/llama2 => extension/llm}/sampler/targets.bzl | 0
 {examples/models/llama2 => extension/llm}/sampler/test/TARGETS | 0
 .../models/llama2 => extension/llm}/sampler/test/targets.bzl | 0
 .../llama2 => extension/llm}/sampler/test/test_sampler.cpp | 0
 9 files changed, 3 insertions(+), 3 deletions(-)
 rename {examples/models/llama2 => extension/llm}/sampler/TARGETS (100%)
 rename {examples/models/llama2 => extension/llm}/sampler/sampler.cpp (99%)
 rename {examples/models/llama2 => extension/llm}/sampler/sampler.h (100%)
 rename {examples/models/llama2 => extension/llm}/sampler/targets.bzl (100%)
 rename {examples/models/llama2 => extension/llm}/sampler/test/TARGETS (100%)
 rename {examples/models/llama2 => extension/llm}/sampler/test/targets.bzl (100%)
 rename {examples/models/llama2 => extension/llm}/sampler/test/test_sampler.cpp (100%)

diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h
index c269a8c585..7b9d2763fc 100644
--- a/examples/models/llama2/runner/runner.h
+++ b/examples/models/llama2/runner/runner.h
@@ -17,7 +17,7 @@
 #include
 #include
-#include <executorch/examples/models/llama2/sampler/sampler.h>
+#include <executorch/extension/llm/sampler/sampler.h>
 #include
 #include
 #include
diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama2/runner/targets.bzl
index 26659303e0..d525628174 100644
--- a/examples/models/llama2/runner/targets.bzl
+++ b/examples/models/llama2/runner/targets.bzl
@@ -33,7 +33,7 @@ def define_common_targets():
         ],
         exported_deps = [
             "//executorch/backends/xnnpack:xnnpack_backend",
-            "//executorch/examples/models/llama2/sampler:sampler" + aten_suffix,
+            "//executorch/extension/llm/sampler:sampler" + aten_suffix,
             "//executorch/extension/evalue_util:print_evalue" + aten_suffix,
             "//executorch/extension/runner_util:managed_tensor" + aten_suffix,
             "//executorch/extension/module:module" + aten_suffix,
diff --git a/examples/models/llama2/sampler/TARGETS b/extension/llm/sampler/TARGETS
similarity index 100%
rename from examples/models/llama2/sampler/TARGETS
rename to extension/llm/sampler/TARGETS
diff --git a/examples/models/llama2/sampler/sampler.cpp b/extension/llm/sampler/sampler.cpp
similarity index 99%
rename from examples/models/llama2/sampler/sampler.cpp
rename to extension/llm/sampler/sampler.cpp
index 1ae4d2f9d7..be3307b715 100644
--- a/examples/models/llama2/sampler/sampler.cpp
+++ b/extension/llm/sampler/sampler.cpp
@@ -32,7 +32,7 @@
 * SOFTWARE.
 */

-#include <executorch/examples/models/llama2/sampler/sampler.h>
+#include <executorch/extension/llm/sampler/sampler.h>

 namespace torch {
 namespace executor {
diff --git a/examples/models/llama2/sampler/sampler.h b/extension/llm/sampler/sampler.h
similarity index 100%
rename from examples/models/llama2/sampler/sampler.h
rename to extension/llm/sampler/sampler.h
diff --git a/examples/models/llama2/sampler/targets.bzl b/extension/llm/sampler/targets.bzl
similarity index 100%
rename from examples/models/llama2/sampler/targets.bzl
rename to extension/llm/sampler/targets.bzl
diff --git a/examples/models/llama2/sampler/test/TARGETS b/extension/llm/sampler/test/TARGETS
similarity index 100%
rename from examples/models/llama2/sampler/test/TARGETS
rename to extension/llm/sampler/test/TARGETS
diff --git a/examples/models/llama2/sampler/test/targets.bzl b/extension/llm/sampler/test/targets.bzl
similarity index 100%
rename from examples/models/llama2/sampler/test/targets.bzl
rename to extension/llm/sampler/test/targets.bzl
diff --git a/examples/models/llama2/sampler/test/test_sampler.cpp b/extension/llm/sampler/test/test_sampler.cpp
similarity index 100%
rename from examples/models/llama2/sampler/test/test_sampler.cpp
rename to extension/llm/sampler/test/test_sampler.cpp

From 69f3f1c7dc3f60df78c5a86c035bb0b26fa654f1 Mon Sep 17 00:00:00 2001
From: Gyanendra Sinha
Date: Tue, 30 Jul 2024 15:26:18 -0700
Subject: [PATCH 31/75] Fix prewarming (#4454)

Summary:
Prewarms the model if `config.should_prewarm_model` is `true`. This improves the latency of the first inference call, as the necessary objects are created when the model is prewarmed.

Testing:
Existing tests

Pull Request resolved: https://github.com/pytorch/executorch/pull/4454

Reviewed By: kirklandsign

Differential Revision: D60469148

Pulled By: cccclai

fbshipit-source-id: d88883e721269d03298265dd420f08cbbe4787ce
---
 .../coreml/runtime/delegate/ETCoreMLModelManager.mm |  2 +-
 .../coreml/runtime/delegate/backend_delegate.mm     | 13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm
index e7846256e6..927df0483f 100644
--- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm
+++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm
@@ -655,7 +655,7 @@ - (void)prewarmRecentlyUsedAssetsWithMaxCount:(NSUInteger)maxCount {
         NSError *prewarmError = nil;
         if (![asset prewarmAndReturnError:&prewarmError]) {
-            ETCoreMLLogError(localError,
+            ETCoreMLLogError(prewarmError,
                              "%@: Failed to prewarm asset with identifier = %@",
                              NSStringFromClass(strongSelf.assetManager.class),
                              asset.identifier);
diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.mm b/backends/apple/coreml/runtime/delegate/backend_delegate.mm
index f6eb7a83fd..efa3dd2472 100644
--- a/backends/apple/coreml/runtime/delegate/backend_delegate.mm
+++ b/backends/apple/coreml/runtime/delegate/backend_delegate.mm
@@ -157,7 +157,7 @@ - (BOOL)_loadAndReturnError:(NSError * _Nullable __autoreleasing *)error {
     if (self.config.should_prewarm_asset) {
         [modelManager prewarmRecentlyUsedAssetsWithMaxCount:1];
     }
-
+
     return YES;
 }
@@ -188,9 +188,14 @@ - (ModelHandle*)loadModelFromAOTData:(NSData*)data
         return nil;
     }

-    return [self.impl loadModelFromAOTData:data
-                             configuration:configuration
-                                     error:error];
+    auto handle = [self.impl loadModelFromAOTData:data
+                                    configuration:configuration
+                                            error:error];
+    if ((handle != NULL) && self.config.should_prewarm_model) {
+        [self.impl prewarmModelWithHandle:handle error:nil];
+    }
+
+    return handle;
 }
 - (BOOL)executeModelWithHandle:(ModelHandle*)handle

From 9aeceeee3df8096ba7c89f422f584e26ace60733 Mon Sep 17 00:00:00 2001
From: Yujie Hui
Date: Tue, 30 Jul 2024 15:29:34 -0700
Subject: [PATCH 32/75] Implement grid_priors op (#4440)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4440

Modify the spec of the customized op `grid_priors` to take a tensor as input. Compared to the previous definition, the `height` and `width` arguments are now determined by the input tensor as `height, width = self.shape[-2:]`. The reason we change the spec is that the input should be a tensor if we want to support dynamic shapes.

Implement the customized op `grid_priors`. This op generates x, y points mapped from feature maps at different levels back to the original image.

Op spec:
```
(Tensor self, int stride, float offset) -> Tensor
```

Example:
```
input_tensor = torch.rand(size = [1, 5, 2, 3])
stride = 8
offset = 0.5
output.shape = [3x2, 2]
output = tensor([[ 4.,  4.],
        [12.,  4.],
        [20.,  4.],
        [ 4., 12.],
        [12., 12.],
        [20., 12.]])
```

Add a smoke test for now due to an issue lowering the customized op to the Vulkan backend. Will add a unit test and an nn.Module test once we are able to lower the customized op from PyTorch to the Vulkan backend.

bypass-github-export-checks
bypass-github-pytorch-ci-checks
bypass-github-executorch-ci-checks

Reviewed By: copyrightly

Differential Revision: D60203196

fbshipit-source-id: 93e5180e80e07cc0b9acb50890a1187ce0f82951
---
 backends/vulkan/passes/custom_ops_defs.py   |  8 +-
 backends/vulkan/passes/test_custom_ops.py   |  9 ++-
 .../runtime/graph/ops/glsl/grid_priors.glsl | 38 +++++++++
 .../runtime/graph/ops/glsl/grid_priors.yaml | 12 +++
 .../runtime/graph/ops/impl/GridPriors.cpp   | 79 +++++++++++++++++++
 .../vulkan/test/vulkan_compute_api_test.cpp | 72 +++++++++++++++++
 6 files changed, 210 insertions(+), 8 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp

diff --git a/backends/vulkan/passes/custom_ops_defs.py b/backends/vulkan/passes/custom_ops_defs.py
index 67e7db828a..c76f7ebf75 100644
--- a/backends/vulkan/passes/custom_ops_defs.py
+++ b/backends/vulkan/passes/custom_ops_defs.py
@@ -49,11 +49,11 @@ def conv_with_clamp_impl(


 def grid_priors_impl(
-    height,
-    width,
+    x,
     stride,
     offset,
 ):
+    height, width = x.shape[-2:]
     shift_x = (torch.arange(0, width) + offset) * stride
     shift_y = (torch.arange(0, height) + offset) * stride
     shift_xx, shift_yy = torch.meshgrid(shift_y, shift_x)
@@ -64,6 +64,6 @@ def grid_priors_impl(


 name = "grid_priors"
-lib.define(f"{name}(int height, int width, int stride, float offset) -> Tensor")
-lib.impl(name, grid_priors_impl)
+lib.define(f"{name}(Tensor self, int stride, float offset) -> Tensor")
+lib.impl(name, grid_priors_impl, "CompositeExplicitAutograd")
 grid_priors_op = getattr(getattr(torch.ops, namespace), name)
diff --git a/backends/vulkan/passes/test_custom_ops.py b/backends/vulkan/passes/test_custom_ops.py
index a1a3a40f67..c68dd6d679 100644
--- a/backends/vulkan/passes/test_custom_ops.py
+++ b/backends/vulkan/passes/test_custom_ops.py
@@ -97,14 +97,15 @@ class GridPriors(torch.nn.Module):
         def __init__(self):
             super().__init__()

-        def forward(self, height, width, stride, offset):
-            return torch.ops.et_vk.grid_priors(height, width, stride, offset)
+        def forward(self, x, stride, offset):
+            return torch.ops.et_vk.grid_priors(x, stride, offset)

     model = GridPriors()
-
sample_input = (2, 3, 4, 0.5) + sample_input = (torch.rand(2, 5, 2, 3), 4, 0.5) custom_out = model(*sample_input) - def calculate_expected_output(height, width, stride, offset): + def calculate_expected_output(x, stride, offset): + height, width = x.shape[-2:] shift_x = (torch.arange(0, width) + offset) * stride shift_y = (torch.arange(0, height) + offset) * stride shift_xx, shift_yy = torch.meshgrid(shift_y, shift_x) diff --git a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl new file mode 100644 index 0000000000..93a2c53e01 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl @@ -0,0 +1,38 @@ +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_ubo(1, "ivec4", "in_sizes")} +${layout_declare_ubo(2, "ivec4", "out_sizes")} +${layout_declare_ubo(3, "int", "stride", "float", "offset")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); + + if (pos_out_of_bounds(pos, out_sizes, packed_dim)) { + return; + } + int width = in_sizes.x; + VEC4_T outtex; + if (pos.x == 0) { + float value = (pos.y % width + offset) * stride; + outtex = VEC4_T(value, 0, 0, 0); + } else if (pos.x == 1) { + float value = (pos.y / width + offset) * stride; + outtex = VEC4_T(value, 0, 0, 0); + } + + imageStore(t_out, pos, outtex); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml new file mode 100644 index 0000000000..654edca610 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml @@ -0,0 +1,12 @@ +grid_priors: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: C_packed + STORAGE: texture3d + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: grid_priors diff --git a/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp b/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp new file mode 100644 index 0000000000..b0658e37c2 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include + +namespace vkcompute { + +struct GridPriorsParam final { + int32_t stride; + float offset; +}; + +void resize_grid_priors_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr in = graph->get_tensor(extra_args[0]); + std::vector in_sizes = in->sizes(); + int64_t height = in_sizes.at(in_sizes.size() - 2); + int64_t width = in_sizes.at(in_sizes.size() - 1); + std::vector sizes = {height * width, 2}; + out->virtual_resize(sizes); +} + +void add_grid_priors_node( + ComputeGraph& graph, + const ValueRef& in, + const ValueRef& stride_ref, + const ValueRef& offset_ref, + const ValueRef& out) { + vTensorPtr t_out = graph.get_tensor(out); + vTensorPtr t_in = graph.get_tensor(in); + int32_t stride = graph.extract_scalar(stride_ref); + float offset = graph.extract_scalar(offset_ref); + + std::string kernel_name = "grid_priors"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + GridPriorsParam param = {stride, offset}; + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + graph.create_global_wg_size(out), + graph.create_local_wg_size(out), + // Inputs and Outputs + { + {out, vkapi::MemoryAccessType::WRITE}, + }, + // Shader params buffers + { + t_in->sizes_ubo(), + t_out->sizes_ubo(), + graph.create_params_buffer(param), + }, + // Specialization Constants + {}, + resize_grid_priors_node, + {in})); +} + +void grid_priors(ComputeGraph& graph, const std::vector& args) { + return add_grid_priors_node(graph, args[0], args[1], args[2], args[3]); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(grid_priors.default, grid_priors); +} +} // namespace vkcompute diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 9260475ab6..9d87de8bff 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -2203,3 +2203,75 @@ TEST(VulkanComputeGraphOpsTest, conv2d_prepack_test) { 0, 3, 9, 0, 0, 6, 12, 0, 0, 5, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); } + +void test_grid_priors( + std::vector input_sizes, + std::vector output_sizes, + int stride, + double offset, + const std::vector& data_out_expected) { + GraphConfig config; + ComputeGraph graph(config); + + // Build graph + IOValueRef in = graph.add_input_tensor( + input_sizes, + vkapi::kFloat, + utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); + IOValueRef out; + out.value = graph.add_tensor( + output_sizes, + vkapi::kFloat, + utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); + + VK_GET_OP_FN("grid_priors.default") + (graph, + {in.value, + graph.add_scalar(stride), + graph.add_scalar(offset), + out.value}); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_prepack(); + graph.prepack(); + graph.encode_execute(); + + vTensorPtr t_in = graph.get_tensor(in.value); + vTensorPtr t_out = graph.get_tensor(out.value); + // Resize input + graph.propagate_resize(); + + // run graph + graph.execute(); + + std::vector output_data(t_out->gpu_numel()); + graph.copy_from_staging(out.staging, output_data.data(), output_data.size()); + + // check results + int h_out = utils::val_at(-2, t_out->sizes()); + int w_out = utils::val_at(-1, t_out->sizes()); + for (size_t i = 0; i < h_out; ++i) { + for (size_t j = 0; j < w_out; ++j) { + size_t idx_out = i * w_out + j; + CHECK_VALUE(output_data, idx_out, 
data_out_expected[idx_out]);
+    }
+  }
+}
+
+TEST(VulkanComputeGraphOpsTest, grid_priors_test) {
+  test_grid_priors(
+      /*input size = */ {1, 5, 2, 3},
+      /*output size = */ {6, 2},
+      /*stride = */ 1,
+      /*offset = */ 0.0,
+      /*data_out_expected = */ {0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 2, 1});
+
+  test_grid_priors(
+      /*input size = */ {1, 5, 2, 3},
+      /*output size = */ {6, 2},
+      /*stride = */ 8,
+      /*offset = */ 0.5,
+      /*data_out_expected = */ {4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12});
+}

From a567abfd0853c0c59302173cb11c727d5fae3416 Mon Sep 17 00:00:00 2001
From: Chirag Modi
Date: Tue, 30 Jul 2024 15:38:18 -0700
Subject: [PATCH 33/75] Porting over ET MultiModal Demo App (#4455)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4455

Adding an ET demo app for multimodal support. This is the first diff; it supports Llama3. It includes major changes to the existing Llama Demo app, adding the following features:
1. Llama3 support
2. Settings activity
3. UI/UX improvements to the MainActivity
4. Ability to add multiple images, in preparation for multimodal support
5. Metrics

Note: You'll need to build the `executorch-llama.aar` and have it placed in the `app/libs` folder.

Reviewed By: kirklandsign

Differential Revision: D60416605

fbshipit-source-id: 262329c30e1ec28c3905da5c040dc661307f8666
---
 .../android/LlamaDemo/app/build.gradle.kts    |   5 +-
 .../app/src/main/AndroidManifest.xml          |  32 +-
 .../example/executorchllamademo/AppLog.java   |  49 ++
 .../DemoSharedPreferences.java                |  90 +++
 .../example/executorchllamademo/ETImage.java  | 116 ++++
 .../executorchllamademo/ETLogging.java        |  54 ++
 .../executorchllamademo/LogsActivity.java     |  86 +++
 .../executorchllamademo/LogsAdapter.java      |  45 ++
 .../executorchllamademo/MainActivity.java     | 631 +++++++++++++++---
 .../example/executorchllamademo/Message.java  |  60 +-
 .../executorchllamademo/MessageAdapter.java   |  67 +-
 .../executorchllamademo/MessageType.java      |  15 +
 .../executorchllamademo/SettingsActivity.java | 325 +++++++++
 .../executorchllamademo/SettingsFields.java   | 135 ++++
 .../src/main/res/drawable/banner_shape.xml    |   7 +
 .../src/main/res/drawable/baseline_add_24.xml |   5 +
 .../baseline_add_photo_alternate_24.xml       |   5 +
 .../main/res/drawable/baseline_article_24.xml |   5 +
 .../main/res/drawable/baseline_close_24.xml   |   5 +
 .../drawable/baseline_delete_forever_24.xml   |   5 +
 .../res/drawable/baseline_restart_alt_24.xml  |   6 +
 .../main/res/drawable/baseline_send_24.xml    |   5 +
 .../res/drawable/baseline_settings_24.xml     |  10 +
 .../main/res/drawable/baseline_stop_24.xml    |   5 +
 .../app/src/main/res/drawable/btn.xml         |   8 +
 .../main/res/drawable/custom_button_round.xml |   7 +
 .../main/res/drawable/input_text_shape.xml    |  10 +
 .../app/src/main/res/drawable/logo.png        | Bin 0 -> 33036 bytes
 .../main/res/drawable/outline_add_box_48.xml  |   5 +
 .../outline_arrow_drop_down_circle_24.xml     |   5 +
 .../res/drawable/outline_camera_alt_48.xml    |   5 +
 .../main/res/drawable/outline_image_48.xml    |   5 +
 .../src/main/res/drawable/prompt_shape.xml    |   6 +
 .../app/src/main/res/layout/activity_logs.xml |  55 ++
 .../app/src/main/res/layout/activity_main.xml | 241 ++++++-
 .../src/main/res/layout/activity_settings.xml | 233 +++++++
 .../app/src/main/res/layout/logs_message.xml  |  16 +
 .../src/main/res/layout/received_message.xml  |  40 +-
 .../app/src/main/res/layout/sent_message.xml  |  58 +-
 .../src/main/res/layout/system_message.xml    |  23 +
 .../app/src/main/res/values/colors.xml        |   4 +-
 .../app/src/main/res/values/strings.xml       |   4 +
 .../app/src/main/res/values/styles.xml        |   4 +
 43 files changed, 2328 insertions(+),
169 deletions(-) create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_settings.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/logs_message.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/system_message.xml diff --git 
a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts index 3c168689f7..37c8cbf0ba 100644 --- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts +++ b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts @@ -17,7 +17,7 @@ android { defaultConfig { applicationId = "com.example.executorchllamademo" - minSdk = 24 + minSdk = 28 targetSdk = 33 versionCode = 1 versionName = "1.0" @@ -56,7 +56,10 @@ dependencies { implementation("androidx.camera:camera-core:1.3.0-rc02") implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") implementation("com.facebook.fbjni:fbjni:0.5.1") + implementation("com.google.code.gson:gson:2.8.6") implementation(files("libs/executorch-llama.aar")) + implementation("com.google.android.material:material:1.12.0") + implementation("androidx.activity:activity:1.9.0") testImplementation("junit:junit:4.13.2") androidTestImplementation("androidx.test.ext:junit:1.1.5") androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml index 3eaf301b5a..bb231420df 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml @@ -3,32 +3,44 @@ xmlns:tools="http://schemas.android.com/tools" package="com.example.executorchllamademo"> - + + + + + + + - + + android:theme="@style/Theme.AppCompat.Light.NoActionBar"> diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java new file mode 100644 index 0000000000..36d0741938 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java @@ -0,0 +1,49 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package com.example.executorchllamademo; + +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; + +public class AppLog { + private final Long timestamp; + private final String message; + + public AppLog(String message) { + this.timestamp = getCurrentTimeStamp(); + this.message = message; + } + + public Long getTimestamp() { + return timestamp; + } + + public String getMessage() { + return message; + } + + public String getFormattedLog() { + return "[" + getFormattedTimeStamp() + "] " + message; + } + + private Long getCurrentTimeStamp() { + return System.currentTimeMillis(); + } + + private String getFormattedTimeStamp() { + return formatDate(timestamp); + } + + private String formatDate(long milliseconds) { + SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.getDefault()); + Date date = new Date(milliseconds); + return formatter.format(date); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java new file mode 100644 index 0000000000..99a94c00eb --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.content.Context; +import android.content.SharedPreferences; +import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; +import java.lang.reflect.Type; +import java.util.ArrayList; + +public class DemoSharedPreferences { + Context context; + SharedPreferences sharedPreferences; + + public DemoSharedPreferences(Context context) { + this.context = context; + this.sharedPreferences = getSharedPrefs(); + } + + private SharedPreferences getSharedPrefs() { + return context.getSharedPreferences( + context.getString(R.string.demo_pref_file_key), Context.MODE_PRIVATE); + } + + public String getSavedMessages() { + return sharedPreferences.getString(context.getString(R.string.saved_messages_json_key), ""); + } + + public void addMessages(MessageAdapter messageAdapter) { + SharedPreferences.Editor editor = sharedPreferences.edit(); + Gson gson = new Gson(); + String msgJSON = gson.toJson(messageAdapter.getSavedMessages()); + editor.putString(context.getString(R.string.saved_messages_json_key), msgJSON); + editor.apply(); + } + + public void removeExistingMessages() { + SharedPreferences.Editor editor = sharedPreferences.edit(); + editor.remove(context.getString(R.string.saved_messages_json_key)); + editor.apply(); + } + + public void addSettings(SettingsFields settingsFields) { + SharedPreferences.Editor editor = sharedPreferences.edit(); + Gson gson = new Gson(); + String settingsJSON = gson.toJson(settingsFields); + editor.putString(context.getString(R.string.settings_json_key), settingsJSON); + editor.apply(); + } + + public String getSettings() { + return sharedPreferences.getString(context.getString(R.string.settings_json_key), ""); + } + + public void saveLogs() { + SharedPreferences.Editor editor = sharedPreferences.edit(); + Gson gson = new Gson(); + String msgJSON = gson.toJson(ETLogging.getInstance().getLogs()); + 
editor.putString(context.getString(R.string.logs_json_key), msgJSON); + editor.apply(); + } + + public void removeExistingLogs() { + SharedPreferences.Editor editor = sharedPreferences.edit(); + editor.remove(context.getString(R.string.logs_json_key)); + editor.apply(); + } + + public ArrayList getSavedLogs() { + String logsJSONString = + sharedPreferences.getString(context.getString(R.string.logs_json_key), null); + if (logsJSONString == null || logsJSONString.isEmpty()) { + return new ArrayList<>(); + } + Gson gson = new Gson(); + Type type = new TypeToken>() {}.getType(); + ArrayList appLogs = gson.fromJson(logsJSONString, type); + if (appLogs == null) { + return new ArrayList<>(); + } + return appLogs; + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java new file mode 100644 index 0000000000..cf3c3e5f0a --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java @@ -0,0 +1,116 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.content.ContentResolver; +import android.graphics.Bitmap; +import android.graphics.BitmapFactory; +import android.graphics.Color; +import android.net.Uri; +import androidx.annotation.Nullable; +import java.io.FileNotFoundException; +import java.io.InputStream; + +public class ETImage { + private int width; + private int height; + private final byte[] bytes; + private final Uri uri; + private final ContentResolver contentResolver; + + ETImage(ContentResolver contentResolver, Uri uri) { + this.contentResolver = contentResolver; + this.uri = uri; + bytes = getBytesFromImageURI(uri); + } + + public int getWidth() { + return width; + } + + public int getHeight() { + return height; + } + + public Uri getUri() { + return uri; + } + + public byte[] getBytes() { + return bytes; + } + + private byte[] getBytesFromImageURI(Uri uri) { + try { + int RESIZED_IMAGE_WIDTH = 336; + Bitmap bitmap = resizeImage(uri, RESIZED_IMAGE_WIDTH); + + if (bitmap == null) { + ETLogging.getInstance().log("Unable to get bytes from Image URI. 
Bitmap is null"); + return new byte[0]; + } + + width = bitmap.getWidth(); + height = bitmap.getHeight(); + + byte[] rgbValues = new byte[width * height * 3]; + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + // Get the color of the current pixel + int color = bitmap.getPixel(x, y); + + // Extract the RGB values from the color + int red = Color.red(color); + int green = Color.green(color); + int blue = Color.blue(color); + + // Store the RGB values in the byte array + rgbValues[(y * width + x) * 3] = (byte) red; + rgbValues[(y * width + x) * 3 + 1] = (byte) green; + rgbValues[(y * width + x) * 3 + 2] = (byte) blue; + } + } + return rgbValues; + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } + } + + @Nullable + private Bitmap resizeImage(Uri uri, int maxLength) throws FileNotFoundException { + InputStream inputStream = contentResolver.openInputStream(uri); + if (inputStream == null) { + ETLogging.getInstance().log("Unable to resize image, input streams is null"); + return null; + } + Bitmap bitmap = BitmapFactory.decodeStream(inputStream); + if (bitmap == null) { + ETLogging.getInstance().log("Unable to resize image, bitmap during decode stream is null"); + return null; + } + + float aspectRatio; + int finalWidth, finalHeight; + + if (bitmap.getWidth() > bitmap.getHeight()) { + // width > height --> width = maxLength, height scale with aspect ratio + aspectRatio = bitmap.getWidth() / (float) bitmap.getHeight(); + finalWidth = maxLength; + finalHeight = Math.round(maxLength / aspectRatio); + } else { + // height >= width --> height = maxLength, width scale with aspect ratio + aspectRatio = bitmap.getHeight() / (float) bitmap.getWidth(); + finalHeight = maxLength; + finalWidth = Math.round(maxLength / aspectRatio); + } + + return Bitmap.createScaledBitmap(bitmap, finalWidth, finalHeight, false); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java new file mode 100644 index 0000000000..e595348945 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java @@ -0,0 +1,54 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package com.example.executorchllamademo; + +import android.app.Application; +import android.util.Log; +import java.util.ArrayList; + +public class ETLogging extends Application { + private static ETLogging singleton; + + private ArrayList logs; + private DemoSharedPreferences mDemoSharedPreferences; + + @Override + public void onCreate() { + super.onCreate(); + singleton = this; + mDemoSharedPreferences = new DemoSharedPreferences(this.getApplicationContext()); + logs = mDemoSharedPreferences.getSavedLogs(); + if (logs == null) { // We don't have existing sharedPreference stored + logs = new ArrayList<>(); + } + } + + public static ETLogging getInstance() { + return singleton; + } + + public void log(String message) { + AppLog appLog = new AppLog(message); + logs.add(appLog); + Log.d("ETLogging", appLog.getMessage()); + } + + public ArrayList getLogs() { + return logs; + } + + public void clearLogs() { + logs.clear(); + mDemoSharedPreferences.removeExistingLogs(); + } + + public void saveLogs() { + mDemoSharedPreferences.saveLogs(); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java new file mode 100644 index 0000000000..8700528d44 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.app.AlertDialog; +import android.content.DialogInterface; +import android.os.Bundle; +import android.widget.ImageButton; +import android.widget.ListView; +import androidx.appcompat.app.AppCompatActivity; +import androidx.core.graphics.Insets; +import androidx.core.view.ViewCompat; +import androidx.core.view.WindowInsetsCompat; + +public class LogsActivity extends AppCompatActivity { + + private LogsAdapter mLogsAdapter; + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_logs); + ViewCompat.setOnApplyWindowInsetsListener( + requireViewById(R.id.main), + (v, insets) -> { + Insets systemBars = insets.getInsets(WindowInsetsCompat.Type.systemBars()); + v.setPadding(systemBars.left, systemBars.top, systemBars.right, systemBars.bottom); + return insets; + }); + + setupLogs(); + setupClearLogsButton(); + } + + @Override + public void onResume() { + super.onResume(); + mLogsAdapter.clear(); + mLogsAdapter.addAll(ETLogging.getInstance().getLogs()); + mLogsAdapter.notifyDataSetChanged(); + } + + private void setupLogs() { + ListView mLogsListView = requireViewById(R.id.logsListView); + mLogsAdapter = new LogsAdapter(this, R.layout.logs_message); + + mLogsListView.setAdapter(mLogsAdapter); + mLogsAdapter.addAll(ETLogging.getInstance().getLogs()); + mLogsAdapter.notifyDataSetChanged(); + } + + private void setupClearLogsButton() { + ImageButton clearLogsButton = requireViewById(R.id.clearLogsButton); + clearLogsButton.setOnClickListener( + view -> { + new AlertDialog.Builder(this) + .setTitle("Delete Logs History") + .setMessage("Do you really want to delete logs history?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new 
DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + // Clear the messageAdapter and sharedPreference + ETLogging.getInstance().clearLogs(); + mLogsAdapter.clear(); + mLogsAdapter.notifyDataSetChanged(); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + @Override + protected void onDestroy() { + super.onDestroy(); + ETLogging.getInstance().saveLogs(); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java new file mode 100644 index 0000000000..76c6a1aa1b --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.view.LayoutInflater; +import android.view.View; +import android.view.ViewGroup; +import android.widget.ArrayAdapter; +import android.widget.TextView; +import androidx.annotation.NonNull; +import java.util.Objects; + +public class LogsAdapter extends ArrayAdapter { + public LogsAdapter(android.content.Context context, int resource) { + super(context, resource); + } + + static class ViewHolder { + private TextView logTextView; + } + + @NonNull + @Override + public View getView(int position, View convertView, @NonNull ViewGroup parent) { + ViewHolder mViewHolder = null; + + String logMessage = Objects.requireNonNull(getItem(position)).getFormattedLog(); + + if (convertView == null || convertView.getTag() == null) { + mViewHolder = new ViewHolder(); + convertView = LayoutInflater.from(getContext()).inflate(R.layout.logs_message, parent, false); + mViewHolder.logTextView = convertView.requireViewById(R.id.logsTextView); + } else { + mViewHolder = (ViewHolder) convertView.getTag(); + } + mViewHolder.logTextView.setText(logMessage); + return convertView; + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index 2c94c242ed..44d310231a 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -8,32 +8,72 @@ package com.example.executorchllamademo; -import android.app.Activity; +import android.Manifest; import android.app.ActivityManager; import android.app.AlertDialog; -import android.content.Context; +import android.content.ContentResolver; +import android.content.ContentValues; +import android.content.Intent; +import android.content.pm.PackageManager; +import android.net.Uri; import android.os.Bundle; +import android.os.Handler; +import android.os.Looper; +import android.provider.MediaStore; import android.system.ErrnoException; import android.system.Os; -import android.widget.Button; +import android.text.InputType; +import android.util.Log; +import android.view.View; import android.widget.EditText; import android.widget.ImageButton; +import android.widget.ImageView; +import android.widget.LinearLayout; import 
android.widget.ListView; -import java.io.File; +import android.widget.TextView; +import android.widget.Toast; +import androidx.activity.result.ActivityResultLauncher; +import androidx.activity.result.PickVisualMediaRequest; +import androidx.activity.result.contract.ActivityResultContracts; +import androidx.annotation.NonNull; +import androidx.appcompat.app.AppCompatActivity; +import androidx.constraintlayout.widget.ConstraintLayout; +import androidx.core.app.ActivityCompat; +import androidx.core.content.ContextCompat; +import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; +import java.lang.reflect.Type; +import java.util.ArrayList; +import java.util.List; import org.pytorch.executorch.LlamaCallback; import org.pytorch.executorch.LlamaModule; -public class MainActivity extends Activity implements Runnable, LlamaCallback { +public class MainActivity extends AppCompatActivity implements Runnable, LlamaCallback { private EditText mEditTextMessage; - private Button mSendButton; - private ImageButton mModelButton; + private ImageButton mSendButton; + private ImageButton mGalleryButton; + private ImageButton mCameraButton; private ListView mMessagesView; private MessageAdapter mMessageAdapter; private LlamaModule mModule = null; private Message mResultMessage = null; - - private String mModelFilePath = ""; - private String mTokenizerFilePath = ""; + private ImageButton mSettingsButton; + private TextView mMemoryView; + private ActivityResultLauncher mPickGallery; + private ActivityResultLauncher mCameraRoll; + private List mSelectedImageUri; + private ConstraintLayout mMediaPreviewConstraintLayout; + private LinearLayout mAddMediaLayout; + private static final int MAX_NUM_OF_IMAGES = 5; + private static final int REQUEST_IMAGE_CAPTURE = 1; + private Uri cameraImageUri; + private DemoSharedPreferences mDemoSharedPreferences; + private SettingsFields mCurrentSettingsFields; + private Handler mMemoryUpdateHandler; + private Runnable memoryUpdater; + // UI Specific to user using INSTRUCT_MODE + private boolean INSTRUCT_MODE = false; + private String INSTRUCT_INSTRUCTION = "In Instruct Mode. 
Press SEND"; @Override public void onResult(String result) { @@ -52,23 +92,13 @@ public void onStats(float tps) { }); } - private static String[] listLocalFile(String path, String suffix) { - File directory = new File(path); - if (directory.exists() && directory.isDirectory()) { - File[] files = directory.listFiles((dir, name) -> name.toLowerCase().endsWith(suffix)); - String[] result = new String[files.length]; - for (int i = 0; i < files.length; i++) { - if (files[i].isFile() && files[i].getName().endsWith(suffix)) { - result[i] = files[i].getAbsolutePath(); - } - } - return result; + private void setLocalModel(String modelPath, String tokenizerPath, float temperature) { + if (mModule != null) { + mModule.resetNative(); + mModule = null; } - return new String[0]; - } - - private void setLocalModel(String modelPath, String tokenizerPath) { - Message modelLoadingMessage = new Message("Loading model...", false); + Message modelLoadingMessage = new Message("Loading model...", false, MessageType.SYSTEM, 0); + ETLogging.getInstance().log("Loading model " + modelPath + " with tokenizer " + tokenizerPath); runOnUiThread( () -> { mSendButton.setEnabled(false); @@ -76,9 +106,15 @@ private void setLocalModel(String modelPath, String tokenizerPath) { mMessageAdapter.notifyDataSetChanged(); }); long runStartTime = System.currentTimeMillis(); - mModule = new LlamaModule(modelPath, tokenizerPath, 0.8f); + mModule = new LlamaModule(modelPath, tokenizerPath, temperature); int loadResult = mModule.load(); + long loadDuration = System.currentTimeMillis() - runStartTime; + String modelLoadError = ""; + String modelInfo = ""; if (loadResult != 0) { + // TODO: Map the error code to a reason to let the user know why model loading failed + modelInfo = "*Model could not load (Error Code: " + loadResult + ")*" + "\n"; + loadDuration = 0; AlertDialog.Builder builder = new AlertDialog.Builder(this); builder.setTitle("Load failed: " + loadResult); runOnUiThread( @@ -86,18 +122,37 @@ private void setLocalModel(String modelPath, String tokenizerPath) { AlertDialog alert = builder.create(); alert.show(); }); + } else { + String[] segments = modelPath.split("/"); + String pteName = segments[segments.length - 1]; + segments = tokenizerPath.split("/"); + String tokenizerName = segments[segments.length - 1]; + modelInfo = + "Successfully loaded model. " + + pteName + + " and tokenizer " + + tokenizerName + + " in " + + (float) loadDuration / 1000 + + " sec." + + " You can send text or image for inference"; } - long loadDuration = System.currentTimeMillis() - runStartTime; - String modelInfo = - "Model path: " + Message modelLoadedMessage = new Message(modelInfo, false, MessageType.SYSTEM, 0); + + String modelLoggingInfo = + modelLoadError + + "Model path: " + modelPath + "\nTokenizer path: " + tokenizerPath + + "\nTemperature: " + + temperature + "\nModel loaded time: " + loadDuration + " ms"; - Message modelLoadedMessage = new Message(modelInfo, false); + ETLogging.getInstance().log("Load complete. " + modelLoggingInfo); + runOnUiThread( () -> { mSendButton.setEnabled(true); @@ -107,55 +162,26 @@ private void setLocalModel(String modelPath, String tokenizerPath) { }); } - private String memoryInfo() { - final ActivityManager am = (ActivityManager) getSystemService(Context.ACTIVITY_SERVICE); - ActivityManager.MemoryInfo memInfo = new ActivityManager.MemoryInfo(); - am.getMemoryInfo(memInfo); - return "Total RAM: " - + Math.floorDiv(memInfo.totalMem, 1000000) - + " MB. 
Available RAM: " - + Math.floorDiv(memInfo.availMem, 1000000) - + " MB."; - } - - private void modelDialog() { - String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte"); - String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin"); - String[] modelFiles = listLocalFile("/data/local/tmp/llama/", ".model"); - String[] tokenizerFiles = new String[binFiles.length + modelFiles.length]; - System.arraycopy(binFiles, 0, tokenizerFiles, 0, binFiles.length); - System.arraycopy(modelFiles, 0, tokenizerFiles, binFiles.length, modelFiles.length); - AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); - modelPathBuilder.setTitle("Select model path"); - AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); - tokenizerPathBuilder.setTitle("Select tokenizer path"); - modelPathBuilder.setSingleChoiceItems( - pteFiles, - -1, - (dialog, item) -> { - mModelFilePath = pteFiles[item]; - mEditTextMessage.setText(""); - dialog.dismiss(); - tokenizerPathBuilder.create().show(); - }); - - tokenizerPathBuilder.setSingleChoiceItems( - tokenizerFiles, - -1, - (dialog, item) -> { - mTokenizerFilePath = tokenizerFiles[item]; - Runnable runnable = - new Runnable() { - @Override - public void run() { - setLocalModel(mModelFilePath, mTokenizerFilePath); - } - }; - new Thread(runnable).start(); - dialog.dismiss(); - }); + private void loadLocalModelAndParameters( + String modelFilePath, String tokenizerFilePath, float temperature) { + Runnable runnable = + new Runnable() { + @Override + public void run() { + setLocalModel(modelFilePath, tokenizerFilePath, temperature); + } + }; + new Thread(runnable).start(); + } - modelPathBuilder.create().show(); + private void populateExistingMessages(String existingMsgJSON) { + Gson gson = new Gson(); + Type type = new TypeToken>() {}.getType(); + ArrayList savedMessages = gson.fromJson(existingMsgJSON, type); + for (Message msg : savedMessages) { + mMessageAdapter.add(msg); + } + mMessageAdapter.notifyDataSetChanged(); } @Override @@ -169,27 +195,379 @@ protected void onCreate(Bundle savedInstanceState) { finish(); } - mEditTextMessage = findViewById(R.id.editTextMessage); - mSendButton = findViewById(R.id.sendButton); + mEditTextMessage = requireViewById(R.id.editTextMessage); + mSendButton = requireViewById(R.id.sendButton); mSendButton.setEnabled(false); - mModelButton = findViewById(R.id.modelButton); - mMessagesView = findViewById(R.id.messages_view); - mMessageAdapter = new MessageAdapter(this, R.layout.sent_message); + mMessagesView = requireViewById(R.id.messages_view); + mMessageAdapter = new MessageAdapter(this, R.layout.sent_message, new ArrayList()); mMessagesView.setAdapter(mMessageAdapter); - mModelButton.setOnClickListener( + mDemoSharedPreferences = new DemoSharedPreferences(this.getApplicationContext()); + String existingMsgJSON = mDemoSharedPreferences.getSavedMessages(); + if (!existingMsgJSON.isEmpty()) { + populateExistingMessages(existingMsgJSON); + } + mSettingsButton = requireViewById(R.id.settings); + mSettingsButton.setOnClickListener( view -> { - mModule.stop(); - mMessageAdapter.clear(); - mMessageAdapter.notifyDataSetChanged(); - modelDialog(); + Intent myIntent = new Intent(MainActivity.this, SettingsActivity.class); + MainActivity.this.startActivity(myIntent); }); + mCurrentSettingsFields = new SettingsFields(); + mMemoryUpdateHandler = new Handler(Looper.getMainLooper()); onModelRunStopped(); - modelDialog(); + setupMediaButton(); + setupGalleryPicker(); + setupCameraRoll(); + startMemoryUpdate(); 
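+    // Wires the logs icon to open LogsActivity for viewing in-app logs (see setupShowLogsButton below).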
+ setupShowLogsButton(); + } + + @Override + protected void onPause() { + super.onPause(); + mDemoSharedPreferences.addMessages(mMessageAdapter); + } + + @Override + protected void onResume() { + super.onResume(); + // Check for if settings parameters have changed + Gson gson = new Gson(); + String settingsFieldsJSON = mDemoSharedPreferences.getSettings(); + if (!settingsFieldsJSON.isEmpty()) { + SettingsFields updatedSettingsFields = + gson.fromJson(settingsFieldsJSON, SettingsFields.class); + if (updatedSettingsFields == null) { + // Added this check, because gson.fromJson can return null + askUserToSelectModel(); + return; + } + boolean isUpdated = !mCurrentSettingsFields.equals(updatedSettingsFields); + boolean isLoadModel = updatedSettingsFields.getIsLoadModel(); + if (isUpdated) { + if (isLoadModel) { + // If users change the model file, but not pressing loadModelButton, we won't load the new + // model + checkForUpdateAndReloadModel(updatedSettingsFields); + } else { + askUserToSelectModel(); + } + checkForPromptChange(updatedSettingsFields); + checkForClearChatHistory(updatedSettingsFields); + // Update current to point to the latest + mCurrentSettingsFields = new SettingsFields(updatedSettingsFields); + } + } else { + askUserToSelectModel(); + } + } + + private void checkForClearChatHistory(SettingsFields updatedSettingsFields) { + if (updatedSettingsFields.getIsClearChatHistory()) { + mMessageAdapter.clear(); + mMessageAdapter.notifyDataSetChanged(); + mDemoSharedPreferences.removeExistingMessages(); + // changing to false since chat history has been cleared. + updatedSettingsFields.saveIsClearChatHistory(false); + mDemoSharedPreferences.addSettings(updatedSettingsFields); + } + } + + private void checkForUpdateAndReloadModel(SettingsFields updatedSettingsFields) { + // TODO need to add 'load model' in settings and queue loading based on that + String modelPath = updatedSettingsFields.getModelFilePath(); + String tokenizerPath = updatedSettingsFields.getTokenizerFilePath(); + double temperature = updatedSettingsFields.getTemperature(); + if (!modelPath.isEmpty() && !tokenizerPath.isEmpty()) { + if (updatedSettingsFields.getIsLoadModel() + || !modelPath.equals(mCurrentSettingsFields.getModelFilePath()) + || !tokenizerPath.equals(mCurrentSettingsFields.getTokenizerFilePath()) + || temperature != mCurrentSettingsFields.getTemperature()) { + loadLocalModelAndParameters( + updatedSettingsFields.getModelFilePath(), + updatedSettingsFields.getTokenizerFilePath(), + (float) updatedSettingsFields.getTemperature()); + updatedSettingsFields.saveLoadModelAction(false); + mDemoSharedPreferences.addSettings(updatedSettingsFields); + } + } else { + askUserToSelectModel(); + } + } + + private void checkForPromptChange(SettingsFields updatedSettingsFields) { + if (updatedSettingsFields.isSystemPromptChanged() + || updatedSettingsFields.isUserPromptChanged()) { + enableInstructMode(); + } else { + disableInstructMode(); + } + } + + private void enableInstructMode() { + INSTRUCT_MODE = true; + mEditTextMessage.setText(INSTRUCT_INSTRUCTION); + mEditTextMessage.setInputType(InputType.TYPE_NULL); + mEditTextMessage.clearFocus(); + } + + private void disableInstructMode() { + INSTRUCT_MODE = false; + mEditTextMessage.setText(""); + mEditTextMessage.setInputType(InputType.TYPE_CLASS_TEXT); + mEditTextMessage.clearFocus(); + } + + private void askUserToSelectModel() { + String askLoadModel = + "To get started, select your desired model and tokenizer " + "from the top right corner"; + Message 
askLoadModelMessage = new Message(askLoadModel, false, MessageType.SYSTEM, 0); + ETLogging.getInstance().log(askLoadModel); + runOnUiThread( + () -> { + mMessageAdapter.add(askLoadModelMessage); + mMessageAdapter.notifyDataSetChanged(); + }); + } + + private void setupShowLogsButton() { + ImageButton showLogsButton = requireViewById(R.id.showLogsButton); + showLogsButton.setOnClickListener( + view -> { + Intent myIntent = new Intent(MainActivity.this, LogsActivity.class); + MainActivity.this.startActivity(myIntent); + }); + } + + private void setupMediaButton() { + mAddMediaLayout = requireViewById(R.id.addMediaLayout); + mAddMediaLayout.setVisibility(View.GONE); // We hide this initially + + ImageButton addMediaButton = requireViewById(R.id.addMediaButton); + addMediaButton.setOnClickListener( + view -> { + mAddMediaLayout.setVisibility(View.VISIBLE); + }); + + mGalleryButton = requireViewById(R.id.galleryButton); + mGalleryButton.setOnClickListener( + view -> { + // Launch the photo picker and let the user choose only images. + mPickGallery.launch( + new PickVisualMediaRequest.Builder() + .setMediaType(ActivityResultContracts.PickVisualMedia.ImageOnly.INSTANCE) + .build()); + }); + mCameraButton = requireViewById(R.id.cameraButton); + mCameraButton.setOnClickListener( + view -> { + Log.d("CameraRoll", "Check permission"); + if (ContextCompat.checkSelfPermission(MainActivity.this, Manifest.permission.CAMERA) + != PackageManager.PERMISSION_GRANTED) { + ActivityCompat.requestPermissions( + MainActivity.this, + new String[] {Manifest.permission.CAMERA}, + REQUEST_IMAGE_CAPTURE); + } else { + launchCamera(); + } + }); + } + + private void setupCameraRoll() { + // Registers a camera roll activity launcher. + mCameraRoll = + registerForActivityResult( + new ActivityResultContracts.TakePicture(), + result -> { + if (result && cameraImageUri != null) { + Log.d("CameraRoll", "Photo saved to uri: " + cameraImageUri); + mAddMediaLayout.setVisibility(View.GONE); + List uris = new ArrayList<>(); + uris.add(cameraImageUri); + showMediaPreview(uris); + } else { + // Delete the temp image file based on the url since the photo is not successfully + // taken + if (cameraImageUri != null) { + ContentResolver contentResolver = MainActivity.this.getContentResolver(); + contentResolver.delete(cameraImageUri, null, null); + Log.d("CameraRoll", "No photo taken. 
Delete temp uri"); + } + } + }); + mMediaPreviewConstraintLayout = requireViewById(R.id.mediaPreviewConstraintLayout); + ImageButton mediaPreviewCloseButton = requireViewById(R.id.mediaPreviewCloseButton); + mediaPreviewCloseButton.setOnClickListener( + view -> { + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + mSelectedImageUri = null; + }); + + ImageButton addMoreImageButton = requireViewById(R.id.addMoreImageButton); + addMoreImageButton.setOnClickListener( + view -> { + Log.d("addMore", "clicked"); + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + // Direct user to select type of input + mCameraButton.callOnClick(); + }); + } + + private String updateMemoryUsage() { + ActivityManager.MemoryInfo memoryInfo = new ActivityManager.MemoryInfo(); + ActivityManager activityManager = (ActivityManager) getSystemService(ACTIVITY_SERVICE); + if (activityManager == null) { + return "---"; + } + activityManager.getMemoryInfo(memoryInfo); + long totalMem = memoryInfo.totalMem / (1024 * 1024); + long availableMem = memoryInfo.availMem / (1024 * 1024); + long usedMem = totalMem - availableMem; + return usedMem + "MB"; + } + + private void startMemoryUpdate() { + mMemoryView = requireViewById(R.id.ram_usage_live); + memoryUpdater = + new Runnable() { + @Override + public void run() { + mMemoryView.setText(updateMemoryUsage()); + mMemoryUpdateHandler.postDelayed(this, 1000); + } + }; + mMemoryUpdateHandler.post(memoryUpdater); + } + + @Override + public void onRequestPermissionsResult( + int requestCode, @NonNull String[] permissions, @NonNull int[] grantResults) { + super.onRequestPermissionsResult(requestCode, permissions, grantResults); + if (requestCode == REQUEST_IMAGE_CAPTURE && grantResults.length != 0) { + if (grantResults[0] == PackageManager.PERMISSION_GRANTED) { + launchCamera(); + } else if (grantResults[0] == PackageManager.PERMISSION_DENIED) { + Log.d("CameraRoll", "Permission denied"); + } + } + } + + private void launchCamera() { + ContentValues values = new ContentValues(); + values.put(MediaStore.Images.Media.TITLE, "New Picture"); + values.put(MediaStore.Images.Media.DESCRIPTION, "From Camera"); + values.put(MediaStore.Images.Media.RELATIVE_PATH, "DCIM/Camera/"); + cameraImageUri = + MainActivity.this + .getContentResolver() + .insert(MediaStore.Images.Media.EXTERNAL_CONTENT_URI, values); + mCameraRoll.launch(cameraImageUri); + } + + private void setupGalleryPicker() { + // Registers a photo picker activity launcher in single-select mode. 
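+    // Despite the "single-select" comment above, the launcher registered below uses
+    // PickMultipleVisualMedia and allows picking up to MAX_NUM_OF_IMAGES images; read
+    // permission is persisted on each returned URI before the images are previewed.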
+ mPickGallery = + registerForActivityResult( + new ActivityResultContracts.PickMultipleVisualMedia(MAX_NUM_OF_IMAGES), + uris -> { + if (!uris.isEmpty()) { + Log.d("PhotoPicker", "Selected URIs: " + uris); + mAddMediaLayout.setVisibility(View.GONE); + for (Uri uri : uris) { + MainActivity.this + .getContentResolver() + .takePersistableUriPermission(uri, Intent.FLAG_GRANT_READ_URI_PERMISSION); + } + showMediaPreview(uris); + } else { + Log.d("PhotoPicker", "No media selected"); + } + }); + + mMediaPreviewConstraintLayout = requireViewById(R.id.mediaPreviewConstraintLayout); + ImageButton mediaPreviewCloseButton = requireViewById(R.id.mediaPreviewCloseButton); + mediaPreviewCloseButton.setOnClickListener( + view -> { + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + mSelectedImageUri = null; + }); + + ImageButton addMoreImageButton = requireViewById(R.id.addMoreImageButton); + addMoreImageButton.setOnClickListener( + view -> { + Log.d("addMore", "clicked"); + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + mGalleryButton.callOnClick(); + }); + } + + private List getProcessedImagesForModel(List uris) { + List imageList = new ArrayList<>(); + if (uris != null) { + uris.forEach( + (uri) -> { + imageList.add(new ETImage(this.getContentResolver(), uri)); + }); + } + return imageList; + } + + private void showMediaPreview(List uris) { + if (mSelectedImageUri == null) { + mSelectedImageUri = uris; + } else { + mSelectedImageUri.addAll(uris); + } + + if (mSelectedImageUri.size() > MAX_NUM_OF_IMAGES) { + mSelectedImageUri = mSelectedImageUri.subList(0, MAX_NUM_OF_IMAGES); + Toast.makeText( + this, "Only max " + MAX_NUM_OF_IMAGES + " images are allowed", Toast.LENGTH_SHORT) + .show(); + } + Log.d("mSelectedImageUri", mSelectedImageUri.size() + " " + mSelectedImageUri); + + mMediaPreviewConstraintLayout.setVisibility(View.VISIBLE); + + List imageViews = new ArrayList(); + + // Pre-populate all the image views that are available from the layout (currently max 5) + imageViews.add(requireViewById(R.id.mediaPreviewImageView1)); + imageViews.add(requireViewById(R.id.mediaPreviewImageView2)); + imageViews.add(requireViewById(R.id.mediaPreviewImageView3)); + imageViews.add(requireViewById(R.id.mediaPreviewImageView4)); + imageViews.add(requireViewById(R.id.mediaPreviewImageView5)); + + // Hide all the image views (reset state) + for (int i = 0; i < imageViews.size(); i++) { + imageViews.get(i).setVisibility(View.GONE); + } + + // Only show/render those that have proper Image URIs + for (int i = 0; i < mSelectedImageUri.size(); i++) { + imageViews.get(i).setVisibility(View.VISIBLE); + imageViews.get(i).setImageURI(mSelectedImageUri.get(i)); + } + } + + private void addSelectedImagesToChatThread(List selectedImageUri) { + if (selectedImageUri == null) { + return; + } + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + for (int i = 0; i < selectedImageUri.size(); i++) { + Uri imageURI = selectedImageUri.get(i); + Log.d("image uri ", "test " + imageURI.getPath()); + mMessageAdapter.add(new Message(imageURI.toString(), true, MessageType.IMAGE, 0)); + } + mMessageAdapter.notifyDataSetChanged(); } private void onModelRunStarted() { - mSendButton.setText("Stop"); + mSendButton.setClickable(false); + mSendButton.setImageResource(R.drawable.baseline_stop_24); mSendButton.setOnClickListener( view -> { mModule.stop(); @@ -197,16 +575,49 @@ private void onModelRunStarted() { } private void onModelRunStopped() { - setTitle(memoryInfo()); - mSendButton.setText("Generate"); + 
mSendButton.setClickable(true); + mSendButton.setImageResource(R.drawable.baseline_send_24); mSendButton.setOnClickListener( view -> { - String prompt = mEditTextMessage.getText().toString(); - mMessageAdapter.add(new Message(prompt, true)); + addSelectedImagesToChatThread(mSelectedImageUri); + // TODO: When ET supports multimodal, this is where we will add the images as part of the + // prompt. + List processedImageList = getProcessedImagesForModel(mSelectedImageUri); + processedImageList.forEach( + image -> { + ETLogging.getInstance() + .log( + "Image preprocessed:" + + " uri = " + + image.getUri().getLastPathSegment() + + "," + + " width = " + + image.getWidth() + + "," + + " height = " + + image.getHeight() + + "," + + " bytes size = " + + image.getBytes().length); + }); + String prompt; + if (INSTRUCT_MODE) { + prompt = mCurrentSettingsFields.getEntirePrompt(); + mEditTextMessage.setText(INSTRUCT_INSTRUCTION); + } else { + prompt = mEditTextMessage.getText().toString(); + mEditTextMessage.setText(""); + } + mMessageAdapter.add(new Message(prompt, true, MessageType.TEXT, 0)); mMessageAdapter.notifyDataSetChanged(); mEditTextMessage.setText(""); - mResultMessage = new Message("", false); + mResultMessage = new Message("", false, MessageType.TEXT, 0); mMessageAdapter.add(mResultMessage); + // Scroll to bottom of the list + mMessagesView.smoothScrollToPosition(mMessageAdapter.getCount() - 1); + // After images are added to prompt and chat thread, we clear the imageURI list + // Note: This has to be done after imageURIs are no longer needed by LlamaModule + mSelectedImageUri = null; Runnable runnable = new Runnable() { @Override @@ -218,9 +629,11 @@ public void run() { onModelRunStarted(); } }); - + ETLogging.getInstance().log("Running inference.. prompt=" + prompt); + long generateStartTime = System.currentTimeMillis(); mModule.generate(prompt, MainActivity.this); - + long generateDuration = System.currentTimeMillis() - generateStartTime; + mResultMessage.setTotalGenerationTime(generateDuration); runOnUiThread( new Runnable() { @Override @@ -228,6 +641,7 @@ public void run() { onModelRunStopped(); } }); + ETLogging.getInstance().log("Inference completed"); } }; new Thread(runnable).start(); @@ -242,8 +656,27 @@ public void run() { @Override public void run() { mMessageAdapter.notifyDataSetChanged(); - setTitle(memoryInfo()); } }); } + + @Override + public void onBackPressed() { + super.onBackPressed(); + if (mAddMediaLayout != null && mAddMediaLayout.getVisibility() == View.VISIBLE) { + mAddMediaLayout.setVisibility(View.GONE); + } else { + // Default behavior of back button + finish(); + } + } + + @Override + protected void onDestroy() { + super.onDestroy(); + mMemoryUpdateHandler.removeCallbacks(memoryUpdater); + // This is to cover the case where the app is shutdown when user is on MainActivity but + // never clicked on the logsActivity + ETLogging.getInstance().saveLogs(); + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java index 81b77b1aba..b2e5380e2a 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java @@ -8,14 +8,50 @@ package com.example.executorchllamademo; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; + public 
class Message {
private String text;
- private boolean isSent;
+ private final boolean isSent;
private float tokensPerSecond;
+ private long totalGenerationTime;
+ private final long timestamp;
+ private final MessageType messageType;
+ private String imagePath;
+ private final int promptID;
+
+ private static final String TIMESTAMP_FORMAT = "hh:mm a"; // example: 2:23 PM

- public Message(String text, boolean isSent) {
- this.text = text;
+ public Message(String text, boolean isSent, MessageType messageType, int promptID) {
this.isSent = isSent;
+ this.messageType = messageType;
+ this.promptID = promptID;
+
+ if (messageType == MessageType.IMAGE) {
+ this.imagePath = text;
+ } else {
+ this.text = text;
+ }
+
+ if (messageType != MessageType.SYSTEM) {
+ this.timestamp = System.currentTimeMillis();
+ } else {
+ this.timestamp = (long) 0;
+ }
+ }
+
+ public int getPromptID() {
+ return promptID;
+ }
+
+ public MessageType getMessageType() {
+ return messageType;
+ }
+
+ public String getImagePath() {
+ return imagePath;
}

public String getText() {
@@ -34,7 +70,25 @@ public void setTokensPerSecond(float tokensPerSecond) {
this.tokensPerSecond = tokensPerSecond;
}

+ public void setTotalGenerationTime(long totalGenerationTime) {
+ this.totalGenerationTime = totalGenerationTime;
+ }
+
public float getTokensPerSecond() {
return tokensPerSecond;
}
+
+ public long getTotalGenerationTime() {
+ return totalGenerationTime;
+ }
+
+ public long getTimestamp() {
+ return timestamp;
+ }
+
+ public String getFormattedTimestamp() {
+ SimpleDateFormat formatter = new SimpleDateFormat(TIMESTAMP_FORMAT, Locale.getDefault());
+ Date date = new Date(timestamp);
+ return formatter.format(date);
+ }
}
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java
index 656da1967d..d9cbd95a1a 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java
@@ -8,33 +8,86 @@

package com.example.executorchllamademo;

+import android.net.Uri;
import android.view.LayoutInflater;
import android.view.View;
import android.view.ViewGroup;
import android.widget.ArrayAdapter;
+import android.widget.ImageView;
import android.widget.TextView;
+import java.util.ArrayList;

public class MessageAdapter extends ArrayAdapter<Message> {
- public MessageAdapter(android.content.Context context, int resource) {
+
+ private final ArrayList<Message> savedMessages;
+
+ public MessageAdapter(
+ android.content.Context context, int resource, ArrayList<Message> savedMessages) {
super(context, resource);
+ this.savedMessages = savedMessages;
}

@Override
public View getView(int position, View convertView, ViewGroup parent) {
Message currentMessage = getItem(position);
+ int layoutIdForListItem;

- int layoutIdForListItem =
- currentMessage.getIsSent() ? R.layout.sent_message : R.layout.received_message;
+ if (currentMessage.getMessageType() == MessageType.SYSTEM) {
+ layoutIdForListItem = R.layout.system_message;
+ } else {
+ layoutIdForListItem =
+ currentMessage.getIsSent() ?
R.layout.sent_message : R.layout.received_message;
+ }

View listItemView =
LayoutInflater.from(getContext()).inflate(layoutIdForListItem, parent, false);

- TextView messageTextView = listItemView.findViewById(R.id.message_text);
- messageTextView.setText(currentMessage.getText());
+ if (currentMessage.getMessageType() == MessageType.IMAGE) {
+ ImageView messageImageView = listItemView.requireViewById(R.id.message_image);
+ messageImageView.setImageURI(Uri.parse(currentMessage.getImagePath()));
+ TextView messageTextView = listItemView.requireViewById(R.id.message_text);
+ messageTextView.setVisibility(View.GONE);
+ } else {
+ TextView messageTextView = listItemView.requireViewById(R.id.message_text);
+ messageTextView.setText(currentMessage.getText());
+ }
+ String metrics = "";
+ TextView tokensView;

if (currentMessage.getTokensPerSecond() > 0) {
- TextView tokensView = listItemView.findViewById(R.id.tokens_per_second);
- tokensView.setText("" + currentMessage.getTokensPerSecond() + " t/s");
+ metrics = String.format("%.2f", currentMessage.getTokensPerSecond()) + "t/s ";
+ }
+
+ if (currentMessage.getTotalGenerationTime() > 0) {
+ metrics = metrics + (float) currentMessage.getTotalGenerationTime() / 1000 + "s ";
+ }
+
+ if (currentMessage.getTokensPerSecond() > 0 || currentMessage.getTotalGenerationTime() > 0) {
+ tokensView = listItemView.requireViewById(R.id.generation_metrics);
+ tokensView.setText(metrics);
+ TextView separatorView = listItemView.requireViewById(R.id.bar);
+ separatorView.setVisibility(View.VISIBLE);
+ }
+
+ if (currentMessage.getTimestamp() > 0) {
+ TextView timestampView = listItemView.requireViewById(R.id.timestamp);
+ timestampView.setText(currentMessage.getFormattedTimestamp());
}

return listItemView;
}
+
+ @Override
+ public void add(Message msg) {
+ super.add(msg);
+ savedMessages.add(msg);
+ }
+
+ @Override
+ public void clear() {
+ super.clear();
+ savedMessages.clear();
+ }
+
+ public ArrayList<Message> getSavedMessages() {
+ return savedMessages;
+ }
}
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java
new file mode 100644
index 0000000000..6042acb572
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+package com.example.executorchllamademo;
+
+public enum MessageType {
+ TEXT,
+ IMAGE,
+ SYSTEM
+}
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
new file mode 100644
index 0000000000..1d109e0195
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +package com.example.executorchllamademo; + +import android.app.AlertDialog; +import android.content.DialogInterface; +import android.os.Bundle; +import android.text.Editable; +import android.text.TextWatcher; +import android.widget.Button; +import android.widget.EditText; +import android.widget.ImageButton; +import android.widget.TextView; +import androidx.appcompat.app.AppCompatActivity; +import androidx.core.graphics.Insets; +import androidx.core.view.ViewCompat; +import androidx.core.view.WindowInsetsCompat; +import com.google.gson.Gson; +import java.io.File; + +public class SettingsActivity extends AppCompatActivity { + + private String mModelFilePath = ""; + private String mTokenizerFilePath = ""; + private TextView mModelTextView; + private TextView mTokenizerTextView; + private ImageButton mModelImageButton; + private ImageButton mTokenizerImageButton; + private EditText mSystemPromptEditText; + private EditText mUserPromptEditText; + private Button mLoadModelButton; + private double mSetTemperature; + private String mSystemPrompt; + private String mUserPrompt; + + public SettingsFields mSettingsFields; + + private DemoSharedPreferences mDemoSharedPreferences; + public static double TEMPERATURE_MIN_VALUE = 0.1; + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_settings); + ViewCompat.setOnApplyWindowInsetsListener( + requireViewById(R.id.main), + (v, insets) -> { + Insets systemBars = insets.getInsets(WindowInsetsCompat.Type.systemBars()); + v.setPadding(systemBars.left, systemBars.top, systemBars.right, systemBars.bottom); + return insets; + }); + mDemoSharedPreferences = new DemoSharedPreferences(getBaseContext()); + mSettingsFields = new SettingsFields(); + setupSettings(); + } + + private void setupSettings() { + mModelTextView = requireViewById(R.id.modelTextView); + mTokenizerTextView = requireViewById(R.id.tokenizerTextView); + mModelImageButton = requireViewById(R.id.modelImageButton); + mTokenizerImageButton = requireViewById(R.id.tokenizerImageButton); + mSystemPromptEditText = requireViewById(R.id.systemPromptText); + mUserPromptEditText = requireViewById(R.id.userPromptText); + loadSettings(); + + // TODO: The two setOnClickListeners will be removed after file path issue is resolved + mModelImageButton.setOnClickListener( + view -> { + setupModelSelectorDialog(); + }); + mTokenizerImageButton.setOnClickListener( + view -> { + setupTokenizerSelectorDialog(); + }); + mModelFilePath = mSettingsFields.getModelFilePath(); + if (!mModelFilePath.isEmpty()) { + mModelTextView.setText(getFilenameFromPath(mModelFilePath)); + } + mTokenizerFilePath = mSettingsFields.getTokenizerFilePath(); + if (!mTokenizerFilePath.isEmpty()) { + mTokenizerTextView.setText(getFilenameFromPath(mTokenizerFilePath)); + } + + setupParameterSettings(); + setupPromptSettings(); + setupClearChatHistoryButton(); + setupLoadModelButton(); + } + + private void setupLoadModelButton() { + mLoadModelButton = requireViewById(R.id.loadModelButton); + mLoadModelButton.setEnabled(true); + mLoadModelButton.setOnClickListener( + view -> { + new AlertDialog.Builder(this) + .setTitle("Load Model") + .setMessage("Do you really want to load the new model?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + mSettingsFields.saveLoadModelAction(true); + 
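// Disable the button after a load is requested; it is re-enabled once a new
+ // model or tokenizer path is chosen in the selector dialogs below.
+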
mLoadModelButton.setEnabled(false); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + private void setupClearChatHistoryButton() { + Button clearChatButton = requireViewById(R.id.clearChatButton); + clearChatButton.setOnClickListener( + view -> { + new AlertDialog.Builder(this) + .setTitle("Delete Chat History") + .setMessage("Do you really want to delete chat history?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + mSettingsFields.saveIsClearChatHistory(true); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + private void setupParameterSettings() { + setupTemperatureSettings(); + } + + private void setupTemperatureSettings() { + mSetTemperature = mSettingsFields.getTemperature(); + EditText temperatureEditText = requireViewById(R.id.temperatureEditText); + temperatureEditText.setText(String.valueOf(mSetTemperature)); + temperatureEditText.addTextChangedListener( + new TextWatcher() { + @Override + public void beforeTextChanged(CharSequence s, int start, int count, int after) {} + + @Override + public void onTextChanged(CharSequence s, int start, int before, int count) {} + + @Override + public void afterTextChanged(Editable s) { + mSetTemperature = Double.parseDouble(s.toString()); + // This is needed because temperature is changed together with model loading + // Once temperature is no longer in LlamaModule constructor, we can remove this + mSettingsFields.saveLoadModelAction(true); + saveSettings(); + } + }); + } + + private void setupPromptSettings() { + setupSystemPromptSettings(); + setupUserPromptSettings(); + } + + private void setupSystemPromptSettings() { + mSystemPrompt = mSettingsFields.getSystemPrompt(); + mSystemPromptEditText.setText(mSystemPrompt); + mSystemPromptEditText.addTextChangedListener( + new TextWatcher() { + @Override + public void beforeTextChanged(CharSequence s, int start, int count, int after) {} + + @Override + public void onTextChanged(CharSequence s, int start, int before, int count) {} + + @Override + public void afterTextChanged(Editable s) { + mSystemPrompt = s.toString(); + } + }); + + ImageButton resetSystemPrompt = requireViewById(R.id.resetSystemPrompt); + resetSystemPrompt.setOnClickListener( + view -> { + new AlertDialog.Builder(this) + .setTitle("Reset System Prompt") + .setMessage("Do you really want to reset system prompt?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + // Clear the messageAdapter and sharedPreference + mSystemPromptEditText.setText(mSettingsFields.getSystemPromptTemplate()); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + private void setupUserPromptSettings() { + mUserPrompt = mSettingsFields.getUserPrompt(); + mUserPromptEditText.setText(mUserPrompt); + mUserPromptEditText.addTextChangedListener( + new TextWatcher() { + @Override + public void beforeTextChanged(CharSequence s, int start, int count, int after) {} + + @Override + public void onTextChanged(CharSequence s, int start, int before, int count) {} + + @Override + public void afterTextChanged(Editable s) { + mUserPrompt = s.toString(); + } + }); + + ImageButton resetUserPrompt = requireViewById(R.id.resetUserPrompt); + resetUserPrompt.setOnClickListener( + view -> { 
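+ // Resetting is destructive to any in-progress edits, so ask the user to confirm first.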
+ new AlertDialog.Builder(this) + .setTitle("Reset Prompt Template") + .setMessage("Do you really want to reset the prompt template?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + // Clear the messageAdapter and sharedPreference + mUserPromptEditText.setText(mSettingsFields.getUserPromptTemplate()); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + private void setupModelSelectorDialog() { + String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte"); + AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); + modelPathBuilder.setTitle("Select model path"); + + modelPathBuilder.setSingleChoiceItems( + pteFiles, + -1, + (dialog, item) -> { + mModelFilePath = pteFiles[item]; + mModelTextView.setText(getFilenameFromPath(mModelFilePath)); + mLoadModelButton.setEnabled(true); + dialog.dismiss(); + }); + + modelPathBuilder.create().show(); + } + + private static String[] listLocalFile(String path, String suffix) { + File directory = new File(path); + if (directory.exists() && directory.isDirectory()) { + File[] files = directory.listFiles((dir, name) -> name.toLowerCase().endsWith(suffix)); + String[] result = new String[files.length]; + for (int i = 0; i < files.length; i++) { + if (files[i].isFile() && files[i].getName().endsWith(suffix)) { + result[i] = files[i].getAbsolutePath(); + } + } + return result; + } + return null; + } + + private void setupTokenizerSelectorDialog() { + String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin"); + String[] tokenizerFiles = new String[binFiles.length]; + System.arraycopy(binFiles, 0, tokenizerFiles, 0, binFiles.length); + AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); + tokenizerPathBuilder.setTitle("Select tokenizer path"); + tokenizerPathBuilder.setSingleChoiceItems( + tokenizerFiles, + -1, + (dialog, item) -> { + mTokenizerFilePath = tokenizerFiles[item]; + mTokenizerTextView.setText(getFilenameFromPath(mTokenizerFilePath)); + mLoadModelButton.setEnabled(true); + dialog.dismiss(); + }); + + tokenizerPathBuilder.create().show(); + } + + private String getFilenameFromPath(String uriFilePath) { + String[] segments = uriFilePath.split("/"); + if (segments.length > 0) { + return segments[segments.length - 1]; // get last element (aka filename) + } + return ""; + } + + private void loadSettings() { + Gson gson = new Gson(); + String settingsFieldsJSON = mDemoSharedPreferences.getSettings(); + if (!settingsFieldsJSON.isEmpty()) { + mSettingsFields = gson.fromJson(settingsFieldsJSON, SettingsFields.class); + } + } + + private void saveSettings() { + mSettingsFields.saveModelPath(mModelFilePath); + mSettingsFields.saveTokenizerPath(mTokenizerFilePath); + mSettingsFields.saveParameters(mSetTemperature); + mSettingsFields.savePrompts(mSystemPrompt, mUserPrompt); + mDemoSharedPreferences.addSettings(mSettingsFields); + } + + @Override + public void onBackPressed() { + super.onBackPressed(); + saveSettings(); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java new file mode 100644 index 0000000000..d42a241293 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java @@ 
-0,0 +1,135 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +public class SettingsFields { + private static final String SYSTEM_PLACEHOLDER = "{{ system_prompt }}"; + private static final String USER_PLACEHOLDER = "{{ user_prompt }}"; + private static String SYSTEM_PROMPT_TEMPLATE = + "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n" + + SYSTEM_PLACEHOLDER + + "<|eot_id|>"; + private static String USER_PROMPT_TEMPLATE = + "<|start_header_id|>user<|end_header_id|>\n" + + USER_PLACEHOLDER + + "<|eot_id|>\n" + + "<|start_header_id|>assistant<|end_header_id|>"; + + public String getModelFilePath() { + return modelFilePath; + } + + public String getTokenizerFilePath() { + return tokenizerFilePath; + } + + public double getTemperature() { + return temperature; + } + + public String getSystemPrompt() { + return systemPrompt; + } + + public String getUserPrompt() { + return userPrompt; + } + + public String getEntirePrompt() { + return systemPrompt + userPrompt; + } + + public String getSystemPromptTemplate() { + return SYSTEM_PROMPT_TEMPLATE; + } + + public String getUserPromptTemplate() { + return USER_PROMPT_TEMPLATE; + } + + public boolean getIsClearChatHistory() { + return isClearChatHistory; + } + + public boolean getIsLoadModel() { + return isLoadModel; + } + + private String modelFilePath; + private String tokenizerFilePath; + private double temperature; + private String systemPrompt; + private String userPrompt; + private boolean isClearChatHistory; + private boolean isLoadModel; + + public SettingsFields() { + modelFilePath = ""; + tokenizerFilePath = ""; + temperature = SettingsActivity.TEMPERATURE_MIN_VALUE; + systemPrompt = SYSTEM_PROMPT_TEMPLATE; + userPrompt = USER_PROMPT_TEMPLATE; + isClearChatHistory = false; + isLoadModel = false; + } + + public SettingsFields(SettingsFields settingsFields) { + this.modelFilePath = settingsFields.modelFilePath; + this.tokenizerFilePath = settingsFields.tokenizerFilePath; + this.temperature = settingsFields.temperature; + this.systemPrompt = settingsFields.getSystemPrompt(); + this.userPrompt = settingsFields.getUserPrompt(); + this.isClearChatHistory = settingsFields.getIsClearChatHistory(); + this.isLoadModel = settingsFields.getIsLoadModel(); + } + + public void saveModelPath(String modelFilePath) { + this.modelFilePath = modelFilePath; + } + + public void saveTokenizerPath(String tokenizerFilePath) { + this.tokenizerFilePath = tokenizerFilePath; + } + + public void saveParameters(Double temperature) { + this.temperature = temperature; + } + + public void savePrompts(String systemPrompt, String userPrompt) { + this.systemPrompt = systemPrompt; + this.userPrompt = userPrompt; + } + + public void saveIsClearChatHistory(boolean needToClear) { + this.isClearChatHistory = needToClear; + } + + public void saveLoadModelAction(boolean shouldLoadModel) { + this.isLoadModel = shouldLoadModel; + } + + public boolean equals(SettingsFields anotherSettingsFields) { + if (this == anotherSettingsFields) return true; + return modelFilePath.equals(anotherSettingsFields.modelFilePath) + && tokenizerFilePath.equals(anotherSettingsFields.tokenizerFilePath) + && temperature == anotherSettingsFields.temperature + && systemPrompt.equals(anotherSettingsFields.systemPrompt) + && userPrompt.equals(anotherSettingsFields.userPrompt) + 
&& isClearChatHistory == anotherSettingsFields.isClearChatHistory + && isLoadModel == anotherSettingsFields.isLoadModel; + } + + public boolean isSystemPromptChanged() { + return !systemPrompt.contains(SYSTEM_PLACEHOLDER); + } + + public boolean isUserPromptChanged() { + return !userPrompt.contains(USER_PLACEHOLDER); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml new file mode 100644 index 0000000000..70f251ee64 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml @@ -0,0 +1,7 @@ + + + + + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml new file mode 100644 index 0000000000..9f83b8fbe7 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml new file mode 100644 index 0000000000..d710d27110 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml new file mode 100644 index 0000000000..30d5d26b98 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml new file mode 100644 index 0000000000..f8ca0c64b9 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml new file mode 100644 index 0000000000..2c71fc6e56 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml new file mode 100644 index 0000000000..9285db079a --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml @@ -0,0 +1,6 @@ + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml new file mode 100644 index 0000000000..3abc6cb33b --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml @@ -0,0 +1,5 @@ + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml new file mode 
100644
index 0000000000..42593b298e
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml
@@ -0,0 +1,10 @@
+ + +
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml
new file mode 100644
index 0000000000..817d57b76a
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml
@@ -0,0 +1,5 @@
+ + + + +
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml
new file mode 100644
index 0000000000..ceb3ac56c9
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml
@@ -0,0 +1,8 @@
+ + + + + + +
\ No newline at end of file
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml
new file mode 100644
index 0000000000..87c82d2a38
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml
@@ -0,0 +1,7 @@
+ + + + + +
\ No newline at end of file
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml
new file mode 100644
index 0000000000..15c404c60d
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml
@@ -0,0 +1,10 @@
+ + + + + + +
\ No newline at end of file
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..60e3e5174e9bdec2caf09cd42a9232e1dff65530
GIT binary patch
literal 33036
[base85-encoded binary literal of logo.png (33036 bytes) omitted: not human-readable]
zm3Vc76Rhy546&Xi0Ude!^egg9AC_+neNJ#j*%^(EJ1A0ra#OmW)ZK#Y0lUwW$FUV;~$nLXVHkd3Fv@ zDSTLDeGY5G1F+l2$tj`)yH;s*lP*8Fe+KgQ{ZpDsYV2?zo~eR5SE3Jlh=z9QOlimB zNqG?MDKGKHnJ2Ga$(hg92?4q>+1~ek9w2`e6w#EzQ0Gcbq>{A^-T~swC3A9-K2kcG z9DUSeyFfj^BYvJWN7MJ|V*ndB*rlDCG^9L)sEk$cV+47TV<$y;Xbqw-$DcUE2)$|E z-~I_mi~r0>(RCNvLA?nka}Ez)g)_HlK8Q9oLi!jE%QZwLF|UfYKP@I@cL4qApqHw6 z%F6B>`NZV;GK>{bsrAzRP|i3%1PDEXLn%ttNRy-Zz_vXwxiK(0i~(KWw--cIAa&18 z@c0%{y4LMmDKp5*8!jwtTnUa-)Tcg50=S4{x@vI;LSY^*xyp0&k*zf(@RI4Y?3;H>-vcs6-} z%VChGTVOXvCk1ArJhH}~;RTN^BhYktN+7K%16*Fx)O-!ue{2<5=|n~8yyNr+1a;{S zIW9W&`GX|LntkcF^zo9Pn(RP2$~A?T{#BROv*_Edr(jpXW2Ftk*u{uVF0w*m+Hn`i zU}t>tr2H5uB%tT@Iy2y4qyMMz_}j+R!1=pJ>|SBO=pCk*``VbkBjs)C|h zIoZC{EluX3C^Hi#tLD>Ix^j*%Ak@FB+uRdGhH}hS9rD!p9g!xEJS-V{@M5Xf5i`YR_cbX~ozfZqN zu`5U8X@GRuJgWCdTCFuo%g%!HW4jwBKWi}dlAiO*xIH_F)a#UoB^k0KzTLrRO4l48 z1m&rTfB`9wvU+y)%2COS&r|5JeuE=NFDd|mGslSar8v-8X(1?=@FY=K)W{{KRH$H04lpr z#Eo)?ksh@cn5d9Ap8}FH745V)H+Eu1Xrsn92|}@x7LWsgDn<_zWra+Q;0`RM&P2R) zOON$w`7O(RcOK&Y?$x6bK(UNlxyji~sur4F9pIswbyv~8U={V2KFPx*L}&AtVwWNp znk01@L~+3%vEXvtwBtE&QC)2HcMFDML3Kco?=3#w4@N^vl#e*lXVCZhm0fO5BD}(o zCnfEnZEBs~!9e2w1eV>07kVF|O`gJ+#SxZO0cgv7HhEWHpx17`$#pj>WohqxQtNWHmUH;SiNpY}LGluj=jNqBzucT4#lFC2! z)cbD+N#8hoKBYeDw#~B(&7hq?!b=Sj4RJwtcijp3{Vo|3f+CG@dTZrtV+Q-}srTzUf9$Z*-{<|Eovj;?QN`hKwoi@~(;whRTl9Y!?tz$45+^sZ`mT zEY}=x_ygGA=F_|L{G#?l6T)J6O%iv$65!g(r7u3!Al&Oqd!$=CaP~%bJItBzAc(;8 zgggArGmxPWq?QD&KJYYlLybLYWpp_>FUK4v)g(u4j%;$;nmV2P&tq9YMldv34R0_-kyl8MIOnWSA-}%}{(H4O8`@n~ zVG)vzS$09f^rODGA3EN9IYoO;m)5M(mE}7@2@Y(m+>IDMFzm6L*ct^yBF8VbTvNbx z;o)UAJ6;KSX`mp7Nsk6*58YFtlp|I*%H50d;!38$znKt21PYgBdvLfsYc|@)E18SR0>eMx4~m(uSI=TQrt=u4tSKL)~+Wzrzmtf`l1C zFV0IvL?8S1_6Q@Rre4TpUf7IT9|dmlfa}U1UbP5yU57mvSz>TbLd;si@LHY4`=h!6 z=I^*Z#7cbxlOsS^?AjZ9wI%gR=)?fWHo<>mk`JDeF<=GUQ|B*L90qSG`2{rh;dx2P zHl8SlEw4Zlzs+6}?sRzX?4B$i{DkU_1P5%w?7^iO;!nKA9%0@bo8`4c!u1$(vS6RE z`!JrcA#YEAjRE^BT*=B*RV)~Y(*vE3Z*nVjA8H>R=cla|8enZ7^e{=)!nLr3@j=qvG9Q<-WlWatsRApTj-Sp+g0f-_M> zz=PJJ-e&sI=Xs&35X}fwtW?QAGh4+JMW;KQ^e7lcskDq97yJmc3ofpVBmGp^1F6fyc_Ha zVpXEqyyJ>iqlL-p=l%ny#9>A^5L|_2*#sQG;4fN%EC1#L{XF-oHvm2csJ9hi7l-0b zKIweUo7epn6#h*3p|SeZ5#(I*S$McZOiMEPVJ%G_@b;K0Ad5GjLMpJ1r&gs|xvLorNZ^1sf_831=sI{c zaeRKjB|TC~y|rV|$Gu7l&^|vp8ieD&*0FfyM!JU%>G&S#8*s7XA`R$jtQ`4+CLnma zATke$6VWs>mofDTb{DdL=S$|DBnxQb_J_w1jE zbs<)jVZkGD*^p7twH!y+J54vW7W!n6e+S88g_+GB5_Y) zndXKP8Z}=f;cWD=Q6Uyj6kzf^OgSOC-?Kb*J^EwcRKk9C@o}c3fOHxP#gMyW%wDc3 z3=fqnixN1K1Y3WibK)?-z5z7f z|Ge#=|5_uUQ}sVlnX>=35X3MNVFQu`k~Via$On8F8UrNZ2Jm+`l!{uS21B$)v}=+m zjK!?<9niSlJ#e6)lEA~n1AM%p@YU;?|2p@m0wSk$!Nn%Fr2zwvecv;YrWcqhzyicB z&{)Z(tV4?DmcERT2=eD#e8T&5(7MI}olWkKZ$lK&z0f0{yS{!nn{Amv46 zlEEos!nw4AcmufVuvJFY1T$b7qn(=wej`!hUFJx%4!8pO26LwDSGT*3@TJK5pE-}$ z--1BL$M!D`VFS7gsQQTuAHf5z!xkej^gZRCi{7p|kE+goGc6}8cr2Y)W}@?vI~m&p z7L0!nCEv+qfc|nG-+;bH4_&WP5j?Z?lT>+Tozc$M8Asi}5|Rxi1qrE72pgS|An<}s z5z>9gsKd7r)xDKqgXubD@`@y1?JU&wfRo_$kJkH>!$GfX@ z6MV!=Z(Jcfp${zloeHPy2%i8Tf8g9_2zMGO{61j-|Wdv zb%Yt0Rd)y2ZToTCA1GYAF%cK<0H2RP!lN$P?;Y>BUM8cq+v`j9fp1z6{cCwI8@wxr zhY|u@sLL5?tiGT}g_^7cOwkUQ!?_M=Em)59uqIW%y|rs`iY0A-M$gU3Dt8>Bo_bp^ z5{|eHk<6nE@enm%)!IBsM)9$XM$yFEg@oe2L-N){Bp_%=jjTZ<%fU~&rZ{K>B^ZJ= z4VIOJsy z^8v|K%NMX`YDnn=U3{?ddByBI?VT&u9(aNdA&fynew*7UQJLhg#nU<)yN|z*Ctn8@ z7)rUiTjJkaUZ!PG(bImCsRw$+_KM5hKXT$Iu^A->fO(tbXzhCCSs$^Fu@(S_g=;L) z$0-=_EfTW?=p7K9^NWu>IW@K8%Yfe&u%kdp$YF?2-k9R444*@iP9n{;{!BS8ND^2q z2IiTibD@yU zrCmN@;wAYC9V*$-Jm+5w2pgvvm>S9FI`Uaz-k;y>b!_W0d0gIFRWl?qWzu5C=ZwlfDnJ)XVjPH52em zg6Zh8=34tcH$2U+kx!Zr$4l;(C-q=M$^fP0TU)bNN2;F8-iw5@zrfOvE}#yBiHaZ7 zW8_PR)5}nzUbYMI{e7xYw<)$u&SMXlXzWymV5pZdME{tQ7fp8Z#L7SuUaAL#N0{&- 
z?7jyt@%g7oBt8udS@tQr*iL9wWqF}HrrKE+kucaYnN&AK_w&++&y^2O0-jH&Nd7IF z;=f1xC9&lmD+(_HdA;E%OW+DadC>_*Y23)uz^Ad+0}q$E#6EvhFKXODyn96b*RR+P z!7C1){>%<96q)NM(410O_r*W7Tu00wKs-?4!GE!!JXg7C=Hz69A5Zh?@rRJjd_E0< z6Sw4ZhV!*9;rUvAc?D8Foh}mdcy70G zX;KNWc*UcD$FDoS5B*C+N#%p07s(E9vf|;*#7uuaOYBXE?~9#?zaIw*PGSWrZmpx5 zFtN^*`UOvz!-fRXp(}u4XW)R?I{9%E_z6nRVtt}2sPEg3Q7&a(ci3F6A&=q8D8~Q~ zEsWCcT)2B(x)0C*n4_b>d9)g4_bjd`7ZP6kO52av1?%lzbfpBL*+DwOsu9i$xV zMObD1432O(s``Y?Y&LZZt z3N@z$gS*D-W55GKF;*3EZDu#&t> zxsYvk@)Sa6KAOtR0C~v2#ubzNs`_rhakdQiO+O9;i$*SZqBSGr_1pZPXRgcdDew4d z;iU8{`4N>nqi$TJghI%WQT3M00B7}00S-{6ocZ2rJMO{-o&&Y}_ORbthOFcoQb2dm zc+=_zJioJ|S103=!VYLf{b*9cLBRyVwk$Z9>29ukMA+?oDCq9aTZbw;N8(xA^!qig zmskSt&<;Ye%DFKfivNCeziTmmg|prQ#C-fYfFF6|eUH*7sK`e70v_<;kUFp+3)F)h zUyur}E8BP2&6anJu@XBQBfaX#E42L3xkV`P8y}_1&p@v|RWx)-rGE${LJmdyg<3qc1xjuYqpTyw zX6g@3E-MDcEY|OT2cc%DcXE?gc6~_MfkA36mj!wb@K|^db$^nNbtE z4C1#2=9mrnxe*i;xL(RR>isDcO1fb~0Jq{Hjl@0gh8t6*J%8sWdO-|?%Au5F!@R3F9DAz{UAJ|TUuMYF)T=a8&!8GA z!_&UL4D|{^eb}mp_dd#SQT(pw_sYlbTp2}oT)O%z1Y=H zB?c~}#=r0w|03&$(*gn0u zw${~@fGEuR~)}NgX^zEcg!!( z>UH~LDKpCcHJK??XX2J_IDCPM^jaUFmNu3eF6u$FiC?j#*q~5@rI9ey@hRa!gLg!U zdhIrq2SKgFJ~g+|B=-H^e6kYwBCY6x-fy?)+9tRqKIRv)1RkD5n-Clbxx-5eLOl_3 zY}$YH$Y>DQpBtAZm+ZN)tpyub7kxJYW4AVr@KCzBboScoX%ZJ{oDS_t5aJz0uIv`Y z|NZV4iwZt?vC%h-)uO_w-Fn|8>637#62@!PSwqqbeotLgejr8*=)Y)XiKKo)fv^TR zGGlEm6G>hB!tkQ0Gek&IpA@~p%klnTp&@#ROA+j(@dAex0&U8K=l(|o9g_q~p`v=u z9$TF~YA)Biw{cpiFBua*V`Jo7bVqIF;aQ;UBQ!FhdnFxL{Yb!4(jWKSi%UKb;q&CB zO_=91OekiAU%PjlJ8H5D;lt7LFi=#CnWN`#?u*q&TytvELwB;lh@k|{6G-IZe)fOL zL3*M3g^@q~kNV(#@lh_Xy4x&`N3bpBh|(n>AcbTtu9VfxG(_rM*qkA5_+iw>S}k5)7J0)z4_0;0aU&exbr}SM1f!q!+(Sblyl) zU!C#W`NWHCm`|zHj7keu`J+m@v#Uf@xF?4t`7YPJ;Unf_SbYBU3OBX8Q328S=bg`5 zohO+?%Hg9BX}}bj&vE~?PeBRklQgjeqbOW4`Rzot#->FFxDeGe(i*J#!pRTOFG*We zYmL5x_E8`yzuXyyKW)P|4&tVsoaP~Y>$fi=05@jZF{amD`xV7{(PdCUfsCuqQHbyn zF=yqdzE~^Vs3k-Z#?AoxJHRBJ03x|a?z(q3yh@YgmPI&`)O1gvhvQk2wORln#KsOC z(kY=3(`eATz+ynnR2ui6R49ID2@g>t5~am|~%3y$##vjxs;gNg#7Hp2ecZeQF8K0KVg>`cMuc z8{KFC9tV<*30aHGbE=q6NzGqFotf;u)VSUWOv6!6ce5&BrdKA%Jl05$kvm@qTB^^$ z+R>lTHV?qu^=!$R|CblwYh_ixR;MK@iRwBUwm*->$ z25L-Do{xMhT5=rF$MiD}p8vGq28c)IK0k^!E}7z9*3up88$*?#E-BW;SRykY zntGD%Jb*bEKLs6hDvDY?LPSf@81TAmam=loFXD{+zi};)qd9?_UTMjW+t}-iw6}5g zw@w$bh~G)e^A5KVLnjxXqGuQPWyc%0<9L+4@84HR?cJODG7&vjY%q(DVbX$uGNDqtlr_UXM_2`n5TY&!^ zNOd3AX_07EBCI+J;ABCFA7h9Bytq&*E1uwG873L|;>YBa$$;gv8Xgf}Zz$mBjx;pa zU`^4@5oGt&hesH~E;7@+Rf&$fNK9hTUuoj%u|>J}=`k;4CXbFRC>wFhu?#29lop-@ zKH8OIlUY{n00X2XGQFS@B$YqV=h6I+ncb|~un9}c*kk^>F={w{$xh_1oxPSNt|$yo zD+fM`>=fMw)&lT9twZZ`k|A*x5&XIN>%YH#1rA`{?Oi~)Wo2y4iDY`Jf=$8YMZv#~ zM%*k%67TX*2Bz3xf9M8^KyutNc_P~4|=-TDhel`SU}!q+#j8q zo7kvaN%i{}uhoZ`yDuWaiCn#Eh6;NT!$Uj}@q`_opQT8N5dWaQ2_f;DZz3W_>r11! 
zD2u4)54HS4&o0f>CnOED#Hl%cIX``Exp!}5@gooQJOA`B0x?OJkVF(ZjWKv&gIjzl zK|&EUcV%19cJH|rQCHY^6%`bukld5na&$UZu(gZvgp)H< znWo&v>IQlw?1)Rlyx?YO(90(bu2_o)^*5S3A3snUo8380a=DGpFQG~z4Gk&Y$mIc> zk)+v$M-`4tVDos+dPX90JvcV(&WE>*pQvN5%<kA!fO)`T2RA`>esdnUM$VI z>OKK%TF77Dd6b9!@bqNa!`>MsP$RLL%s#!QS%lJX6DE%(g5aCSywMHm;cX~GQQtfB zjUITAHFPj17T&g_U+}2m8d|vlcpc698jLMF#Nc>v9aH~X+wQ<`xnaF>-P>HX`iECW z1p34jGSOn~g1wc>r(QP+nz$|IySod~@gQ0jz4Pf+Hbi-Z{^HlqZ&1tc#D$!`wWIS1 zA<4Y@>^l6ARr298UE72gd8G45c}Aas!QN*igD&vrCrOtHR};&B@!Ee2{qLoQ(G+0v z5dtw^@?%rhj$)4wLQSo<9FbVvLzH-q)O+KwaaMPalP=VdU!x+-BfhCUM|W)4(Y$s% zFzQ{Z2Mp7sE6m4Qo-LlE>I$muM^y5YmEnTamn%?_56)-#H`iCpiEr!{8yDw_mL1{y z0%YBa;@JBJXhTNYo#f&~Kg!P6M#(R;&-pR|Rly#X!u?lGmdHjgDU$Czha0K z&FJdb{nlcq#SjzYHh8B9REI9HYliw z4+z)vnMD%bQIo9=`%bo=9YSBn5T)RnSW6_17X+yz#d!6G&Ho_&2@X5Ij<);+=p|#z z3~6>3E1Qf7lpe%Y9XkzcOn5%8hcf9+3xNFKKz=?p3WA67$<7fE~zUSSA=Yy;ju=uQ!(faOF>>4pWe9S;`EFS zko9H6tv=s%ALGp8Y&0bu4XRW4^aXM;p@6_h=AZXTbE?sS*UNx0@#xZu%f(#Z7DXy>r&Swj31+Wqgh&ra}_l-Ml>!d49}vsOraTh;hrIr0sjwDK$;6S_E#@ z#WQd~yHCaM9Z62(1||5-I2C1ZJMN|G+4#UGocNnKd@{A0J|2MpiXTYz2gV0 zvXPQmkXn{Z=RE?+R9~5~+ERv657nme!s`kD1XZ1@UUGnu?K4Q=@Ofi-;fxJ4-drl6 z*%wT&irJGM6pMt+^MHQ@@_#6x{cy72Dy;nt(_v0pJ`xHMGeO47Q*5b?*yquk#41eE zE287R7@@7t6=s&j9%Vq%&T|*0Z@KcDW8m%Ar;KO;vu_oRpIk=W#}gvaQV1XO&Q;Xm zII@GVk#NdFTl1}^sG(Ty1+Z&leLX(Qj4Zwiy}83UDwxH%T7ic@I&5F)MHHO`ZGug^ z_paz{+Hl0wxyWqgL()$Qf{n%SsU{#R$=;Ax?I>$i=# zfLbm@;Ewy}+&%js;^+8xbZB4;8DIZe#BNCc+YWnT*`7TxFnP)?iU>Shd#?jW9z?}`}W zh5}LA!uYm#P}CvsU)LJ1MIWw59#C1j$j?ju+BP;<9txj5^8OQ5J`gD#yq|LQ&{e|Ne(Ms^5c%Io5+_a~CUik!k*>(f*flM#wMY&p@mTaEFDCK!VB#0qWjE;f}+QWGLlp+ODXv z==P}^h~EFR@hKi0x%+@#6?+4j0K(b};2LL!FemJs~)OC$prko}!o_S4Z|g z3TgVIZ|t&n2zEU7$sZ}C%Ad-YCftHdaLaC{ z@n{akueV7Ym~l}KInadHonHnFY&42FuRwo@hqBfi5*sMO>4V#V1CfZeLgzhmg?CFz zJ47@6sH&z&TLpQU!x9`c|guRC#mzycm;Sqjmk1#d0&-KOS zU2TjaD8hXpPNj@J&L02~Wd1FpAlqiIu75Yky%yC7fnmTdOpZ2y|3McS@HsY14u;?g z*P*yLZngL zx6V8Id(y-(bZBdfrdyi`Cv|L`1J@(?$WW+Gt@IY6q) zp-ZN|U4-o91|Pz@pQtP(Iq)gyYu@;9;w|`j^l*TCar%b}p<@TAqkP_ZyM_T-{{d

`5eqyv=sNbZS=eDEM*Xau+I2kIH{6hQSq)SkJ>X*^(X$Pk(Q zu^j_my&t>d3C;Gil6f&EZosqy_}{^N!yKAA!N>0QSy9C6LxAqXLpBE#8mQDF*uCMj z=LH_!2(CSGF5X~{&D$jqxxbLo_BusM9;^a7E*jVg>Hg^gMo6(O@MPx0>;&?g($IPb(TcEJ)$Y2O(PF+h>+kmDiDv8dt+9>5U~8g1}kLIm((MredXFZ&=^-~JB> z`~lpcQkvU`lZQCjoMDp!o%07RMu4PY59|t?mQDhFfkWWVkssM2iie+;MNPG{A9^?L z7~R-6Un3wvAjXc$5t8U3ZPd&#HGkiTaB*I|lJq9HSK}mqlVRzP>6#Nj zBfW2Re0+RnW+ruaXL&XB)V^zPgAXjdGr6!exxc@E?32N3*@Jfj-S0e1~!k^h`ee!9#p7F&uG8>4Que09uHsJ{rpZj}h07))^b)}j8Rv~VhW{ePVM z-C_(mB5dvYhoBw6g~?sg9$9o>FIa_E@1lhTwr}lvt4?Vw(^wZCe%FrvD zU7pCaj5@RSan$Y!nD>>gum`Vsq=tDIAr@aQr{b0;Sze$TR=IJ#%XHAyZ6q-LNz&>{Sn!FlQRpv8g_q59fYN{orTp z=jzc_Tx3KJp{DJ%5U%U=vLM+>NZr6RL@kd85gwc>(!cfBTew2YUpLx>{~#kc)nVVk zWy7Xl2DZ5abrhC2N(e~4reU+3=1LB}1qfxaeHuI4#|OXlWlIw$<3HrC>$m)r`&sKO1XzN(ewhkT5cK_r z&HM7bR1h-YyKz2l!?;ZZJM5d)zFqNoVpbk>Zl{kOG)c3y;Cp9eGMSB*3&7*fu2}9J z`SxEt^n>F&j;RiV{MaeA2I6XMYwYL!HX|LdE4b6Ak;lQ;t&%gb2%5Lw98rKmJt<_cn4%ts~-OiA#xp6*e&A>D()9KZYtm8E6;%HOeiLrscSELlf$DhtGHaD;5x_UkSrV@o4Jx zld4`#lsn(MGNtbP{DkO$Y2^OMfBxlAjE0jWqGL_;^SKiU3HRbbYu(n9cJ7w+v3;gMtP)y)%^IGDCbdP53{%iTY6dw@Fj zM&gZ2)Q5AZo&9wKe$nMdkuBrRuyDDRe8bGsKN6P#0%Y5HI6Ws2>`JJ6#=*%X0aV-| zR$q7`5W7C$q0DD4{<|xA*G(W+oLi0<{aM!S;Vv(jTwsu815ICsEcs*vELvbubF(27 zUF=@_+}FEX`EO=yY=?g6f7k_MoPelP7WrY7I1Gq@SU$zbRayPUVAy1=msH|842;$beV0fP>}N!yr1 zHwKv)K0HF$nO+T{J?!E4fB-e#g!09Y=$xz;x2kkl)l;8+LoVWELriZQ7qQf@F$;9+ zzfQS4a|kI%*T+etcr&Df)#1NZ8RIRz@w z^e>Nj9u}ljygOK``n8Mv-1;EyhR1}y_XEico198M(0LV+*5JO%B_-*xmr(X<-N6QF zI`edJUaRJ3LoOc9`vA3&|1`gO>GKYN+PfIZ7AbdzS%hg9L=pi6&z%r7L=VJ6rgwK} zE~wO%gpl@=4@x+14mqJ@&7Km?{|gd123Pn3>(hXOhJfM+pjYOc$b6G&oaZD6)zpsc z5+Qdvw2_(NG})8Y=Z$HfL(?e;+nC*|0teDBK63j>soG1uo1j&Zf(LfT3|ANVTH`~#k>>3$XA0y#>g)}_axglpdl%2+%pZSrXl-UHo^&X zwZNf>H(zxH0c_(uvk}zCS=0#B@WbQQJG*<=-?mGAfa^??K{tx}`N^bo#JcV7y^T|p zXTFRT97}MX_FhY;QX6B;f=>CG8|*OU^IR$JcL`0Z0(ifq)Fu7#qNhT{&G(D7MW_=r z)qhmMgq@Z9JAw4W?H9Aq#h&1^&L6iet0B#G_OR0!>oMIMe}rp{abz_GOP3-~x#Ps% z#)K7j+y9UYz1YBrpTi=|_!!ptnBUW`O+}}%_b!+JwkBvRHHi|y`T%9jd8pu`EV4y9 zQ#uAReCXveC0C@ z?k7DhU%QvScK+ibjRn6*Nq>#EZm#GDT+$6{^2eQP;Jht7VLSh$n~^}Rxn_>PXvAWr z49F9By-43c%FG}w9W~H)>YkH(`5Ziqj^ey*wo7VGwA4szTcI}mwj3u?8Dc;qWjxP7L z?@5eVIaZg^f0^I=-tp7?mcmk?UIn)56IdBGcyl6g?qVd_>(1&vFUnFhuc{!FI2rW8YRR$+R-ld$Z3X>{tXk$>1^r#W$_aQsxP%sKRTmz!NL zoU}Sa_UrZvBriEMNW<@%)K0x~*x9{O=oR+CB4qi}Q^b-ag{mE}ZqM58@Zz#G@~cZ{ zVd#Cen$-Gj_oM%arf{-Bwk#aqC#@huAzF&SRd? zs_ZXB2iF5kM(Kd#xaMh#FE7L2t<~)W{`~LZ8%y=AHa)Nz0XZ`g8TPeO7X(a$I7ZGD z&THP7cWWzYjvCfwoLpd6O|Zd$rHG4(=z2jrgCKszWVTECOV=GaAQWz&WTNEa@-U^W ziWx-Y7Z;tPZooGeF1r0fE1r&9UwS}96C%}qdHF7V*#9FBkffiAARrdE&uKjF|MlPw zM{z0}wEi&j3rF;dOv+!VO>aHN~1#WFcv^slOUwT$Tay2Nv_xTVCe5_NWXo1~=SV5`=(+4X) zpC`0{vq<`|C|DXvOQxei{Zl+a>(~15ZXkV>_4eyU_a3ZR?60wux~MPK($_fYzu?1!YA|9nQ;IkfC4+RqJ@3Z?!7WDfB8y%MS95Cv?$8GUqWiv6o6 zAXZ>KSe2)-?``ep$kP2%-dd6KLQdX_))zDW_jYDn+#(ojTr##V-m<_c^sUTbznWE z8K{IHJ!|0HL6?B)(Zc!C>8t_6l;tG zSNi6~BV$Hx2F*n$NBfM_?Kcz)<TKCKx&Z35+`;R@38=$? 
zH_r;9szGQNnU3X)j&MaXpf16eML_Fb&rf;Ku|kXxZaUJtmGE0jca~X#jg;tdjDN4# zF?-skeeA~_w-EYGwmu_w6=J;^L_Nx|savN8-jthJ;zE>_d?}1rYpA1lAhX!CdLROO zFh;$_9=AO3Fz$|}8yY^OQXHZ&q-PPjs1FeNIg6PAuzK}j!{{M>nMy&lcF21xoM4lz5UbjA1W2PPM=9nEjbJN4)5ML^q_7-nQk zsXaIy-((mP7|WIM4Y) z-8<_ZJmSK7-Gv>+&hgwXlH(d!_E{el2h<~T9H7Y%2~0P>4ezk3R3MU39Oe~*!xHKx z4CF_a^QkN%C!@?6^)vo9n=?XIQCzG`X+a+y#}XH|epbXQh9z7YEib$-r(T)b{L_dtGp;rt-4H9s2M zA4yF2bk2P5Ot^N{2M>8K_nGm&_L{(-gPuF<}JGe!cS23monbfRzv+X%fSDzw~fmGbO>h zv%d}^z6##!#sd8AIH41J(x=5_kLwIDDMG;V$%uhR5nNNfaUWT^rWv#(B&FxvKDZ&K zD;Mu?EgC71=8N6^+5H}SRhh;j|0CxdXA0UM-sxhKUgDUCH=js|ad{rO`O+1bd`o0z zR}4B`hpzYV+`aDvwcG6x#G}8;9^*|(VIW(YAC`>r>cGAC^(I1Hlc!&7+}w|M>sw zp3PwF`%)$?WXqBeaZ9$chY*=$r^uf5wqz+qmLl6&LMnONOEJ=hvPLM3Eo8~QGr#$q z-|u(6|9sE6&YUxI&VBAP*L|PsdcCgK^Z9uF$B^s|+ythh)0mLE9e9pgZ(`##u^?uF z`1D`ER*Tm*XHC()J$HKuzwl?csOn3j7c1xrO+Rkh8 z{|mdHqC*bux$8&0dN}&+MZ(geZcn-Fr)z)%npY3v1z{J?AkLjEWqro~kz9!ES(i*D z{QcfZ}t8~U+B&{%3Cv| zQc3UTu*&5z(0v#FbUWV4)R1lQ0AHOPe-oJtvnx>hpsyH_$#|^s(VwLgt$I?0hl*@* zU&CdZsN)*^7U08U=CB<9_*WGZN6)=DW`LsXG@klq>}(Cc9|5IdAKeTCK8a>Y0*kc% zz+VwQ<1EBmRv(DJ#e_e9Oj!E2wS8ncYmfORJg1#8CME3}(~s}FgOI9S%0nY0op?V7Xc z)P70xR1RePh04i}F|Coiti;6~RNrSJQ34~6Os%OxoS8VCi@<>QqgopkJfvFdR6b99@zZbwhJoi^ySnv|M%a`OXCGK8Q2E?Nu_M29oYTS1AFt1*O7%$ z_h#R|XD%q6uiH`!?fYM;X4=daONAlRk1Aiw-VCyF1DL|VTIM0q?NHZt#g0NwZa-9y zx_!&l{?UB_UuJKKqx|IxXiAQ!#L`t)t9L9{7_TOn&3XR86!Ai11s!S{j5SKv+4Zd> z-(G}%p21L(GM`2rj*e@2OBeb1#Q7Q5&OOk&Lk>J4{BZbenk|*LsMv!6NxVjq?TGC} zCv&wWB29*tQQ>w7%V)_9x4DaCY2m}W|8NCaiBYdZcFSrL z2$Z{)kBM*{&KECii)&cn#((t@>Fl)LYHd2f)Od&TD9cM~`G}U&AUuVJgZVF+kq=ck z;Wi&Cs+{euz2D(hEY#tpal6(nr=y@EEwnD;=NVq0G3tmy)=$sW{mNN1Egimg%@@ji zyQpB(S9Nv6hvACCzh?#ACsY0l-+yNfkg=3Y#SEi)NpY!?x=&!U^{e0Pmt=4Jk`H@R z=+e!)0()?1fdA%F6t)zjL4Z2S(h^U`Q)Ify;GF4Vw3u##i1^ zdbHpV&H7BQ6bqC7bZ{o$e&BSvOpkkMgylwZC|5__kn>%n!up`4ZHvnNCBSK+ephAg z*WG_=3Bxgo3Y!wV44~TaAduGhF>LjY+QxL^sNKH&D$cT%mq!G(Dmhyu1Edb*m+1=q zdy+Rpx{nw+bry#_T!|?OnnN8lHnkoh2Uw{=Qr0H=Qo4)AL#y=ljo{&%^oH`~|lA(d!fA7R1(l zEz;b32=rMiydbKwPip@6>A896`=;vvxwH}G!!fu%5yj6mkmnH#sh#>tf2&d^C;Vj& zZN_1t>Um+Z6reooL(kI+egm72zO`Qb|Qof z_m~c{#DH@9pJy^56TlPqh6~jC{PtzIpN}+kzonn<(8I-1cPD0)#Gi`QU;c*X@WCDT zor|(Ik~loe(m$*?@?~#vTq+8Me8a$Vhfv8gSutL#43wY7D&;Rw?|`>mFh+d4sO#>6 z<@!-sh~wg|>bMHS+=1ysYLf?1S9gUNz;-F^)>IDubGi3ob?ij9cQQNQ&MU?{pIe;M z3f2GERbe9d0e2|`7jY2|C(2y=>MLiN9!K)!|}7haZc=LOAh61|J5>%~QuFMz9?zV59IOaY&UlMEL{H0o*S(eTe#QpM32ekew%Dm14G@ls-J5l z3R}1@0jx~Ng+ef25W9VY(Muj(?_1Pcd$z_j_yfC6_HR0Z;mPNxD4^XqrUrq=@bmdi zFli#{SB6l9*!bqJfhq1PQ7te8DX_8w8?`q9+ZhF@RWGJ z;JD9n>&tF;%Iirrhl)eWYUSbC-Z@-rqw7?DQVZbe1DiK$%g%EzdAPnsv9EZbysroj z(qcK@o?lvGn#6(nWk7AQ>0|)SOt~V%$9j4o5`F7Fg8FRe1ko|G->F(D!ieN+akQTpM zhI(lXWe?j-Xi|FtkTXJnNpkCTwA+jgzqM?Ah{KBU-mx6@d`sU%Nc7`)B*|-Me5N#g zx8%PicHtJFPgv5Z=@38jInwIZs+ITVnr1~R`a4vSY{T4-HpSHE^jmPEKz>G*Gu_*- z$sFG~u_q$`d3aVK15#R2y?!IZ?of4@+^t$3`!Fv}@Q!YXN>T&(9E4G9GAPKExSIjQ^lVA|n-hGkja zcV}B>(~zJyyC;r- zm(q^>@FCQJ^Zf^J8|c0-IblN?ncSI!#wR8BSC=cg|J6Lo$~l>c3wqZz$oldq250L5 zrToeXuanC*82qEPV*BW)(COVqX=^q>F;$@8P@vDzPW3c~7&A0!e-WK?tFX^~sbut) z?^&b6AC>Ma2MnD1jE#LDdB8Q9C8W}UztB3zGTZBZIz-$oi9M#CA5j0u!{Qj_5&ZtS z(XOPSP?ctzictp5oZG6lMFAc(KBk!9Nv{~aNlk3{>s)S>iN12Ffv!m+B-{bIL_`Fc z*c}qVFa2n4;jTR1R&-ONv91&6kzhf$i?>5T+7PE%jysi)*bADWVmbcrSd|{325r`$ z0;PwuI_`IcUkPz!T*M6d0?N~0N$81|2Y!6xsOjV=g|x@tF1)5c@-Z~Eeqwhs^iiR9 zh7Y7LaFE@ducc3~$^#6JIpI9A;EvplV?5km>wiOXth(Fp%bTOtdTiHJC$9Wgm;7JU zlo9y}&iQaQ#XxujZ;pigP_~3Yc9EI~??rYZ3shlyv_b+P6^jbU!q44^cAriQQ7Avld4JEZ#`uTgUS!K;O;Gi@#pe-MDuVW3wG`3Qtt)|`8r+4=!_NvJ+I z>V)DudiGc!kj37ArHH1y{fe)&V?ju*X(B;vqUq=qegs=k_2t?dxeVAiM8w`DWEg`W*vwWzQcsvM95j-Xjtd!B71} 
z?h<;w>e}MgH69j()1^!qN2z2-eD*@FuP|zC$I|`Ct&U);@Mz zLxW^}+AG4kPG%980j)=M!`;TRw>)(j*+2243{&a*|t&t)T*vp~_ zn-J8#HW57k~SWBbBf*E6gOZ3h$qqU@8r{TJ(f}c z@<)(zEk7L8PY&u<^Aeqq^LmwLgo@80==?>RnH+p%;<7R@`8sVg8N*)x0qAj{3JBDf z)`F$^5Oqfm+2EH}Wwr{(C%YmkLMVx9MS`R|v~5gy=}8w=vI3Lm5R<%a_{l@y%zGvF zegj35V(8v0oJ=ZYB1*TfVUdsl)s%h|lRJU-3?+z+3g5f<0A-oEdYBdBmwZjncNIXX zfg*dl9*Rc-mmq*Rw}}Vz`5h)y5CH5X=(@B?wwi=+lH?L2$Fp-YVY0iO#vf|qjHm2Z8jEUSUAi)CF#yQud)F4r64RFe|T;p<{XP zeCNGGhg^0uEnl2wW~U0>6a=IwoE}YBJ*3K`ohh_Hif-k_lf9SBp0C}>%EgH{vzBtA z0o`CPsiRdtX8{H4&3L=w!JeYaKx$TC0d=o3w3{1)(v-+jQ679|XTM=D2%B!xK!34h- zPH(R0ltI-G8+ANb!E7U)7r_d4%2hEwyJzk_p*o!ojaTJiWOI_(L{QVf$^ zpK}o{o|khur=2*OWAl`5YpJu5)u%IeUgI7&@Aq*J;H*L_^X|of&%jKCXs1faw``yZwf{Hi!WBp6{H3!eea_;=i%64eT=InE z4pZ{ea~uA(Bh2qaU*5TWY%ku|TkNCm$9f~;kROg}S+N(2n4>@1fy%~(JM0;s9EqV~ zjjC2e)W*2O*r_x+eoM^zy;7V^8<4Fz%=sHH@Fa3+Mkgq+`cNmotC^eb9nt-p>22<$ zoqs;x0VM?0JkD|?CI8`IUC4MtA=YGYn=>bTMgR3mJF5QU-K-9@%e*U(l{ zcz5MaNMs@74HFe^lTZCG)Ux4Uy+Cj2dp^Csq7-GmiJeJe_!|#Mcc0tofZL_<>Rg`h z+vJ#K-RPR{|Ef=(?hF34+iQP4yf~1iBHORpD>S%wRcyNX0y%sVqxMM!z4!P)HO&b+ zu8aKLM7v5y97Dk~%}L?DRxWGA2$`cPrrcjqN)wo17m{j5-5+-U>?%AYCepocar1FwN!J3#MOItT-r% z8C@t@U3Qg!Z}z=OlE3521DzYE!)|}H|GQ)Hx3A0rm`g5G2Fe^E@j7?%fKUG1%>a7# z@5A7%zv}gU{>lCO_n%~CPDARixuwiFb|siZ!m7YL7xtu4SGsp!jW@D0!CApa-19r$p??b=>@rW1C3Q z<~7-(^V^s%u36C)UV@dQgckD#Z$Iv2nB{TqCiFVyeJLj9m~Ut61i8mx(|5dJyL@{l%ekBhF@oKspbn;Ph;dEz{UWxoG)WQjWZbd%wY10`fdXkY`=1}f(fRq&)M7} zPIw!dUf{>l_{Get?#7Dk3uB1b4ia)B4FseY!8X<(4wfyT%#kNEi;QsP_DKS zC4IVtwaXw)2c4FMGrtRw=@_8Wse_L=Iq&b$d(Fqn+dBx%o&7(Jt&17mT|p{Vd;=#V ztG4vDn}}=@BwD$>R|c+9Rnt6xuMyFFfyFkSxY>9N7s+*Eek?2N0_Jyk=@biSqcdRf z=I5BL;tyRfV9kD$xj*tGlh>?rC!2|dWP~?$hVC;Kb(nr=Q@=eCENZ?qF&Dv0sXLD* zUS=jeuSkUIjbqJDsVL!vC7qD{E%$Q1TQiE$X54CFF!T3R19V!Lm*donNLS5=YW0}l z+hPCs#g(ZCmxs1>$Lw1L=(_XUw(Iou|0TM!xhCXn>S2LZMd~~SeWyqHm-?KvvM!2+ zS}#bEg{9qGOmK^vy5Vb5FgUjbdB(}0VjDU+OOv;aj{wrv?!^^|tdHLQqwxl3)g|#- zX} z9|JE8WUEFBjp+9{`ic)(36$%j#Nc58N}CoM&*6>5zaNMcCpd51QE-;yD*g6B?ZUzD zh5DZzN7~cn5qy)E>0 zW4a+RrV_Z}twNCvohzNJ8Q;Mj=l)gSF0^{O-77&NBiL?o@EArl$) z7d5KG{f=~G?THB;`1PP6n_b_?EQv~DHGt?+F$K!L-wCPIIOX_J>Q;k6aFWx zqe-WP{pNSLK$1R!!JpteeC?d-<1k&_#&2De(H5V@PJ5SbFndGjH~sXk+mLtB*W16v zCeV3v!t6yE!xvI)IEMlYS$>!q!j><;Z05c*+|i^dVrJ@YTutyCUWT?Pj?^a-z)0+>xN9dFv}=D z$=#Sf0yf+PNYCWW1Ej~k*L1y=4aA+)*BbO!yFWbrM-q%E}p8X7JTTve( zIo;#H&bGbz6*XzUX8TI-SP_d&_M0@zUINsy*I!}y6X@NBV}iV?LOfuR`4zL4ml(3O z{j*U95YQKR{htzjv-%>+& z{VqFt9dNgo4y@{gjc$gSDn_=n$4LI!)MolAi|DY^120D-gwJ*PvYb5WpHyout@UtO5>^l7-Yabr<`yvdIj$N#=@|OdTOo9W;R{(%Em-gEje{)nKiPsT zWbQ~aTr$TowTomBnw2UtnX_spa!!oI`cbI&kr>K&N|3SzWauOXs3uUKTejiQq)PSr zaGW&?vRZMDQAR8ODIEZB`biXqiqs3lupwE^3d*ip}p#<(-EF-lNS80d~;bWZdoI){^T<8mbVGKaAM3zg$| zSxqWG*lsM_oK``P{*;7^H%r_Z+I{ORkril`EJyVdMD9Pv=R(wul9o z2p36kvHwmtGn{5L$ivDqDI2nW@AuqQ>&-!h?vMi4WP7)+zWm~BPdNP;_q7sOZQK>N zP!ZJ`Vx!2%Je|B?aS?J4$t_nk=+UG#ZuxlvwugrgTk=tqN$%>c@Y3dHkp_2xW5YI!_r(weuW~!NY?S7gjn1*MSLFu;ZSeOkOybwFPV_og%dFc!!*Xo##m!g21~Z(hAZDy}H?7E*J%`R0#hNLmjJmR`p$k6@_e5iF>|~M*%ep_nET60dVlKHFzed~&SL2Q*#f<~ zRDDFiN#8Y>?<%6~^$|})T^U?;m+z`J^wQI<#V56$@u->2Ugby`kh~1Nv(>+IJtj0` z#7?DF2IahU5UMsKHXTDhTIu^v@X(sO-E~}_xDJtH%(&IT%B@UYA!aa@qse%wDBO?H zPps|fcX4KRy!dB9y2Go`MK3pLzbMrYA()k;zPFpRfY*ZcM6tCzM&@3U-e zo*#yZOMY?!N>BF#;+cru`-QW8_fr?=IIsgVN`iE>jK2h~;+y6pO(D=`%xoMF6+Tbp zf3-l(wLSk&VfSEa+(BM|EiSs5teI=+p`2|XlbbUr)%UlADY+OK4lf*i@vIY-tU?qW zs}C~0cX+3_p*M}2@)6vTZEOg+^XBbaa%V5|@$eO-+Wx*cEWddH(xX>z1t>89i!Oad z;zbwtGI&}Gz%T}4osG7ReO!ySN%$=wwsK-;m2)z?ycVU8#v1Qm=4~l?+ zPrg?DRvCKWSa>oaSr1s?^+=$;8^HH$kdAR>elb(z2G2V3%h++pjWyX6&42VOHqtS$ z;^AdW5u!hlc)5GY~FcIVFElqwL6jC4TJz_|2Rgw9p>)}7$OB~?yF)aCvtRed8 
zcY+`sfNJ~fhNKyh97|QT1TmFIkc#Nkn>(*|jUQ;HJ_UXUeWDdIJdvN7{s@wfu0np} zM?4KbzYFQKE&vssrPwYfby>8gG#X72910Sl%rSL5#hD2x3mEELQEKOo;@4o^VR?Hn z`1gqmB!m9i5cTMdGN;5MYi!M=0ej!o)K+CJLlOs23;TOUWmAE%)Dv&E13u$@LQuWi zV5~${z*r~K38t&cF(hC0&9!KEW*fW1s52u>8*ig7|2{0kPwn=W&lEmoi27CvTmu@) zF~?{DS-aM!fdiYrK>wlEh!prLc%B=;-3H#L1E+9`|EN5(MUeW`L zU!rk^5r@IqK>=8JeeBO&pn|4cV7`8ho&WL?<*<`)PpdVE)ZvD4`l#nD^XaINury0F zsFlMANM#vfh28-K>kANvdq13`N?5Edmlj)9uB;dAn_C5Y|50e188ON9Rrku zghuWWb&=D%45t~0ZLgcRntJE!cF#>cR2U3{oLnL$$&guSLVwT&=ZFuL&DtEu_MWcM zT~t+w4fHo~_7eakZ%KeGVl5Kg9M%ASmHdZZcBH@cbc+>7N}Zkq8Hs)){T?U`Rz^m zbuqnCT^(@tSpbM-i5pk(6o@if0Utp^6eKxSBXjrfaoD=n#eOUtyuU{YGB^zkaXJRfUGH0Xzs5&qI`xisl1Cl) zj6oOm@91~h$?a9OipL4`WOB)}l8(^)gE?`EuU@tI;Zg=CsELUe&gfE3)@JIuF(nhU z()lOv<$=sFy5Aq_nqvokI}6v$Fv;%V26hz?=a>#8TbKrgI9SO?NZ$?}Igg*1{X~Zc zy0zvR(HsXntM&eL`XDD#BSJSCq_&UlFx6DK6+!H`8o}EnI_!<(W8uD2)RJJ7I!!Dn%L}`X(mqg2*Sqp^4SeARWM6< zzIU|Oc(qy16n#vr$mzrE{X!OKXEFFTp-VuVemdgT-*yY*DVfX;Bl2nCJ}J2 zNvv8Ebp6F&d?a%yG0t;2HC^Mx{k((Ijx@#h!L^*9KrsX3&?eupb&u}ZX<>L*L+CdM z=!_;-jpnP%>O)+|k#{2WO?kO?MpzN#BfJ(7X$+;32GhAuwOTBr4`a7i42&NfpQoSp*2gmAPpBYO&Oi0;%|iz|1X^Y%UupHtNhfaeY#e5Oo<{{>yN01udjb8mJ& z&=o_b9kYG`^uB8%OYf3en{KsS zjy$j=mC@c_*ncFlCUOYq36SUUwl_tH3g|9Xjj!3P)=y@>+UXDMJ?+snn@M>o^6rZd60maUm^@D04zVcZFX{ram)QA}p7Tup z1xDzXdh_=pd7%&WnkCu9{7>QRr~PJiqaK{N0_>F7O;(@W#LQQoHF-UKwmGaAQUvr# z@(l?pU`xIoS#8ZuXDjt%20!}R!FjmdIz?&%lo`gm%W&PI|8>Fn9k6XHV|2H{F%=xiA0PRPw6rls~%;tFWDFmWxF%iRZhMn@&QS$c)!(vFB+0PZ@S`|h#fNE)Dm}F}- zMVt5g)4Pap7*PKsJtQpnG~_YoTa7BJF$FHkX`?-EIIz>4(MAWe56EA4j2QPR~bnvyf>ZEGoira=#KvhI8-QUsWZYt`^jw8~dLp^KiXM|4w zwm<#es>{7%ovvW=9bxEkevg=gE_xlF|9?cVd*6!)$zh?ys_59?*nrC#<-4(K_evqV zz0;!A8FQo5Bjo0S^X*Qk4_hp6*^ryq`QR*Q_mCx;EC9YkwC?<)sn}yS&9l2uicV54 zQFToL?qn+F_tunkTs1VM=#vMRBmug3fAII^{VxI0D+}~)5%ag%tP1%qJi>bBo86?Q zFF_K}*CA+!G0o>0|-UyFLsVnn*5olsrOv$C1X2 zjA)qS`YYvuAzDG`kkA-rrw`7+VGeq|9$5dLu0eP&feb-@!pYU%BvPjLkp54W>XJp}$I(9y-spaUAh0G(a90DAH= z*(la|BD5A;z=W!0MEub~E|LCL^Cc%pSs_4=A$RbxvNx2~XU}*WMSr@5yl2D(EDZNLQ0KqgdAK|M zq3l+8g5kY1z31ANBF+oTLq{vGH3eJPS?vE#rY6czQBG#AJ=48mhv9bQ*Xfm#=)>vI zc$mn4IP>#ey`M{764PnP>(4hlbk32lo1u7P%pt+0)4+nUXpyY`4V%~?cVRQk+?eUp z`0>$K5R@;$NVR)iKoM`nguE!4(Q-qic!bMjh^)*#Zq&Ctr%f_J-k^MP z9CnN0hZ_v_|Am4nhl%>_mQhSC~q~&wY;S01P1dHRuB^S zzU<)X?VY%Bgw%-NA}Dt~fKT)M*?D>|GV2~C0g!Y>U5Ax<68+C8?HzeHS(O)Jf%`a~ zR2@-tk+%KOa(*<&I0T)+*_UKa%kCt7&JW3Uy1{%%tn#bw~)iG5?lGT z;?-~dzG!0@9(^r3#aunSt*5LHm8v7M(l|yH91)MnFQ=`(VUICGE=|$$_NV^#oEkq#)A=V_M`MamBQp-F0?WUp9vEPt=9oUG>1|GA-goOhFQZ z0%ZmQwqgc4yU*~vTP$ewUb&wVTO$F)vn zxap2y$vudYZryzA`};^73$j2?zXRnkQZe51Ybwm}k)|g?fa7k&rq%wca&q!(tX=F` ztne@O{_rwh6A^T@EF386JyaoAq;i-kqK|Do!NiO~?+=^5v^OV1%6F|CMgwDw`IDh0vjKXvfeh8#Aik18lYU* z$lS0nu1jty>IKNPP#y4d$dJjWI_UvKA+6I57gct6K;6_Pc_4Hv+mnuOPSu?u8LT-?LovX(2mnUw= zCnA}3Pp6n*PB!?Y{b8t$iQb;FN|U{eSC+~I4nej1wlyx$jeamIRA1(REP|3EEvk0@ zlhdf#6W%r3EzjOE70WVZ6h;VQ{-d{Q$-6_lA=GOjNALJ&wZ0cLXp1?|JoKfq^Y+IrQaLky0U?z#Q|@qPcRm0H2S4bP*6&A3BK_5c1QlS0UszNhD$OyXXM$LU z8UrSsQ9t}iYA8}>ZDgq>xmGNwFBA6r(gu>5i^?vLhWNT>J($jO(#-b-rvAbXhfTDQ z^s(n$8J7`1LcLzG=52c4$BAyW;|8aZtZO-;Z}M=Z%lTG(gm31|CPk=-MwD~rr-(iO z{9j;(U}YR-s!Q1&c##5m7YUM7I9?up0j3&qDH^b}sFzGwLqnZW#Y0 zsn~#e`M$yxaoU{gNU6R+O6Q=)Fw5EL5WoZ02+G(fTToA%c*;*X*@sS4=9s*u{CCo=~34G88GhTBGbgVxlw3H2WBeC4jR&PLBsY$HcBCd;M}M<402zxZcj) zz)z*VzxGoF!_85Y6>*q$mimI2VvWSDt_@y*zU*DN`y-nJ^yUOIXt^Mhr6z6^;&W(u zRvp9-(nw|HZD{hcfH#$Z#QqajVv-FNNk;8s>u9$*$ay;li8U3MaNftbGPb}=5*>?E zTavmXVj0I-whrg4UY`r(d1j#lO|YQvkJ&kKnFDCLxs3X<3+>1flKiMWC^B$$}2z5Rq2M% z?(k9LapGJu<`DT`{p&xznkAr=5q*7<02ED;Uo86P^>rN*&}8Gw{1?cD0651+YodYB zG-OnADVR5s4m{U}c4y=Vv_`c$RbPJuLN88Jo@?k@aLb0tN7NdvT@AYjB~d^M(?}*h 
zSb`rFvX9rjkE&FLS3rX;B_xn2*$IT`omK1}81hpmfm&?3G+#67}#xVp!y*2*1_u z^-Y@k*b@0c09^P5$sd7!z3RRtKQT=67w}#oc#qUhpYr;dWhP46DFnn2Lt3dfq)fmC z)yO=H3=&jl0za6eG%|sA;5V!HR}7aOUfq#?g3AaWV?q}>2bro90>VRl>-D00+wmU; zri=CK1Iyzne*1l=F9qJFu-3E(hHm7C_K*F}KF}Jv?L}M z23~!CeUecoe6)FQ6|H4+N#?=5pY1DmmeyVz5r<#meU(zZj79}rA2G+Hk4Gm7>k!~} zqK}4k@>B&LoND8wdZFdiUr^YLELT3-ANbzTOldz1>htz1SP(DTyE>1mlZ=jjx8*SE z1=u7<4sdaHCD6Ve?E_t*+0F@5KIcjy+6hD^j0QKfwCoITFl?^AlARv^tlbT;B1`LU zWp$?_TI4idwMQ7I4mX)5F^r?`WldUKkDKg476*5u2KtxATlyykmkiE8h&a(O-FP(2 z{}0dD@l~9c2q%_Z7r+Zj z(MnkiH8e4e(R{^fwnK6s`d)}|A`$PZFqM{ca-vkzaOvf$e>6pIEdIF`j0*a8Q;s3@ z-D-0UYV0utJ}5PP{5R8(SNJK}2h7@x9LV3XilB{tt<(Nx>nCPfqjReKnBoN4A)sMTuSZ4=`m4zMJZ%il1P zxsHmoOibLwi)xyl$Q z_T4@fRARt+Miel`qkuJi+(1)gPd2t1TB8uT^8^fN;ALXJ3pvtTdsY9_1jI}K8+{tg zbR$3T)xV%={{Q|$|CAk%r+NZnoVVSn>ZRk%= + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml new file mode 100644 index 0000000000..a8c859d8b3 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml new file mode 100644 index 0000000000..c7b4b2e4a1 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml new file mode 100644 index 0000000000..a8bb4b2f64 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml new file mode 100644 index 0000000000..1627ed98c0 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml new file mode 100644 index 0000000000..b327a544f2 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml @@ -0,0 +1,55 @@ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml index 089acb572b..ec215e63ba 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml @@ -1,44 +1,237 @@ - - + + + + + + + + + - + + + + + + -