From 28eef00411dc63e5d0d5b95d26a965c8acf5c408 Mon Sep 17 00:00:00 2001
From: pytorchbot <pytorchbot@pytorch.com>
Date: Fri, 23 Aug 2024 11:35:05 +0000
Subject: [PATCH] 2024-08-23 nightly release
 (6c26a872323e13c723e1544282fafb51f880742b)

---
 .ci/docker/common/install_linter.sh           |   4 +
 .github/workflows/lint.yml                    |  17 ++
 CMakeLists.txt                                |   4 +-
 README.md                                     |   2 +-
 backends/apple/mps/TARGETS                    |   4 +-
 backends/apple/mps/targets.bzl                |   2 +-
 backends/apple/mps/test/test_mps_utils.py     |  10 +-
 backends/cadence/aot/compiler.py              |  18 +-
 backends/cadence/aot/passes.py                |  26 +++
 backends/cadence/aot/utils.py                 |   8 +
 .../cadence/cadence_runner/cadence_runner.cpp |   4 +-
 backends/cadence/cadence_runner/targets.bzl   |   8 +-
 backends/cadence/runtime/TARGETS              |   6 +-
 backends/cadence/runtime/executor.py          |   9 +-
 backends/cadence/runtime/runtime.py           |   2 +-
 backends/qualcomm/tests/utils.py              |   4 +-
 backends/vulkan/partitioner/supported_ops.py  |   6 +-
 backends/vulkan/passes/custom_ops_defs.py     |  37 ++++
 .../vulkan/runtime/api/containers/Tensor.cpp  | 204 ++++++++++++------
 .../vulkan/runtime/api/containers/Tensor.h    |  88 ++++++--
 .../vulkan/runtime/graph/ComputeGraph.cpp     |   7 +
 backends/vulkan/runtime/graph/ComputeGraph.h  |   9 +-
 .../vulkan/runtime/graph/ops/ExecuteNode.cpp  |  15 ++
 .../vulkan/runtime/graph/ops/ExecuteNode.h    |  16 +-
 .../runtime/graph/ops/impl/Convolution.cpp    |   1 +
 backends/vulkan/runtime/vk_api/Shader.h       |   8 +-
 backends/vulkan/test/test_vulkan_delegate.py  |  36 ++++
 backends/vulkan/test/utils/test_utils.cpp     |   6 +
 backends/vulkan/test/utils/test_utils.h       |   6 +
 .../vulkan/test/vulkan_compute_api_test.cpp   | 105 +++++++--
 backends/xnnpack/test/TARGETS                 |   6 +-
 backends/xnnpack/test/test_xnnpack_utils.py   |  12 +-
 {sdk => devtools}/CMakeLists.txt              |  20 +-
 {sdk => devtools}/TARGETS                     |   6 +-
 {sdk => devtools}/__init__.py                 |   8 +-
 {sdk => devtools}/backend_debug/TARGETS       |   0
 {sdk => devtools}/backend_debug/__init__.py   |   2 +-
 .../backend_debug/delegation_info.py          |   0
 {sdk => devtools}/backend_debug/tests/TARGETS |   2 +-
 .../tests/test_delegation_info.py             |   2 +-
 {sdk => devtools}/bundled_program/TARGETS     |   4 +-
 .../bundled_program/bundled_program.cpp       |   4 +-
 .../bundled_program/bundled_program.h         |   0
 {sdk => devtools}/bundled_program/config.py   |   0
 {sdk => devtools}/bundled_program/core.py     |  10 +-
 .../bundled_program/schema/README.md          |   0
 .../bundled_program/schema/TARGETS            |   4 +-
 .../bundled_program/schema/__init__.py        |   0
 .../schema/bundled_program_schema.fbs         |   0
 .../schema/bundled_program_schema.py          |   0
 .../bundled_program/schema/scalar_type.fbs    |   0
 .../bundled_program/schema/targets.bzl        |   6 +-
 .../bundled_program/schema/test/TARGETS       |   0
 .../schema/test/test_schema.py                |   4 +-
 .../bundled_program/serialize/TARGETS         |   8 +-
 .../bundled_program/serialize/__init__.py     |   4 +-
 .../bundled_program/serialize/test/TARGETS    |   7 +-
 .../serialize/test/test_serialize.py          |   8 +-
 {sdk => devtools}/bundled_program/targets.bzl |   2 +-
 .../bundled_program/test/TARGETS              |  21 +-
 .../bundled_program/test/test_bundle_data.py  |  10 +-
 .../bundled_program/test/test_config.py       |   6 +-
 .../bundled_program/test/test_end2end.py      |   8 +-
 .../bundled_program/util/TARGETS              |   4 +-
 .../bundled_program/util/test_util.py         |   6 +-
 {sdk => devtools}/bundled_program/version.py  |   0
 {sdk => devtools}/debug_format/TARGETS        |   0
 {sdk => devtools}/debug_format/base_schema.py |   0
 {sdk => devtools}/debug_format/et_schema.py   |   2 +-
 {sdk => devtools}/etdump/TARGETS              |   6 +-
 {sdk => devtools}/etdump/emitter.cpp          |   2 +-
 {sdk => devtools}/etdump/emitter.h            |   2 +-
 {sdk => devtools}/etdump/etdump_flatcc.cpp    |   8 +-
 {sdk => devtools}/etdump/etdump_flatcc.h      |   0
 .../etdump/etdump_schema_flatcc.fbs           |   0
 {sdk => devtools}/etdump/scalar_type.fbs      |   0
 {sdk => devtools}/etdump/schema_flatcc.py     |   2 +-
 {sdk => devtools}/etdump/serialize.py         |   2 +-
 {sdk => devtools}/etdump/targets.bzl          |   0
 {sdk => devtools}/etdump/tests/CMakeLists.txt |   0
 {sdk => devtools}/etdump/tests/TARGETS        |   4 +-
 .../etdump/tests/etdump_test.cpp              |   6 +-
 .../etdump/tests/serialize_test.py            |   6 +-
 {sdk => devtools}/etdump/tests/targets.bzl    |   4 +-
 {sdk => devtools}/etrecord/TARGETS            |   4 +-
 {sdk => devtools}/etrecord/__init__.py        |   2 +-
 {sdk => devtools}/etrecord/_etrecord.py       |   6 +-
 {sdk => devtools}/etrecord/tests/TARGETS      |  12 +-
 .../etrecord/tests/etrecord_test.py           |  10 +-
 {sdk => devtools}/inspector/TARGETS           |  18 +-
 {sdk => devtools}/inspector/__init__.py       |   9 +-
 {sdk => devtools}/inspector/_inspector.py     |  16 +-
 .../inspector/_inspector_utils.py             |  12 +-
 {sdk => devtools}/inspector/inspector_cli.py  |   4 +-
 devtools/inspector/tests/TARGETS              |  41 ++++
 .../inspector/tests/event_blocks_test.py      |   8 +-
 .../inspector/tests/inspector_test.py         |  23 +-
 .../inspector/tests/inspector_utils_test.py   |  12 +-
 {sdk => devtools}/size_analysis_tool/TARGETS  |   8 +-
 .../size_analysis_tool/size_analysis_tool.py  |   2 +-
 .../size_analysis_tool_test.py                |   6 +-
 {sdk => devtools}/targets.bzl                 |   0
 docs/source/extension-module.md               |   2 +-
 docs/source/llm/getting-started.md            |   6 +-
 docs/source/sdk-bundled-io.md                 |  38 ++--
 docs/source/sdk-debugging.md                  |   4 +-
 docs/source/sdk-etdump.md                     |   2 +-
 docs/source/sdk-etrecord.rst                  |   2 +-
 docs/source/sdk-inspector.rst                 |  18 +-
 .../sdk-integration-tutorial.py               |  18 +-
 .../website/docs/tutorials/bundled_program.md |   2 +-
 examples/apple/coreml/executor_runner/main.mm |   2 +-
 .../coreml/scripts/build_executor_runner.sh   |   2 +-
 examples/apple/coreml/scripts/export.py       |   2 +-
 .../apple/coreml/scripts/inspector_cli.py     |   4 +-
 .../apple/coreml/scripts/inspector_utils.py   |  15 +-
 examples/apple/mps/CMakeLists.txt             |   4 +-
 .../executor_runner/mps_executor_runner.mm    |   4 +-
 .../apple/mps/executor_runner/targets.bzl     |   4 +-
 examples/apple/mps/scripts/mps_example.py     |  10 +-
 .../LLaMA/LLaMA.xcodeproj/project.pbxproj     |   4 +-
 .../cross_attention/cross_attention_mask.cpp  | 169 +++++++++++++++
 .../cross_attention/cross_attention_mask.h    |  71 ++++++
 .../cross_attention_mask_test.cpp             |  71 ++++++
 .../flamingo/cross_attention/targets.bzl      |  25 +++
 examples/models/llama2/TARGETS                |   2 +-
 examples/models/llama2/eval_llama.py          |   2 +
 examples/models/llama2/export_llama_lib.py    |  14 +-
 examples/models/llama2/llama_transformer.py   |   9 +
 examples/models/llama2/model.py               |   2 +
 examples/models/llava/runner/llava_runner.cpp |   2 +
 examples/models/llava/runner/llava_runner.h   |   3 +-
 .../executor_runner/qnn_executor_runner.cpp   |   2 +-
 examples/qualcomm/scripts/export_example.py   |   2 +-
 examples/sdk/CMakeLists.txt                   |   2 +-
 examples/sdk/README.md                        |   4 +-
 .../sdk/scripts/export_bundled_program.py     |  12 +-
 examples/sdk/scripts/gen_sample_etrecord.py   |   2 +-
 .../sdk_example_runner/sdk_example_runner.cpp |   4 +-
 examples/sdk/sdk_example_runner/targets.bzl   |   4 +-
 examples/xnnpack/aot_compiler.py              |   2 +-
 examples/xnnpack/targets.bzl                  |   2 +-
 exir/_serialize/TARGETS                       |  12 +-
 exir/emit/_emit_program.py                    |  27 +++
 exir/tests/test_joint_graph.py                |  20 ++
 extension/llm/custom_ops/op_sdpa.cpp          |   4 +-
 extension/llm/custom_ops/op_sdpa_test.cpp     |  15 +-
 .../custom_ops/op_sdpa_with_kv_cache_test.cpp |   9 +-
 .../llm/custom_ops/op_tile_crop_test.cpp      |   2 +-
 extension/llm/runner/image.h                  |  16 +-
 extension/llm/runner/image_prefiller.h        |  24 ++-
 extension/llm/runner/metadata_util.h          |  15 +-
 extension/llm/runner/multimodal_runner.h      |  21 +-
 extension/llm/runner/stats.h                  |  26 ++-
 extension/llm/runner/text_decoder_runner.cpp  |  23 +-
 extension/llm/runner/text_decoder_runner.h    |  24 ++-
 extension/llm/runner/text_prefiller.cpp       |  25 ++-
 extension/llm/runner/text_prefiller.h         |  18 +-
 extension/llm/runner/text_token_generator.h   |  26 ++-
 extension/llm/runner/util.h                   |  17 +-
 extension/llm/sampler/sampler.cpp             |  10 +-
 extension/llm/sampler/sampler.h               |  15 +-
 extension/llm/sampler/test/test_sampler.cpp   |  17 +-
 extension/llm/tokenizer/base64.h              |  17 +-
 extension/llm/tokenizer/bpe_tokenizer.cpp     |  13 +-
 extension/llm/tokenizer/bpe_tokenizer.h       |  25 ++-
 .../llm/tokenizer/test/test_bpe_tokenizer.cpp |  11 +-
 .../llm/tokenizer/test/test_tiktoken.cpp      |  11 +-
 extension/llm/tokenizer/tiktoken.cpp          |  13 +-
 extension/llm/tokenizer/tiktoken.h            |  27 ++-
 extension/llm/tokenizer/tokenizer.h           |  32 ++-
 extension/pybindings/pybindings.cpp           |   6 +-
 .../training/test/training_loop_test.cpp      |   2 +-
 pytest.ini                                    |   3 +-
 runtime/executor/test/targets.bzl             |   4 +-
 schema/targets.bzl                            |   2 +-
 sdk/inspector/tests/TARGETS                   |  40 ----
 setup.py                                      |  12 +-
 .../extension/pybindings/pybindings.bzl       |  12 +-
 test/end2end/TARGETS                          |  12 +-
 .../generate_linear_out_bundled_program.py    |  10 +-
 test/models/targets.bzl                       |   6 +-
 test/run_oss_cpp_tests.sh                     |   2 +-
 183 files changed, 1643 insertions(+), 615 deletions(-)
 rename {sdk => devtools}/CMakeLists.txt (89%)
 rename {sdk => devtools}/TARGETS (54%)
 rename {sdk => devtools}/__init__.py (57%)
 rename {sdk => devtools}/backend_debug/TARGETS (100%)
 rename {sdk => devtools}/backend_debug/__init__.py (83%)
 rename {sdk => devtools}/backend_debug/delegation_info.py (100%)
 rename {sdk => devtools}/backend_debug/tests/TARGETS (86%)
 rename {sdk => devtools}/backend_debug/tests/test_delegation_info.py (96%)
 rename {sdk => devtools}/bundled_program/TARGETS (88%)
 rename {sdk => devtools}/bundled_program/bundled_program.cpp (98%)
 rename {sdk => devtools}/bundled_program/bundled_program.h (100%)
 rename {sdk => devtools}/bundled_program/config.py (100%)
 rename {sdk => devtools}/bundled_program/core.py (98%)
 rename {sdk => devtools}/bundled_program/schema/README.md (100%)
 rename {sdk => devtools}/bundled_program/schema/TARGETS (84%)
 rename {sdk => devtools}/bundled_program/schema/__init__.py (100%)
 rename {sdk => devtools}/bundled_program/schema/bundled_program_schema.fbs (100%)
 rename {sdk => devtools}/bundled_program/schema/bundled_program_schema.py (100%)
 rename {sdk => devtools}/bundled_program/schema/scalar_type.fbs (100%)
 rename {sdk => devtools}/bundled_program/schema/targets.bzl (93%)
 rename {sdk => devtools}/bundled_program/schema/test/TARGETS (100%)
 rename {sdk => devtools}/bundled_program/schema/test/test_schema.py (79%)
 rename {sdk => devtools}/bundled_program/serialize/TARGETS (76%)
 rename {sdk => devtools}/bundled_program/serialize/__init__.py (97%)
 rename {sdk => devtools}/bundled_program/serialize/test/TARGETS (51%)
 rename {sdk => devtools}/bundled_program/serialize/test/test_serialize.py (82%)
 rename {sdk => devtools}/bundled_program/targets.bzl (91%)
 rename {sdk => devtools}/bundled_program/test/TARGETS (68%)
 rename {sdk => devtools}/bundled_program/test/test_bundle_data.py (93%)
 rename {sdk => devtools}/bundled_program/test/test_config.py (97%)
 rename {sdk => devtools}/bundled_program/test/test_end2end.py (88%)
 rename {sdk => devtools}/bundled_program/util/TARGETS (68%)
 rename {sdk => devtools}/bundled_program/util/test_util.py (99%)
 rename {sdk => devtools}/bundled_program/version.py (100%)
 rename {sdk => devtools}/debug_format/TARGETS (100%)
 rename {sdk => devtools}/debug_format/base_schema.py (100%)
 rename {sdk => devtools}/debug_format/et_schema.py (99%)
 rename {sdk => devtools}/etdump/TARGETS (81%)
 rename {sdk => devtools}/etdump/emitter.cpp (98%)
 rename {sdk => devtools}/etdump/emitter.h (92%)
 rename {sdk => devtools}/etdump/etdump_flatcc.cpp (98%)
 rename {sdk => devtools}/etdump/etdump_flatcc.h (100%)
 rename {sdk => devtools}/etdump/etdump_schema_flatcc.fbs (100%)
 rename {sdk => devtools}/etdump/scalar_type.fbs (100%)
 rename {sdk => devtools}/etdump/schema_flatcc.py (97%)
 rename {sdk => devtools}/etdump/serialize.py (98%)
 rename {sdk => devtools}/etdump/targets.bzl (100%)
 rename {sdk => devtools}/etdump/tests/CMakeLists.txt (100%)
 rename {sdk => devtools}/etdump/tests/TARGETS (75%)
 rename {sdk => devtools}/etdump/tests/etdump_test.cpp (99%)
 rename {sdk => devtools}/etdump/tests/serialize_test.py (97%)
 rename {sdk => devtools}/etdump/tests/targets.bzl (82%)
 rename {sdk => devtools}/etrecord/TARGETS (71%)
 rename {sdk => devtools}/etrecord/__init__.py (86%)
 rename {sdk => devtools}/etrecord/_etrecord.py (98%)
 rename {sdk => devtools}/etrecord/tests/TARGETS (64%)
 rename {sdk => devtools}/etrecord/tests/etrecord_test.py (96%)
 rename {sdk => devtools}/inspector/TARGETS (70%)
 rename {sdk => devtools}/inspector/__init__.py (60%)
 rename {sdk => devtools}/inspector/_inspector.py (99%)
 rename {sdk => devtools}/inspector/_inspector_utils.py (97%)
 rename {sdk => devtools}/inspector/inspector_cli.py (93%)
 create mode 100644 devtools/inspector/tests/TARGETS
 rename {sdk => devtools}/inspector/tests/event_blocks_test.py (98%)
 rename {sdk => devtools}/inspector/tests/inspector_test.py (97%)
 rename {sdk => devtools}/inspector/tests/inspector_utils_test.py (94%)
 rename {sdk => devtools}/size_analysis_tool/TARGETS (86%)
 rename {sdk => devtools}/size_analysis_tool/size_analysis_tool.py (99%)
 rename {sdk => devtools}/size_analysis_tool/size_analysis_tool_test.py (98%)
 rename {sdk => devtools}/targets.bzl (100%)
 create mode 100644 examples/models/flamingo/cross_attention/cross_attention_mask.cpp
 create mode 100644 examples/models/flamingo/cross_attention/cross_attention_mask.h
 create mode 100644 examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp
 create mode 100644 examples/models/flamingo/cross_attention/targets.bzl
 delete mode 100644 sdk/inspector/tests/TARGETS

diff --git a/.ci/docker/common/install_linter.sh b/.ci/docker/common/install_linter.sh
index 4a796a72d5..d262176e49 100755
--- a/.ci/docker/common/install_linter.sh
+++ b/.ci/docker/common/install_linter.sh
@@ -13,3 +13,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 # NB: Install all linter dependencies, the caching of lintrunner init could be
 # done after Executorch becomes public
 pip_install -r requirements-lintrunner.txt
+
+# Install google-java-format (--fail: exit non-zero on an HTTP error instead of saving the error page)
+curl -L --fail --retry 3 https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64 > /opt/google-java-format
+chmod +x /opt/google-java-format
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 7cb2cf69b8..ea068f65e1 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -54,3 +54,20 @@ jobs:
           lint.json || true
 
         exit $RC
+
+  android-java-format:  # Lint job: fails when the listed Android Java sources need reformatting.
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-linter  # assumes this image ships /opt/google-java-format - TODO confirm
+      fetch-depth: 0
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}  # PR head sha on pull_request events, else github.sha
+      script: |
+        FILES_NEEDS_FORMAT=$(/opt/google-java-format -n extension/android/src/main/java/org/pytorch/executorch/*.java \
+          examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/*.java \
+          examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/*.java)
+        if [ -n "$FILES_NEEDS_FORMAT" ]; then
+          echo "Warning: The following files need formatting. Please use google-java-format."
+          echo "$FILES_NEEDS_FORMAT"
+          exit 1
+        fi
diff --git a/CMakeLists.txt b/CMakeLists.txt
index afb0437fae..b5a5b59235 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -617,7 +617,7 @@ if(EXECUTORCH_BUILD_SDK)
       ON
       CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
   )
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_APPLE)
@@ -676,7 +676,7 @@ if(EXECUTORCH_BUILD_PYBIND)
   endif()
 
   if(NOT EXECUTORCH_BUILD_SDK)
-    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk)
+    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
   endif()
 
   # find pytorch lib, to allow pybind to take at::Tensor as input/output
diff --git a/README.md b/README.md
index c4e6e0caf7..914eab472e 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ tools.
 ├── schema                          #  ExecuTorch PTE file format flatbuffer
 schemas.
 ├── scripts                         #  Utility scripts for size management, dependency management, etc.
-├── sdk                             #  Model profiling, debugging, and introspection.
+├── devtools                        #  Model profiling, debugging, and introspection.
 ├── shim                            #  Compatibility layer between OSS and Internal builds
 ├── test                            #  Broad scoped end-to-end tests.
 ├── third-party                     #  Third-party dependencies.
diff --git a/backends/apple/mps/TARGETS b/backends/apple/mps/TARGETS
index b8ab3427a9..1ab92b3fca 100644
--- a/backends/apple/mps/TARGETS
+++ b/backends/apple/mps/TARGETS
@@ -95,8 +95,8 @@ runtime.python_test(
         "//executorch/examples/models:models",
         "//executorch/exir/tests:models",
         "//executorch/extension/export_util:export_util",
-        "//executorch/sdk:lib",
-        "//executorch/sdk/bundled_program/serialize:lib",
+        "//executorch/devtools:lib",
+        "//executorch/devtools/bundled_program/serialize:lib",
         "fbsource//third-party/pypi/pytest:pytest",
     ],
 )
diff --git a/backends/apple/mps/targets.bzl b/backends/apple/mps/targets.bzl
index 8b9c64e143..74d7944836 100644
--- a/backends/apple/mps/targets.bzl
+++ b/backends/apple/mps/targets.bzl
@@ -47,7 +47,7 @@ def define_common_targets(is_xplat = False, platforms = []):
             "//executorch/exir/backend:backend_lib",
             "//executorch/extension/pybindings/...",
             "//executorch/runtime/backend/...",
-            "//executorch/sdk/runners/...",
+            "//executorch/devtools/runners/...",
             "//executorch/test/...",
             "@EXECUTORCH_CLIENTS",
         ],
diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py
index d7efe8bde4..77c02f533b 100644
--- a/backends/apple/mps/test/test_mps_utils.py
+++ b/backends/apple/mps/test/test_mps_utils.py
@@ -12,16 +12,16 @@
 import torch
 from executorch.backends.apple.mps import MPSBackend
 from executorch.backends.apple.mps.partition import MPSPartitioner
+from executorch.devtools import BundledProgram
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
+from executorch.devtools.bundled_program.serialize import (
+    serialize_from_bundled_program_to_flatbuffer,
+)
 from executorch.exir import EdgeCompileConfig, ExirExportedProgram, to_edge
 from executorch.exir.backend.backend_api import to_backend
 from executorch.exir.backend.backend_details import CompileSpec
 from executorch.exir.capture._config import ExecutorchBackendConfig
 from executorch.extension.export_util.utils import export_to_edge
-from executorch.sdk import BundledProgram
-from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite
-from executorch.sdk.bundled_program.serialize import (
-    serialize_from_bundled_program_to_flatbuffer,
-)
 from torch.export import export
 
 # Config for Capturing the weights, will be moved in the future
diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py
index 509e254b55..405f8b5db4 100644
--- a/backends/cadence/aot/compiler.py
+++ b/backends/cadence/aot/compiler.py
@@ -18,12 +18,13 @@
     ReplaceLogicalNotBooleanWhereWithWherePass,
     ReplacePT2DequantWithCadenceDequantPass,
     ReplacePT2QuantWithCadenceQuantPass,
+    ReplaceSafeSoftmaxWithSoftmax,
     ReplaceScalarTensorWithFullPass,
     ReplaceSqueezeAndUnsqueezeWithViewPass,
 )
 from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion
 from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer
-from executorch.backends.cadence.aot.utils import model_is_quantized
+from executorch.backends.cadence.aot.utils import model_gm_has_SDPA, model_is_quantized
 from executorch.backends.transforms.decompose_sdpa import (
     DecomposeScaledDotProductAttention,
 )
@@ -57,13 +58,20 @@ def convert_pt2(
     """
 
     # Export with dynamo
-    model_exp = capture_pre_autograd_graph(model, inputs)
+    model_gm = capture_pre_autograd_graph(model, inputs)
 
-    # Decompose SDPA
-    DecomposeScaledDotProductAttention(False)(model_exp)
+    if model_gm_has_SDPA(model_gm):
+        # Decompose SDPA
+        DecomposeScaledDotProductAttention(False)(model_gm)
+
+        # Swap _safe_softmax with _softmax (see https://github.com/pytorch/pytorch/pull/133882
+        # for details).
+        result = ReplaceSafeSoftmaxWithSoftmax()(model_gm)
+        assert result is not None
+        model_gm = result.graph_module
 
     # Prepare
-    prepared_model = prepare_pt2e(model_exp, quantizer)
+    prepared_model = prepare_pt2e(model_gm, quantizer)
 
     # Calibrate
     prepared_model(*inputs)
diff --git a/backends/cadence/aot/passes.py b/backends/cadence/aot/passes.py
index db419bfb5e..83ef43d151 100644
--- a/backends/cadence/aot/passes.py
+++ b/backends/cadence/aot/passes.py
@@ -266,3 +266,29 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         result = SpecPropPass()(graph_module)
         assert result is not None
         return result
+
+
+class ReplaceSafeSoftmaxWithSoftmax(ExportPass):
+    """
+    Export pass that rewrites aten._safe_softmax calls as aten._softmax calls,
+    appending False as the extra half_to_float argument. Every other operator
+    is forwarded to ExportPass untouched.
+    """
+
+    def call_operator(
+        self,
+        op,  # pyre-ignore
+        args: tuple[Argument, ...],
+        kwargs: dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        """Emit `op`, substituting _softmax when `op` is _safe_softmax."""
+        target, call_args = op, args
+        if op == torch.ops.aten._safe_softmax.default:
+            target = torch.ops.aten._softmax.default
+            # _softmax is called with one more positional argument than the
+            # original call carried: False for half_to_float.
+            call_args = (*args, False)
+
+        # Single exit point: the base pass emits the (possibly swapped) call.
+        return super().call_operator(target, call_args, kwargs, meta)
diff --git a/backends/cadence/aot/utils.py b/backends/cadence/aot/utils.py
index f0c294260a..b710f7d4e5 100644
--- a/backends/cadence/aot/utils.py
+++ b/backends/cadence/aot/utils.py
@@ -177,3 +177,11 @@ def print_ops_info(
                 tablefmt="outline",
             )
         )
+
+
+def model_gm_has_SDPA(model_gm: torch.fx.GraphModule) -> bool:
+    """Report whether the graph calls aten.scaled_dot_product_attention.default."""
+    sdpa = torch.ops.aten.scaled_dot_product_attention.default
+    return any(
+        n.op == "call_function" and n.target == sdpa for n in model_gm.graph.nodes
+    )
diff --git a/backends/cadence/cadence_runner/cadence_runner.cpp b/backends/cadence/cadence_runner/cadence_runner.cpp
index d76ba004aa..a269ed5a8e 100644
--- a/backends/cadence/cadence_runner/cadence_runner.cpp
+++ b/backends/cadence/cadence_runner/cadence_runner.cpp
@@ -22,13 +22,13 @@
 
 #include <gflags/gflags.h>
 
+#include <executorch/devtools/bundled_program/bundled_program.h>
+#include <executorch/devtools/etdump/etdump_flatcc.h>
 #include <executorch/extension/data_loader/buffer_data_loader.h>
 #include <executorch/runtime/executor/method.h>
 #include <executorch/runtime/executor/program.h>
 #include <executorch/runtime/platform/log.h>
 #include <executorch/runtime/platform/runtime.h>
-#include <executorch/sdk/bundled_program/bundled_program.h>
-#include <executorch/sdk/etdump/etdump_flatcc.h>
 
 static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB
 
diff --git a/backends/cadence/cadence_runner/targets.bzl b/backends/cadence/cadence_runner/targets.bzl
index 028ff7ad2e..361fe9712e 100644
--- a/backends/cadence/cadence_runner/targets.bzl
+++ b/backends/cadence/cadence_runner/targets.bzl
@@ -19,12 +19,12 @@ def define_common_targets():
         visibility = ["PUBLIC"],
         deps = [
             "fbsource//arvr/third-party/gflags:gflags",
-            "fbsource//xplat/executorch/kernels/portable:generated_lib",
-            "fbsource//xplat/executorch/runtime/executor:program",
+            "fbsource//xplat/executorch/devtools/etdump:etdump_flatcc",
+            "fbsource//xplat/executorch/devtools/bundled_program:runtime",
             "fbsource//xplat/executorch/extension/data_loader:file_data_loader",
             "fbsource//xplat/executorch/extension/data_loader:buffer_data_loader",
+            "fbsource//xplat/executorch/kernels/portable:generated_lib",
+            "fbsource//xplat/executorch/runtime/executor:program",
             "fbsource//xplat/executorch/util:util",
-            "fbsource//xplat/executorch/sdk/etdump:etdump_flatcc",
-            "fbsource//xplat/executorch/sdk/bundled_program:runtime",
         ],
     )
diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS
index 9f30cadf6f..1b55a7d541 100644
--- a/backends/cadence/runtime/TARGETS
+++ b/backends/cadence/runtime/TARGETS
@@ -13,9 +13,9 @@ python_library(
     typing = True,
     deps = [
         "//caffe2:torch",
+        "//executorch/devtools/bundled_program:config",
+        "//executorch/devtools/bundled_program:core",
+        "//executorch/devtools/bundled_program/serialize:lib",
         "//executorch/exir:lib",
-        "//executorch/sdk/bundled_program:config",
-        "//executorch/sdk/bundled_program:core",
-        "//executorch/sdk/bundled_program/serialize:lib",
     ],
 )
diff --git a/backends/cadence/runtime/executor.py b/backends/cadence/runtime/executor.py
index 7bcf705c03..d07b1b6a52 100644
--- a/backends/cadence/runtime/executor.py
+++ b/backends/cadence/runtime/executor.py
@@ -18,14 +18,13 @@
 
 import torch
 
-from executorch.exir import ExecutorchProgram, ExecutorchProgramManager
-
-from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite
-from executorch.sdk.bundled_program.core import BundledProgram
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
+from executorch.devtools.bundled_program.core import BundledProgram
 
-from executorch.sdk.bundled_program.serialize import (
+from executorch.devtools.bundled_program.serialize import (
     serialize_from_bundled_program_to_flatbuffer,
 )
+from executorch.exir import ExecutorchProgram, ExecutorchProgramManager
 
 # If quiet is true, suppress the printing of stdout and stderr output.
 quiet = False
diff --git a/backends/cadence/runtime/runtime.py b/backends/cadence/runtime/runtime.py
index ec282f8f7b..33bb20719c 100644
--- a/backends/cadence/runtime/runtime.py
+++ b/backends/cadence/runtime/runtime.py
@@ -18,10 +18,10 @@
 
 from executorch.backends.cadence.runtime import utils
 from executorch.backends.cadence.runtime.executor import Executor
+from executorch.devtools import Inspector
 from executorch.exir import ExecutorchProgramManager
 from executorch.exir._serialize._program import deserialize_pte_binary
 from executorch.exir.schema import DataLocation
-from executorch.sdk import Inspector
 
 from numpy import ndarray
 
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
index 5fd6d5ad19..b206a7e133 100644
--- a/backends/qualcomm/tests/utils.py
+++ b/backends/qualcomm/tests/utils.py
@@ -27,6 +27,8 @@
     QcomChipset,
 )
 from executorch.backends.qualcomm.utils.utils import capture_program
+from executorch.devtools import generate_etrecord
+from executorch.devtools.inspector import Inspector
 from executorch.examples.qualcomm.utils import (
     generate_inputs,
     make_output_dir,
@@ -40,8 +42,6 @@
 from executorch.exir.pass_base import ExportPass
 from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass
 from executorch.exir.program._program import ExecutorchProgram
-from executorch.sdk import generate_etrecord
-from executorch.sdk.inspector import Inspector
 from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
 
 
diff --git a/backends/vulkan/partitioner/supported_ops.py b/backends/vulkan/partitioner/supported_ops.py
index 08d7f96a6b..ca7ce72cae 100644
--- a/backends/vulkan/partitioner/supported_ops.py
+++ b/backends/vulkan/partitioner/supported_ops.py
@@ -8,7 +8,10 @@
 
 import operator
 
-from executorch.backends.vulkan.passes.custom_ops_defs import grid_priors_op  # noqa
+from executorch.backends.vulkan.passes.custom_ops_defs import (  # noqa
+    conv_with_clamp_op,
+    grid_priors_op,
+)
 
 from executorch.exir.dialects._ops import ops as exir_ops
 
@@ -84,6 +87,7 @@ def __contains__(self, op):
 
 CONVOLUTION_OPS = [
     exir_ops.edge.aten.convolution.default,
+    exir_ops.edge.et_vk.conv_with_clamp.default,
 ]
 
 REDUCTION_OPS = [
diff --git a/backends/vulkan/passes/custom_ops_defs.py b/backends/vulkan/passes/custom_ops_defs.py
index 62f21bfee6..fd586b665a 100644
--- a/backends/vulkan/passes/custom_ops_defs.py
+++ b/backends/vulkan/passes/custom_ops_defs.py
@@ -48,6 +48,43 @@ def conv_with_clamp_impl(
 conv_with_clamp_op = getattr(getattr(torch.ops, namespace), name)
 
 
+def conv_with_clamp_out_impl(
+    input,
+    weight,
+    bias=None,
+    stride=1,
+    padding=0,
+    dilation=1,
+    transposed=False,
+    output_padding=0,
+    groups=1,
+    output_min=-float("inf"),
+    output_max=float("inf"),
+    out=None,
+):
+    # NOTE: `out` is never read or written; the functional result is returned.
+    return conv_with_clamp_impl(
+        input,
+        weight,
+        bias,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+        groups,
+        output_min,
+        output_max,
+    )
+
+
+name = "conv_with_clamp.out"  # schema name reused by lib.define and lib.impl below
+lib.define(
+    f"{name}(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, Scalar? output_min, Scalar? output_max, *, Tensor(a!) out) -> Tensor(a!)"
+)
+lib.impl(name, conv_with_clamp_out_impl, "CompositeExplicitAutograd")
+
+
 # The dimension of x should be larger than 1
 def grid_priors_impl(
     x,
diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index 78aa4796aa..be44679f3b 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -13,36 +13,15 @@
 namespace vkcompute {
 namespace api {
 
-/*
- * Given the strides of a buffer-backed tensor, find the index of the "fastest
- * moving" dimension in WHCN dimension order. If multiple dims have the lowest
- * stride, then the "earlier" dim is assumed to be the fastest moving (width is
- * "earlier" than height).
- */
-int32_t find_fastest_whcn_dim(const std::vector<int64_t>& strides) {
-  if (strides.size() == 0) {
-    return 0;
-  }
-  int32_t fastest_dim = 0;
-  int64_t min_stride = strides.at(0);
-  for (int d = strides.size() - 1; d >= 0; --d) {
-    if (strides.at(d) < min_stride) {
-      fastest_dim = d;
-      min_stride = strides.at(d);
-    }
-  }
-  return (strides.size() - 1 - fastest_dim);
-}
-
 /*
  * Given the strides of a buffer-backed tensor, estimate the equivalent memory
  * layout enum value by identifying the fastest moving dimension.
  */
 utils::GPUMemoryLayout estimate_memory_layout(
-    const std::vector<int64_t>& strides) {
-  int32_t fastest_dim = find_fastest_whcn_dim(strides);
-  if (fastest_dim <= 3) {
-    return utils::GPUMemoryLayout(fastest_dim);
+    const std::vector<int64_t>& dim_order) {
+  int64_t fastest_dim_whcn = dim_order.size() - 1 - dim_order.back();
+  if (fastest_dim_whcn >= 0 && fastest_dim_whcn < 3) {
+    return utils::GPUMemoryLayout(fastest_dim_whcn);
   }
 
   // TODO(ssjia) find a way to gracefully recover from this case by i.e. adding
@@ -51,41 +30,70 @@ utils::GPUMemoryLayout estimate_memory_layout(
   VK_THROW("No compatible GPUMemoryLayout value");
 }
 
+std::vector<int64_t> calculate_dim_order(
+    const size_t ndim,
+    const utils::GPUMemoryLayout memory_layout) {
+  // Zero dim tensors get a single-entry dim order
+  if (ndim == 0) {
+    return {0};
+  }
+  const int64_t rank = static_cast<int64_t>(ndim);
+  // NCHW index of the packed dim; negative if the tensor has too few dims
+  const int64_t packed_dim =
+      rank - utils::to_packed_dim_nchw_offset<int64_t>(memory_layout);
+  // Enumerate the dims in ascending order, skipping over the packed dim
+  std::vector<int64_t> dim_order(ndim);
+  for (int64_t pos = 0, next_dim = 0; pos < rank; ++pos, ++next_dim) {
+    if (pos == packed_dim) {
+      ++next_dim;
+    }
+    dim_order[pos] = next_dim;
+  }
+  // The packed dim goes last, which makes it the fastest moving dim
+  if (packed_dim >= 0) {
+    dim_order[ndim - 1] = packed_dim;
+  }
+  return dim_order;
+}
+
 std::vector<int64_t> calculate_strides(
     const std::vector<int64_t>& sizes,
-    const utils::GPUMemoryLayout memory_layout) {
+    const std::vector<int64_t>& dim_order) {
   // For zero dim tensors
   if (sizes.size() == 0) {
     return {1};
   }
 
-  const int64_t dim_offset =
-      utils::to_packed_dim_nchw_offset<int64_t>(memory_layout);
-  int64_t last_dim = sizes.size() - dim_offset;
-  if (last_dim < 0) {
-    last_dim = sizes.size() - 1;
-  }
-
   size_t ndim = sizes.size();
   std::vector<int64_t> strides(ndim);
 
-  const int64_t last_dim_size = sizes.at(last_dim);
-
-  for (int stride_d = ndim - 1; stride_d >= 0; stride_d--) {
-    strides.at(stride_d) = 1;
-    if (stride_d == last_dim) {
-      continue;
-    }
-    strides.at(stride_d) = last_dim_size;
-    for (int size_d = ndim - 1; size_d > stride_d; size_d--) {
-      if (size_d != last_dim) {
-        strides.at(stride_d) *= sizes.at(size_d);
-      }
+  strides[dim_order[ndim - 1]] = 1;
+  for (int32_t i = ndim - 2; i >= 0; --i) {
+    if (sizes[dim_order[i + 1]] == 0) {
+      strides[dim_order[i]] = strides[dim_order[i + 1]];
+    } else {
+      strides[dim_order[i]] =
+          strides[dim_order[i + 1]] * sizes[dim_order[i + 1]];
     }
   }
+
   return strides;
 }
 
+bool dim_order_is_valid(const std::vector<int64_t>& dim_order) {
+  // A valid dim order is a permutation of 0 .. ndim - 1. Seen indices are
+  // tracked so that duplicates, e.g. {0, 0, 3, 3}, are rejected as well.
+  const int64_t ndim = static_cast<int64_t>(dim_order.size());
+  std::vector<bool> seen(dim_order.size(), false);
+  for (const int64_t dim : dim_order) {
+    if (dim < 0 || dim >= ndim || seen[dim]) {
+      return false;
+    }
+    seen[dim] = true;
+  }
+  return true;
+}
+
 std::vector<int64_t> unsqueeze_strides(
     const std::vector<int64_t>& strides,
     const int64_t numel) {
@@ -170,7 +178,8 @@ vTensor::vTensor(
       memory_layout_(memory_layout),
       // Calculate tensor size metadata
       sizes_(sizes.begin(), sizes.end()),
-      strides_(calculate_strides(sizes, memory_layout_)),
+      dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)),
+      strides_(calculate_strides(sizes, dim_order_)),
       numel_(utils::multiply_integers(sizes_)),
       padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
@@ -189,6 +198,9 @@ vTensor::vTensor(
           padded_sizes_,
           dtype_,
           allocate_memory) {
+  VK_CHECK_COND(
+      dim_order_is_valid(dim_order_), "computed dim order is invalid");
+
   if (storage_type != utils::kBuffer) {
     texture_limits_.limits = utils::ivec3{
         utils::safe_downcast<int32_t>(storage_.image_extents_[0]),
@@ -204,16 +216,39 @@ vTensor::vTensor(
   }
 }
 
+// Copy constructor: size metadata is copied from `other`, the utility uniform
+// buffers start out empty, and the storage is copy-constructed from `other`.
+vTensor::vTensor(const vTensor& other)
+    : dtype_(other.dtype_),
+      memory_layout_(other.memory_layout_),
+      // Copy tensor size metadata
+      sizes_(other.sizes_),
+      dim_order_(other.dim_order_),
+      strides_(other.strides_),
+      numel_(other.numel_),
+      padded_sizes_(other.padded_sizes_),
+      unsqueezed_strides_(other.unsqueezed_strides_),
+      padded_numel_(other.padded_numel_),
+      texture_limits_{other.texture_limits_},
+      // Empty initialize Utility Uniform Buffers
+      sizes_uniform_(),
+      strides_uniform_(),
+      numel_uniform_(),
+      texture_limits_uniform_(),
+      // Copy Tensor storage
+      storage_(other.storage_) {}
+
 vTensor::vTensor(
     const vTensor& other,
     const std::vector<int64_t>& sizes,
-    const std::vector<int64_t>& strides,
-    const size_t offset_numel)
+    const std::vector<int64_t>& dim_order,
+    const int64_t offset_numel)
     : dtype_(other.dtype_),
-      memory_layout_(estimate_memory_layout(strides)),
+      memory_layout_(estimate_memory_layout(dim_order)),
       // Copy tensor size metadata
       sizes_(sizes.begin(), sizes.end()),
-      strides_(strides.begin(), strides.end()),
+      dim_order_(dim_order.begin(), dim_order.end()),
+      strides_(calculate_strides(sizes_, dim_order_)),
       numel_(utils::multiply_integers(sizes_)),
       padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
@@ -226,6 +261,8 @@ vTensor::vTensor(
       texture_limits_uniform_(),
       // Copy Tensor storage
       storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) {
+  VK_CHECK_COND(
+      dim_order_is_valid(dim_order_), "new dim order provided is invalid");
   VK_CHECK_COND(
       offset_numel + numel_ <= other.numel(),
       "Tensor alias cannot access more elements than available in the original"
@@ -339,9 +376,17 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) {
   }
 }
 
-void vTensor::update_size_metadata(const std::vector<int64_t>& new_sizes) {
+void vTensor::update_metadata(
+    const std::vector<int64_t>& new_sizes,
+    const std::vector<int64_t>& new_dim_order) {
   sizes_ = new_sizes;
-  strides_ = calculate_strides(new_sizes, memory_layout_);
+  dim_order_ = new_dim_order;
+  strides_ = calculate_strides(sizes_, dim_order_);
+  // Only update the memory layout for buffer-backed tensors. Strides are
+  // meaningless for texture-backed tensors and do not impact the memory layout.
+  if (storage_type() == utils::kBuffer) {
+    memory_layout_ = estimate_memory_layout(dim_order_);
+  }
   numel_ = utils::multiply_integers(sizes_);
 
   padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_);
@@ -373,15 +418,7 @@ void vTensor::update_size_metadata(const std::vector<int64_t>& new_sizes) {
   }
 }
 
-void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
-  update_size_metadata(new_sizes);
-  storage_.discard_and_reallocate(
-      calculate_padded_sizes(new_sizes, memory_layout_),
-      memory_layout_,
-      dtype_);
-}
-
-void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
+void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
   if (storage_type() != utils::kBuffer) {
     // For texture storage check that the current texture is large enough for
     // the new sizes of the tensor.
@@ -394,10 +431,47 @@ void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
 
     VK_CHECK_COND(
         valid_resize,
-        "Cannot use virtual resize if new sizes requires a larger texture.");
+        "tensor sizes requires a larger texture than the current one.");
+  } else {
+    // For buffer storage check that the current buffer is large enough for the
+    // new sizes of the tensor.
+    int64_t numel = utils::multiply_integers(sizes);
+    bool valid_resize =
+        numel + storage_.buffer_offset_ <= storage_.buffer_length_;
+    VK_CHECK_COND(
+        valid_resize,
+        "tensor sizes requires a larger buffer than the current one.");
   }
+}
+
+void vTensor::virtual_reconfigure(
+    const std::vector<int64_t>& new_sizes,
+    const std::vector<int64_t>& new_dim_order) {
+  VK_CHECK_COND(
+      storage_type() == utils::kBuffer,
+      "virtual_reconfigure is only applicable for buffer backed tensors");
+  VK_CHECK_COND(new_sizes.size() == new_dim_order.size());
+  VK_CHECK_COND(dim_order_is_valid(new_dim_order));
+  // The existing buffer must be large enough to hold the new sizes
+  check_sizes(new_sizes);
+  update_metadata(new_sizes, new_dim_order);
+}
+}
+
+void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
+  VK_CHECK_COND(
+      new_sizes.size() == dim_order_.size(),
+      "new sizes cannot modify the dimensionality of the tensor ");
+  // The current dim order is kept; strides are recomputed from the new sizes
+  check_sizes(new_sizes);
+  update_metadata(new_sizes, dim_order_);
+}
+
+void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
+  // dim_order_ is reused below, so the dimensionality must not change
+  VK_CHECK_COND(new_sizes.size() == dim_order_.size());
+  update_metadata(new_sizes, dim_order_);
+  const auto padded = calculate_padded_sizes(new_sizes, memory_layout_);
+  storage_.discard_and_reallocate(padded, memory_layout_, dtype_);
 }
 
 //
@@ -480,6 +554,7 @@ vTensorStorage::vTensorStorage(
       storage_type_{storage_type},
       image_extents_(calculate_image_extents(padded_sizes, gpu_memory_layout)),
       buffer_length_{utils::multiply_integers(padded_sizes)},
+      buffer_offset_{0},
       image_(allocate_image(
           context_,
           image_extents_,
@@ -496,11 +571,12 @@ vTensorStorage::vTensorStorage(
 
 vTensorStorage::vTensorStorage(
     const vTensorStorage& other,
-    const size_t buffer_offset)
+    const int64_t buffer_offset)
     : context_(other.context_),
       storage_type_{other.storage_type_},
       image_extents_(other.image_extents_),
       buffer_length_{other.buffer_length_},
+      buffer_offset_{buffer_offset},
       image_(),
       buffer_(other.buffer_, buffer_offset),
       last_access_{other.last_access_} {
diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h
index 5a4598291c..8186ef1bd6 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.h
+++ b/backends/vulkan/runtime/api/containers/Tensor.h
@@ -20,14 +20,21 @@ namespace vkcompute {
 namespace api {
 
 /*
- * Given the sizes of a tensor and the GPU memory layout, calculate the strides
- * of the tensor in NCHW dimension order. The GPU memory layout will be used to
- * determine which dimension is packed along a texel; that dimension will be
- * used as the "fasted moving" dimension with a stride of 1.
+ * Given a GPUMemoryLayout value, produce a dim order vector that matches the
+ * given memory layout. The produced dim order vector will be in the NCHW
+ * dimension order
+ */
+std::vector<int64_t> calculate_dim_order(
+    const size_t ndim,
+    const utils::GPUMemoryLayout memory_layout);
+
+/*
+ * Given the sizes of a tensor and the dim order of the tensor (both in NCHW
+ * dimension order), calculate the strides of the tensor.
  */
 std::vector<int64_t> calculate_strides(
     const std::vector<int64_t>& sizes,
-    const utils::GPUMemoryLayout memory_layout);
+    const std::vector<int64_t>& dim_order);
 
 std::vector<int64_t> unsqueeze_strides(
     const std::vector<int64_t>& strides,
@@ -96,7 +103,7 @@ class vTensorStorage final {
    * because this behaviour is unsafe, since the original tensor may be
    * destroyed before the copy is destroyed.
    */
-  vTensorStorage(const vTensorStorage& other, const size_t buffer_offset = 0);
+  vTensorStorage(const vTensorStorage& other, const int64_t buffer_offset = 0);
 
  public:
   // To discourage creating copies, the assignment operator is still deleted.
@@ -118,6 +125,7 @@ class vTensorStorage final {
   // Resource sizings
   utils::uvec3 image_extents_{};
   int64_t buffer_length_{};
+  int64_t buffer_offset_{};
 
   // GPU Storage
   mutable vkapi::VulkanImage image_;
@@ -167,8 +175,16 @@ class vTensor final {
       const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked,
       const bool allocate_memory = true);
 
-  vTensor(const vTensor& other) = delete;
-  vTensor& operator=(const vTensor& other) = delete;
+  /*
+   * This constructor allows for the creation of a vTensor that references the
+   * same buffer resource of another vTensor, with the same sizes and strides
+   * metadata. The created vTensor will not own the underlying resource. This is
+   * only applicable for buffer backed tensors at the moment.
+   *
+   * Once created, the sizes and strides of the aliased vTensor can be changed
+   * using the `virtual_reconfigure` member function.
+   */
+  vTensor(const vTensor& other);
 
   /*
    * This constructor allows for the creation of a vTensor that references the
@@ -176,6 +192,10 @@ class vTensor final {
    * strides metatdata. The created vTensor will not own the underlying
    * resource. This is only applicable for buffer backed tensors at the moment.
    *
+   * Note that dim order is used as the source of truth regarding the strides,
+   * and the new strides are computed from the new sizes and new dim order.
+   * Thus only the dim order is provided as an argument to this function.
+   *
    * The offset_numel argument allows the aliased tensor's memory region to
    * begin at an offset of N elements from the start of the original tensor's
    * buffer.
@@ -183,8 +203,11 @@ class vTensor final {
   vTensor(
       const vTensor& other,
       const std::vector<int64_t>& sizes,
-      const std::vector<int64_t>& strides,
-      const size_t offset_numel = 0);
+      const std::vector<int64_t>& dim_order,
+      const int64_t offset_numel = 0);
+
+  // To discourage making copies, the copy assignment operator is still deleted
+  vTensor& operator=(const vTensor& other) = delete;
 
   vTensor(vTensor&& other) = default;
   vTensor& operator=(vTensor&& other) = default;
@@ -195,6 +218,11 @@ class vTensor final {
 
   // sizes of the tensor in NCHW dimension order
   std::vector<int64_t> sizes_;
+  // dim order of the tensor; dimension indices are in NCHW dimension order
+  // i.e. 0 is N, 1 is C, 2 is H, 3 is W for a 4D tensor. The dims with larger
+  // strides precede the dims with smaller strides in the dim order. The last
+  // dim is always the fastest moving dim with a stride of 1.
+  std::vector<int64_t> dim_order_;
   // strides of the tensor in NCHW dimension order
   std::vector<int64_t> strides_;
   // Contains the number of elements in the tensor according to the canonical
@@ -305,6 +333,10 @@ class vTensor final {
     return sizes_.size();
   }
 
+  inline const std::vector<int64_t>& dim_order() const {
+    return dim_order_;
+  }
+
   inline const std::vector<int64_t>& strides() const {
     return strides_;
   }
@@ -386,24 +418,46 @@ class vTensor final {
 
  private:
   /*
-   * Update the size metadata of the vTensor to be new sizes. Should not be used
-   * directly, reallocate() or virtual_resize() should be used instead.
+   * Given new sizes and a new dim order, update the sizes and dim order
+   * metadata of the vTensor. New strides are computed using the new sizes and
+   * new dim order.
+   */
+  void update_metadata(
+      const std::vector<int64_t>& new_sizes,
+      const std::vector<int64_t>& new_dim_order);
+
+  /*
+   * Check that tensor sizes are valid given the current storage resource's
+   * limits.
    */
-  void update_size_metadata(const std::vector<int64_t>& new_sizes);
+  void check_sizes(const std::vector<int64_t>& sizes) const;
 
  public:
   /*
-   * Discard the underlying VkImage or VkBuffer and re-allocate based on new
-   * tensor sizes
+   * Change how the tensor should be interpreted by compute shaders via updating
+   * the size and dim order of the tensor. The new sizes and dim order may have
+   * different dimensionality than the current dimensionality of the tensor.
+   *
+   * This function can only be used for buffer-backed tensors, since texture
+   * backed tensors cannot change dimensionality or memory layout.
    */
-  void reallocate(const std::vector<int64_t>& new_sizes);
+  void virtual_reconfigure(
+      const std::vector<int64_t>& new_sizes,
+      const std::vector<int64_t>& new_dim_order);
 
   /*
    * Perform a virtual resize of the vTensor by modifying the size metadata that
    * gets used in compute shaders. This allows the shader to treat the
-   * underlying resource as if it were a different size.
+   * underlying resource as if it were a different size. The new sizes cannot
+   * modify the dimensionality of the tensor.
    */
   void virtual_resize(const std::vector<int64_t>& new_sizes);
+
+  /*
+   * Discard the underlying VkImage or VkBuffer and re-allocate based on new
+   * tensor sizes
+   */
+  void reallocate(const std::vector<int64_t>& new_sizes);
 };
 
 } // namespace api
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index 50d927a913..48e1ebf0a8 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -203,6 +203,13 @@ ValueRef ComputeGraph::add_tensor(
       sizes, dtype, suggested_memory_layout(sizes), shared_object_idx);
 }
 
+ValueRef ComputeGraph::add_tensor_view(const ValueRef vref) {
+  const vTensorPtr src = get_tensor(vref);
+  ValueRef view_ref(static_cast<int>(values_.size())); // index of the new view
+  values_.emplace_back(api::vTensor(*src));
+  return view_ref;
+}
+
 ValueRef ComputeGraph::add_tensor_view(
     const ValueRef vref,
     const std::vector<int64_t>& sizes,
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index b432be8388..faa2f4107e 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -356,10 +356,17 @@ class ComputeGraph final {
    * `vTensor` value at `vref`. See the copy constructor of `api::vTensor` for
    * more details.
    */
+  ValueRef add_tensor_view(const ValueRef vref);
+
+  /*
+   * Use the copy constructor of `api::vTensor` to create a "view" of the
+   * `vTensor` value at `vref` with different sizes and dim order. See the copy
+   * constructor of `api::vTensor` for more details.
+   */
   ValueRef add_tensor_view(
       const ValueRef vref,
       const std::vector<int64_t>& sizes,
-      const std::vector<int64_t>& strides,
+      const std::vector<int64_t>& dim_order,
       const size_t offset_numel = 0);
 
   /*
diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp
index 3b2a826f87..2cb00ba65a 100644
--- a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp
@@ -35,7 +35,22 @@ ExecuteNode::ExecuteNode(
   graph.update_descriptor_counts(shader, /*execute = */ true);
 }
 
+ExecuteNode::ExecuteNode(
+    const ResizeFunction& resize_fn,
+    const std::vector<ValueRef>& resize_args)
+    : shader_(), // empty ShaderInfo; encode() returns early for such nodes
+      global_workgroup_size_({0u, 0u, 0u}),
+      local_workgroup_size_({0u, 0u, 0u}),
+      args_(),
+      params_(),
+      spec_vars_(),
+      resize_fn_(resize_fn),
+      resize_args_(resize_args) {}
+
 void ExecuteNode::encode(ComputeGraph* graph) {
+  if (!shader_) {
+    return;
+  }
   api::Context* const context = graph->context();
   vkapi::PipelineBarrier pipeline_barrier{};
 
diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h
index 1fff14e020..dece9ddb50 100644
--- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h
+++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h
@@ -48,7 +48,7 @@ class ExecuteNode final {
       const std::vector<ArgGroup>&,
       const std::vector<ValueRef>&)>;
 
-  ExecuteNode(
+  explicit ExecuteNode(
       ComputeGraph& graph,
       const vkapi::ShaderInfo& shader,
       const utils::uvec3& global_workgroup_size,
@@ -59,6 +59,15 @@ class ExecuteNode final {
       const ResizeFunction& resize_fn = nullptr,
       const std::vector<ValueRef>& resize_args = {});
 
+  /*
+   * This overload of the ExecuteNode constructor is used to register ops which
+   * update a tensor view. No shader is dispatched, but the node still needs to
+   * update the view's sizes and strides after a resize.
+   */
+  explicit ExecuteNode(
+      const ResizeFunction& resize_fn = nullptr,
+      const std::vector<ValueRef>& resize_args = {});
+
   ~ExecuteNode() = default;
 
   void encode(ComputeGraph* graph);
@@ -83,6 +92,11 @@ class ExecuteNode final {
   const vkapi::SpecVarList spec_vars_;
   const ResizeFunction resize_fn_;
   const std::vector<ValueRef> resize_args_;
+
+ public:
+  operator bool() const {
+    return shader_; // converts through the bool conversion of shader_
+  }
 };
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 52af0542b6..74113197d4 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -562,6 +562,7 @@ void conv(ComputeGraph& graph, const std::vector<ValueRef>& args) {
 REGISTER_OPERATORS {
   VK_REGISTER_OP(aten.convolution.default, conv);
   VK_REGISTER_OP(conv_with_clamp.default, conv);
+  VK_REGISTER_OP(et_vk.conv_with_clamp.default, conv);
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/vk_api/Shader.h b/backends/vulkan/runtime/vk_api/Shader.h
index 34c2d95c93..1e3b2a799f 100644
--- a/backends/vulkan/runtime/vk_api/Shader.h
+++ b/backends/vulkan/runtime/vk_api/Shader.h
@@ -53,8 +53,8 @@ class ShaderLayout final {
 
 struct ShaderInfo final {
   struct {
-    const uint32_t* bin;
-    uint32_t size;
+    const uint32_t* bin = nullptr;
+    uint32_t size = 0u;
   } src_code;
 
   std::string kernel_name{""};
@@ -71,6 +71,10 @@ struct ShaderInfo final {
       const uint32_t,
       std::vector<VkDescriptorType>,
       const utils::uvec3 tile_size);
+
+  operator bool() const {
+    return src_code.bin != nullptr; // true only when a shader binary is set
+  }
 };
 
 bool operator==(const ShaderInfo& _1, const ShaderInfo& _2);
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
index 9f57ec49a8..d80809ec79 100644
--- a/backends/vulkan/test/test_vulkan_delegate.py
+++ b/backends/vulkan/test/test_vulkan_delegate.py
@@ -1633,6 +1633,42 @@ def forward(self, x):
             memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
         )
 
+    def test_vulkan_backend_conv_with_clamp(self):
+        class ConvWithClampModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.weight = torch.randn(6, 8, 3, 3)
+                self.bias = torch.randn(8)
+                self.stride = (1, 2)
+                self.padding = (2, 3)
+                self.dilation = (1, 1)
+                self.transposed = True  # weight presumably (C_in, C_out, kH, kW)
+                self.output_padding = (0, 1)
+                self.groups = 1
+                self.output_min = 0
+                self.output_max = 10
+
+            def forward(self, x):
+                return torch.ops.et_vk.conv_with_clamp(
+                    x,
+                    self.weight,
+                    self.bias,
+                    self.stride,
+                    self.padding,
+                    self.dilation,
+                    self.transposed,
+                    self.output_padding,
+                    self.groups,
+                    self.output_min,
+                    self.output_max,
+                )
+
+        self.lower_module_and_test_output(
+            ConvWithClampModule(),
+            (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),),
+            memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
+        )
+
     def test_vulkan_backend_grid_priors(self):
         class GridPriorsModule(torch.nn.Module):
             def __init__(self):
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
index ad49687369..6c056cc9d9 100644
--- a/backends/vulkan/test/utils/test_utils.cpp
+++ b/backends/vulkan/test/utils/test_utils.cpp
@@ -482,3 +482,9 @@ void execute_graph_and_check_output(
     }
   }
 }
+
+bool check_close(float a, float b, float atol, float rtol) {
+  const float abs_diff = std::abs(a - b);
+  const float largest_magnitude = std::max(std::abs(a), std::abs(b));
+  return abs_diff <= (atol + rtol * largest_magnitude);
+}
diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h
index f9969eddbf..bf54944617 100644
--- a/backends/vulkan/test/utils/test_utils.h
+++ b/backends/vulkan/test/utils/test_utils.h
@@ -242,3 +242,9 @@ void print_vector(
   }
   std::cout << std::endl;
 }
+
+//
+// Misc. Utilities
+//
+
+bool check_close(float a, float b, float atol = 1e-4, float rtol = 1e-5);
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index 307593d8fd..1ac74e29ef 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -168,7 +168,50 @@ std::vector<int64_t> get_reference_strides(
   return {};
 }
 
+TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) {
+  vkapi::ShaderInfo default_info; // default constructed, no binary attached
+  EXPECT_FALSE(default_info);
+  EXPECT_TRUE(default_info.src_code.bin == nullptr);
+  EXPECT_TRUE(default_info.src_code.size == 0u);
+}
+
+TEST_F(VulkanComputeAPITest, calculate_dim_order_test) {
+  // (ndim, GPUMemoryLayout, expected dim order) triples
+  std::vector<std::tuple<size_t, utils::GPUMemoryLayout, std::vector<int64_t>>>
+      test_cases = {
+          {1, utils::kWidthPacked, {0}},
+          {1, utils::kHeightPacked, {0}},
+          {1, utils::kChannelsPacked, {0}},
+          {2, utils::kWidthPacked, {0, 1}},
+          {2, utils::kHeightPacked, {1, 0}},
+          {2, utils::kChannelsPacked, {0, 1}},
+          {3, utils::kWidthPacked, {0, 1, 2}},
+          {3, utils::kHeightPacked, {0, 2, 1}},
+          {3, utils::kChannelsPacked, {1, 2, 0}},
+          {4, utils::kWidthPacked, {0, 1, 2, 3}},
+          {4, utils::kHeightPacked, {0, 1, 3, 2}},
+          {4, utils::kChannelsPacked, {0, 2, 3, 1}},
+      };
+
+  for (const auto& test_case : test_cases) {
+    // Unpack the fields of the test case by tuple position
+    const size_t ndim = std::get<0>(test_case);
+    const utils::GPUMemoryLayout layout = std::get<1>(test_case);
+    const std::vector<int64_t>& expected = std::get<2>(test_case);
+
+    ASSERT_TRUE(calculate_dim_order(ndim, layout) == expected);
+  }
+}
+
 TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) {
+  vTensor v_tensor_to_resize(
+      context(),
+      {25, 25, 25, 25},
+      vkapi::kFloat,
+      utils::kBuffer,
+      utils::kWidthPacked,
+      /*allocate_memory = */ false);
+
   for (const auto& sizes : standard_sizes_to_test) {
     if (sizes.size() < 3) {
       continue;
@@ -176,7 +219,9 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) {
     for (const auto& layout :
          {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) {
       {
-        std::vector<int64_t> strides = calculate_strides(sizes, layout);
+        std::vector<int64_t> dim_order =
+            calculate_dim_order(sizes.size(), layout);
+        std::vector<int64_t> strides = calculate_strides(sizes, dim_order);
         std::vector<int64_t> ref_strides = get_reference_strides(sizes, layout);
         ASSERT_TRUE(strides == ref_strides);
 
@@ -187,6 +232,25 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) {
             get_reference_strides(sizes, layout, true);
 
         ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides);
+
+        // Create new vTensor and check that the strides are correct
+        vTensor new_v_tensor(
+            context(),
+            sizes,
+            vkapi::kFloat,
+            utils::kBuffer,
+            layout,
+            /*allocate_memory = */ false);
+
+        ASSERT_TRUE(new_v_tensor.strides() == ref_strides);
+        ASSERT_TRUE(
+            new_v_tensor.unsqueezed_strides() == ref_unsqueezed_strides);
+
+        // Resize vtensor and check that updated metadata is correct
+        v_tensor_to_resize.virtual_reconfigure(sizes, dim_order);
+        ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides);
+        ASSERT_TRUE(
+            v_tensor_to_resize.unsqueezed_strides() == ref_unsqueezed_strides);
       }
     }
   }
@@ -542,9 +606,10 @@ TEST_F(VulkanComputeAPITest, tensor_copy_test) {
   std::vector<int64_t> sizes = {9, 9};
   std::vector<int64_t> strides =
       get_reference_strides(sizes, utils::kWidthPacked);
+  std::vector<int64_t> dim_order = {0, 1};
 
   vTensor original = CREATE_FLOAT_BUFFER(sizes, /*allocate_memory=*/true);
-  vTensor copy = vTensor(original, sizes, strides);
+  vTensor copy = vTensor(original, sizes, dim_order);
   EXPECT_TRUE(get_vma_allocation_count() == 1);
 
   // Fill original tensor with some data
@@ -557,7 +622,6 @@ TEST_F(VulkanComputeAPITest, tensor_copy_test) {
   for (size_t i = 0; i < data_out.size(); ++i) {
     CHECK_VALUE(data_out, i, 2.5f + i);
   }
-  std::cout << std::endl;
 }
 
 TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) {
@@ -569,7 +633,7 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) {
   std::vector<int64_t> mat2_t_sizes = {K, N};
   std::vector<int64_t> out_sizes = {M, N};
 
-  std::vector<int64_t> transposed_strides = {1, K};
+  std::vector<int64_t> transposed_dim_order = {1, 0};
 
   vTensor mat1 = CREATE_FLOAT_BUFFER(mat1_sizes, /*allocate_memory=*/true);
   vTensor mat2 = CREATE_FLOAT_BUFFER(mat2_sizes, /*allocate_memory=*/true);
@@ -581,8 +645,8 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) {
   std::vector<float> mat2_data =
       create_random_float_buffer(mat2.staging_buffer_numel());
 
-  vTensor mat2_t = vTensor(mat2, mat2_t_sizes, transposed_strides);
-  EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked);
+  // Create direct view and modify sizes and strides later
+  vTensor mat2_t = vTensor(mat2);
 
   std::vector<float> mat2_t_data = transpose_matrix(mat2_data, N, K);
   std::vector<float> ref_out =
@@ -594,6 +658,10 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) {
 
   record_reference_matmul(api::context(), out, mat1, mat2_t);
 
+  // Update sizes and strides of mat2_t to be that of a transposed tensor
+  mat2_t.virtual_reconfigure(mat2_t_sizes, transposed_dim_order);
+  EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked);
+
   std::vector<float> data_out(out.staging_buffer_numel());
   // Extract the copy tensor; should contain the data of the original tensor
   extract_vtensor(out, data_out);
@@ -601,7 +669,7 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) {
   EXPECT_TRUE(data_out.size() == ref_out.size());
 
   for (size_t i = 0; i < data_out.size(); ++i) {
-    EXPECT_TRUE(data_out[i] == ref_out[i]);
+    EXPECT_TRUE(check_close(data_out[i], ref_out[i]));
   }
 }
 
@@ -615,7 +683,7 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_slice_test) {
   constexpr int L_S2 = 7;
   constexpr int O_S2 = 3;
 
-  std::vector<int64_t> strides = {1};
+  std::vector<int64_t> dim_order = {0};
 
   std::vector<int64_t> t_sizes = {L};
   std::vector<int64_t> s1_sizes = {L_S1};
@@ -625,8 +693,8 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_slice_test) {
 
   fill_vtensor(orig, 0);
 
-  vTensor s1 = vTensor(orig, s1_sizes, strides, O_S1);
-  vTensor s2 = vTensor(s1, s2_sizes, strides, O_S2);
+  vTensor s1 = vTensor(orig, s1_sizes, dim_order, O_S1);
+  vTensor s2 = vTensor(s1, s2_sizes, dim_order, O_S2);
 
   record_scalar_add_buffer(api::context(), s1, 4.5f);
   record_scalar_add_buffer(api::context(), s2, 7.5f);
@@ -975,6 +1043,19 @@ TEST(VulkanComputeGraphTest, test_values_string) {
   EXPECT_TRUE(stored == "hello, world");
 }
 
+TEST(VulkanComputeGraphTest, empty_init_executenode_test) {
+  ExecuteNode node(nullptr, {});
+  EXPECT_FALSE(node);
+
+  GraphConfig config;
+  ComputeGraph graph(config);
+
+  // Encode an empty ExecuteNode and check that command buffer encoding does not
+  // crash.
+  graph.execute_nodes().emplace_back(new ExecuteNode(nullptr, {}));
+  EXPECT_NO_FATAL_FAILURE(graph.encode_execute());
+}
+
 TEST(VulkanComputeGraphTest, test_zero_dim_tensor) {
   GraphConfig config;
   ComputeGraph graph(config);
@@ -1073,7 +1154,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_view) {
   config.set_storage_type_override(utils::kBuffer);
   ComputeGraph graph(config);
 
-  std::vector<int64_t> strides = {W, 1};
+  std::vector<int64_t> dim_order = {0, 1};
 
   std::vector<int64_t> orig_sizes = {H, W};
   std::vector<int64_t> slice_sizes = {S_H, W};
@@ -1083,7 +1164,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_view) {
 
   IOValueRef orig = graph.add_input_tensor(orig_sizes, vkapi::kFloat);
   ValueRef slice =
-      graph.add_tensor_view(orig.value, slice_sizes, strides, offset);
+      graph.add_tensor_view(orig.value, slice_sizes, dim_order, offset);
 
   IOValueRef out = {};
 
diff --git a/backends/xnnpack/test/TARGETS b/backends/xnnpack/test/TARGETS
index abedffb8e6..629ac8275b 100644
--- a/backends/xnnpack/test/TARGETS
+++ b/backends/xnnpack/test/TARGETS
@@ -36,10 +36,10 @@ runtime.python_test(
     deps = [
         "//executorch/backends/xnnpack/partition:xnnpack_partitioner",
         "//executorch/backends/xnnpack/test/tester:tester",
+        "//executorch/devtools:lib",
+        "//executorch/devtools/bundled_program:config",
+        "//executorch/devtools/bundled_program/serialize:lib",
         "//executorch/exir/passes:constant_prop_pass",
-        "//executorch/sdk:lib",
-        "//executorch/sdk/bundled_program:config",
-        "//executorch/sdk/bundled_program/serialize:lib",
         "//pytorch/ao:torchao",  # @manual
     ],
     external_deps = [
diff --git a/backends/xnnpack/test/test_xnnpack_utils.py b/backends/xnnpack/test/test_xnnpack_utils.py
index c6b1513d31..3f5359a3f4 100644
--- a/backends/xnnpack/test/test_xnnpack_utils.py
+++ b/backends/xnnpack/test/test_xnnpack_utils.py
@@ -25,6 +25,12 @@
 
 # import the xnnpack backend implementation
 from executorch.backends.xnnpack.xnnpack_preprocess import XnnpackBackend
+from executorch.devtools import BundledProgram
+
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
+from executorch.devtools.bundled_program.serialize import (
+    serialize_from_bundled_program_to_flatbuffer,
+)
 from executorch.exir import ExecutorchProgram, ExirExportedProgram
 from executorch.exir.backend.backend_api import to_backend, validation_disabled
 
@@ -34,12 +40,6 @@
     _load_for_executorch_from_buffer,
 )
 from executorch.extension.pytree import tree_flatten
-from executorch.sdk import BundledProgram
-
-from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite
-from executorch.sdk.bundled_program.serialize import (
-    serialize_from_bundled_program_to_flatbuffer,
-)
 
 from torch.ao.quantization import (  # @manual
     default_per_channel_symmetric_qnnpack_qconfig,
diff --git a/sdk/CMakeLists.txt b/devtools/CMakeLists.txt
similarity index 89%
rename from sdk/CMakeLists.txt
rename to devtools/CMakeLists.txt
index 79903fc315..4c4d15fd73 100644
--- a/sdk/CMakeLists.txt
+++ b/devtools/CMakeLists.txt
@@ -78,8 +78,8 @@ set_property(TARGET flatccrt PROPERTY POSITION_INDEPENDENT_CODE ON)
 include(ExternalProject)
 
 # The include directory that will contain the generated schema headers.
-set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/sdk/include")
-set(_bundled_schema__include_dir "${CMAKE_BINARY_DIR}/sdk/bundled_program")
+set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/devtools/include")
+set(_bundled_schema__include_dir "${CMAKE_BINARY_DIR}/devtools/bundled_program")
 
 # TODO(dbort): Only enable this when cross-compiling. It can cause build race
 # conditions (libflatcc.a errors) when enabled.
@@ -128,11 +128,11 @@ set(_etdump_schema__outputs)
 foreach(fbs_file ${_etdump_schema_names})
   string(REGEX REPLACE "[.]fbs$" "_reader.h" generated "${fbs_file}")
   list(APPEND _etdump_schema__outputs
-       "${_program_schema__include_dir}/executorch/sdk/etdump/${generated}"
+       "${_program_schema__include_dir}/executorch/devtools/etdump/${generated}"
   )
   string(REGEX REPLACE "[.]fbs$" "_builder.h" generated "${fbs_file}")
   list(APPEND _etdump_schema__outputs
-       "${_program_schema__include_dir}/executorch/sdk/etdump/${generated}"
+       "${_program_schema__include_dir}/executorch/devtools/etdump/${generated}"
   )
 endforeach()
 
@@ -143,7 +143,7 @@ foreach(fbs_file ${_bundled_input_schema_names})
   list(
     APPEND
     _bundled_program_schema__outputs
-    "${_bundled_schema__include_dir}/executorch/sdk/bundled_program/schema/${generated}"
+    "${_bundled_schema__include_dir}/executorch/devtools/bundled_program/schema/${generated}"
   )
 endforeach()
 
@@ -152,9 +152,9 @@ add_library(
   bundled_program_schema INTERFACE ${_bundled_program_schema__outputs}
 )
 
-file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/sdk/etdump)
+file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/devtools/etdump)
 file(MAKE_DIRECTORY
-     ${_program_schema__include_dir}/executorch/sdk/bundled_program
+     ${_program_schema__include_dir}/executorch/devtools/bundled_program
 )
 
 add_custom_command(
@@ -164,7 +164,7 @@ add_custom_command(
     # tree instead of under the binary directory, and there's no way to change
     # that behavior.
     ${_flatcc_source_dir}/bin/flatcc -cwr -o
-    ${_program_schema__include_dir}/executorch/sdk/etdump
+    ${_program_schema__include_dir}/executorch/devtools/etdump
     ${_etdump_schema__srcs}
   COMMAND rm -f ${_etdump_schema_cleanup_paths}
   DEPENDS ${_etdump_schema_gen_dep}
@@ -186,9 +186,9 @@ add_custom_command(
   OUTPUT ${_bundled_program_schema__outputs}
   COMMAND
     ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --gen-mutable --scoped-enums -o
-    "${_bundled_schema__include_dir}/executorch/sdk/bundled_program/schema"
+    "${_bundled_schema__include_dir}/executorch/devtools/bundled_program/schema"
     ${_bundled_program_schema__srcs}
-  WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/sdk
+  WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/devtools
   DEPENDS ${FLATC_EXECUTABLE} ${_bundled_program_schema__srcs}
   COMMENT "Generating bundled_program headers"
   VERBATIM
diff --git a/sdk/TARGETS b/devtools/TARGETS
similarity index 54%
rename from sdk/TARGETS
rename to devtools/TARGETS
index 56d38a4ad3..06964b8387 100644
--- a/sdk/TARGETS
+++ b/devtools/TARGETS
@@ -6,8 +6,8 @@ python_library(
     name = "lib",
     srcs = ["__init__.py"],
     deps = [
-        "//executorch/sdk/bundled_program:core",
-        "//executorch/sdk/etrecord:etrecord",
-        "//executorch/sdk/inspector:lib",
+        "//executorch/devtools/bundled_program:core",
+        "//executorch/devtools/etrecord:etrecord",
+        "//executorch/devtools/inspector:lib",
     ],
 )
diff --git a/sdk/__init__.py b/devtools/__init__.py
similarity index 57%
rename from sdk/__init__.py
rename to devtools/__init__.py
index 11134bf276..821d75901f 100644
--- a/sdk/__init__.py
+++ b/devtools/__init__.py
@@ -4,10 +4,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import executorch.sdk.inspector as inspector
-from executorch.sdk.bundled_program.core import BundledProgram
-from executorch.sdk.etrecord import ETRecord, generate_etrecord, parse_etrecord
-from executorch.sdk.inspector import Inspector
+import executorch.devtools.inspector as inspector
+from executorch.devtools.bundled_program.core import BundledProgram
+from executorch.devtools.etrecord import ETRecord, generate_etrecord, parse_etrecord
+from executorch.devtools.inspector import Inspector
 
 __all__ = [
     "ETRecord",
diff --git a/sdk/backend_debug/TARGETS b/devtools/backend_debug/TARGETS
similarity index 100%
rename from sdk/backend_debug/TARGETS
rename to devtools/backend_debug/TARGETS
diff --git a/sdk/backend_debug/__init__.py b/devtools/backend_debug/__init__.py
similarity index 83%
rename from sdk/backend_debug/__init__.py
rename to devtools/backend_debug/__init__.py
index c1c9726b86..b457b7d11d 100644
--- a/sdk/backend_debug/__init__.py
+++ b/devtools/backend_debug/__init__.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from executorch.sdk.backend_debug.delegation_info import (
+from executorch.devtools.backend_debug.delegation_info import (
     DelegationBreakdown,
     get_delegation_info,
 )
diff --git a/sdk/backend_debug/delegation_info.py b/devtools/backend_debug/delegation_info.py
similarity index 100%
rename from sdk/backend_debug/delegation_info.py
rename to devtools/backend_debug/delegation_info.py
diff --git a/sdk/backend_debug/tests/TARGETS b/devtools/backend_debug/tests/TARGETS
similarity index 86%
rename from sdk/backend_debug/tests/TARGETS
rename to devtools/backend_debug/tests/TARGETS
index 3c9f6c2e64..ae234df8ce 100644
--- a/sdk/backend_debug/tests/TARGETS
+++ b/devtools/backend_debug/tests/TARGETS
@@ -10,8 +10,8 @@ python_unittest(
     deps = [
         "fbsource//third-party/pypi/pandas:pandas",
         "//caffe2:torch",
+        "//executorch/devtools/backend_debug:delegation_info",
         "//executorch/exir:lib",
         "//executorch/exir/backend/test:op_partitioner_demo",
-        "//executorch/sdk/backend_debug:delegation_info",
     ],
 )
diff --git a/sdk/backend_debug/tests/test_delegation_info.py b/devtools/backend_debug/tests/test_delegation_info.py
similarity index 96%
rename from sdk/backend_debug/tests/test_delegation_info.py
rename to devtools/backend_debug/tests/test_delegation_info.py
index 2d98e9a595..6ff5169094 100644
--- a/sdk/backend_debug/tests/test_delegation_info.py
+++ b/devtools/backend_debug/tests/test_delegation_info.py
@@ -9,9 +9,9 @@
 import pandas as pd
 
 import torch
+from executorch.devtools.backend_debug import DelegationBreakdown, get_delegation_info
 from executorch.exir import to_edge
 from executorch.exir.backend.test.op_partitioner_demo import AddMulPartitionerDemo
-from executorch.sdk.backend_debug import DelegationBreakdown, get_delegation_info
 from pandas.testing import assert_frame_equal
 
 
diff --git a/sdk/bundled_program/TARGETS b/devtools/bundled_program/TARGETS
similarity index 88%
rename from sdk/bundled_program/TARGETS
rename to devtools/bundled_program/TARGETS
index c731606217..27560f7087 100644
--- a/sdk/bundled_program/TARGETS
+++ b/devtools/bundled_program/TARGETS
@@ -18,10 +18,10 @@ runtime.python_library(
         ":config",
         ":version",
         "//caffe2:torch",
+        "//executorch/devtools/bundled_program/schema:bundled_program_schema_py",
         "//executorch/exir:schema",
         "//executorch/exir:tensor",
         "//executorch/exir/_serialize:lib",
-        "//executorch/sdk/bundled_program/schema:bundled_program_schema_py",
     ],
 )
 
@@ -46,6 +46,6 @@ runtime.python_library(
         "version.py",
     ],
     visibility = [
-        "//executorch/sdk/...",
+        "//executorch/devtools/...",
     ],
 )
diff --git a/sdk/bundled_program/bundled_program.cpp b/devtools/bundled_program/bundled_program.cpp
similarity index 98%
rename from sdk/bundled_program/bundled_program.cpp
rename to devtools/bundled_program/bundled_program.cpp
index 63affa5c7f..d174cbdcda 100644
--- a/sdk/bundled_program/bundled_program.cpp
+++ b/devtools/bundled_program/bundled_program.cpp
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/sdk/bundled_program/bundled_program.h>
+#include <executorch/devtools/bundled_program/bundled_program.h>
 
 #include <cmath>
 #include <cstddef>
@@ -16,12 +16,12 @@
 #include <ATen/ATen.h>
 #endif // USE_ATEN_LIB
 
+#include <executorch/devtools/bundled_program/schema/bundled_program_schema_generated.h>
 #include <executorch/runtime/core/event_tracer_hooks.h>
 #include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
 #include <executorch/runtime/core/memory_allocator.h>
 #include <executorch/runtime/executor/method.h>
 #include <executorch/runtime/platform/log.h>
-#include <executorch/sdk/bundled_program/schema/bundled_program_schema_generated.h>
 
 namespace torch {
 namespace executor {
diff --git a/sdk/bundled_program/bundled_program.h b/devtools/bundled_program/bundled_program.h
similarity index 100%
rename from sdk/bundled_program/bundled_program.h
rename to devtools/bundled_program/bundled_program.h
diff --git a/sdk/bundled_program/config.py b/devtools/bundled_program/config.py
similarity index 100%
rename from sdk/bundled_program/config.py
rename to devtools/bundled_program/config.py
diff --git a/sdk/bundled_program/core.py b/devtools/bundled_program/core.py
similarity index 98%
rename from sdk/bundled_program/core.py
rename to devtools/bundled_program/core.py
index 56fc817bbe..c775fb1510 100644
--- a/sdk/bundled_program/core.py
+++ b/devtools/bundled_program/core.py
@@ -8,19 +8,19 @@
 import typing
 from typing import Dict, List, Optional, Sequence, Type, Union
 
-import executorch.exir.schema as core_schema
+import executorch.devtools.bundled_program.schema as bp_schema
 
-import executorch.sdk.bundled_program.schema as bp_schema
+import executorch.exir.schema as core_schema
 
 import torch
 import torch.fx
+from executorch.devtools.bundled_program.config import ConfigValue, MethodTestSuite
+
+from executorch.devtools.bundled_program.version import BUNDLED_PROGRAM_SCHEMA_VERSION
 
 from executorch.exir import ExecutorchProgram, ExecutorchProgramManager
 from executorch.exir._serialize import _serialize_pte_binary
 from executorch.exir.tensor import get_scalar_type, scalar_type_enum, TensorSpec
-from executorch.sdk.bundled_program.config import ConfigValue, MethodTestSuite
-
-from executorch.sdk.bundled_program.version import BUNDLED_PROGRAM_SCHEMA_VERSION
 
 # pyre-ignore
 supported_program_type_table: Dict[Type[core_schema.KernelTypes], ConfigValue] = {
diff --git a/sdk/bundled_program/schema/README.md b/devtools/bundled_program/schema/README.md
similarity index 100%
rename from sdk/bundled_program/schema/README.md
rename to devtools/bundled_program/schema/README.md
diff --git a/sdk/bundled_program/schema/TARGETS b/devtools/bundled_program/schema/TARGETS
similarity index 84%
rename from sdk/bundled_program/schema/TARGETS
rename to devtools/bundled_program/schema/TARGETS
index e9bd642069..51c004cbec 100644
--- a/sdk/bundled_program/schema/TARGETS
+++ b/devtools/bundled_program/schema/TARGETS
@@ -15,8 +15,8 @@ runtime.python_library(
         "bundled_program_schema.py",
     ],
     visibility = [
-        "//executorch/sdk/bundled_program/...",
-        "//executorch/sdk/etrecord/...",
+        "//executorch/devtools/bundled_program/...",
+        "//executorch/devtools/etrecord/...",
     ],
     deps = [
         "//executorch/exir:scalar_type",
diff --git a/sdk/bundled_program/schema/__init__.py b/devtools/bundled_program/schema/__init__.py
similarity index 100%
rename from sdk/bundled_program/schema/__init__.py
rename to devtools/bundled_program/schema/__init__.py
diff --git a/sdk/bundled_program/schema/bundled_program_schema.fbs b/devtools/bundled_program/schema/bundled_program_schema.fbs
similarity index 100%
rename from sdk/bundled_program/schema/bundled_program_schema.fbs
rename to devtools/bundled_program/schema/bundled_program_schema.fbs
diff --git a/sdk/bundled_program/schema/bundled_program_schema.py b/devtools/bundled_program/schema/bundled_program_schema.py
similarity index 100%
rename from sdk/bundled_program/schema/bundled_program_schema.py
rename to devtools/bundled_program/schema/bundled_program_schema.py
diff --git a/sdk/bundled_program/schema/scalar_type.fbs b/devtools/bundled_program/schema/scalar_type.fbs
similarity index 100%
rename from sdk/bundled_program/schema/scalar_type.fbs
rename to devtools/bundled_program/schema/scalar_type.fbs
diff --git a/sdk/bundled_program/schema/targets.bzl b/devtools/bundled_program/schema/targets.bzl
similarity index 93%
rename from sdk/bundled_program/schema/targets.bzl
rename to devtools/bundled_program/schema/targets.bzl
index a25d792c5a..532a01e039 100644
--- a/sdk/bundled_program/schema/targets.bzl
+++ b/devtools/bundled_program/schema/targets.bzl
@@ -49,14 +49,14 @@ def define_common_targets():
     runtime.export_file(
         name = INPUT_BUNDLED,
         visibility = [
-            "//executorch/sdk/bundled_program/serialize/...",
+            "//executorch/devtools/bundled_program/serialize/...",
         ],
     )
 
     runtime.export_file(
         name = INPUT_SCALAR_TYPE,
         visibility = [
-            "//executorch/sdk/bundled_program/serialize/...",
+            "//executorch/devtools/bundled_program/serialize/...",
         ],
     )
 
@@ -72,7 +72,7 @@ def define_common_targets():
         name = BUNDLED_LIBRARY_NAME,
         srcs = [],
         visibility = [
-            "//executorch/sdk/bundled_program/...",
+            "//executorch/devtools/bundled_program/...",
             "//executorch/extension/pybindings/...",
         ],
         exported_headers = {
diff --git a/sdk/bundled_program/schema/test/TARGETS b/devtools/bundled_program/schema/test/TARGETS
similarity index 100%
rename from sdk/bundled_program/schema/test/TARGETS
rename to devtools/bundled_program/schema/test/TARGETS
diff --git a/sdk/bundled_program/schema/test/test_schema.py b/devtools/bundled_program/schema/test/test_schema.py
similarity index 79%
rename from sdk/bundled_program/schema/test/test_schema.py
rename to devtools/bundled_program/schema/test/test_schema.py
index ab3d2760d2..c2a19adef7 100644
--- a/sdk/bundled_program/schema/test/test_schema.py
+++ b/devtools/bundled_program/schema/test/test_schema.py
@@ -20,8 +20,8 @@ def test_schema_sync(self) -> None:
 
         self.assertTrue(
             filecmp.cmp(
-                prefix + "sdk/bundled_program/schema/scalar_type.fbs",
+                prefix + "devtools/bundled_program/schema/scalar_type.fbs",
                 prefix + "schema/scalar_type.fbs",
             ),
-            'Please run "hg cp fbcode//executorch/schema/scalar_type.fbs fbcode//executorch/sdk/bundled_program/schema/scalar_type.fbs" to sync schema changes.',
+            'Please run "hg cp fbcode//executorch/schema/scalar_type.fbs fbcode//executorch/devtools/bundled_program/schema/scalar_type.fbs" to sync schema changes.',
         )
diff --git a/sdk/bundled_program/serialize/TARGETS b/devtools/bundled_program/serialize/TARGETS
similarity index 76%
rename from sdk/bundled_program/serialize/TARGETS
rename to devtools/bundled_program/serialize/TARGETS
index 20abccd7fd..11c5839977 100644
--- a/sdk/bundled_program/serialize/TARGETS
+++ b/devtools/bundled_program/serialize/TARGETS
@@ -10,8 +10,8 @@ runtime.python_library(
         "__init__.py",
     ],
     resources = {
-        "//executorch/sdk/bundled_program/schema:bundled_program_schema.fbs": "bundled_program_schema.fbs",
-        "//executorch/sdk/bundled_program/schema:scalar_type.fbs": "scalar_type.fbs",
+        "//executorch/devtools/bundled_program/schema:bundled_program_schema.fbs": "bundled_program_schema.fbs",
+        "//executorch/devtools/bundled_program/schema:scalar_type.fbs": "scalar_type.fbs",
     },
     # Currently serialization API should only be used in some dedicated targets,
     # to avoid ODR violation when linking with another Flatbuffers library.
@@ -20,18 +20,18 @@ runtime.python_library(
         "//executorch/bacends/...",
         "//executorch/backends/xnnpack/test/...",
         "//executorch/codegen/...",
+        "//executorch/devtools/bundled_program/tests/...",
         "//executorch/examples/async_exec:emit_program_lib",
         "//executorch/exir:lib",
         "//executorch/extension/pybindings/test:test",
         "//executorch/extension/pybindings/test:test-library",
         "//executorch/profiler/...",
-        "//executorch/sdk/bundled_program/tests/...",
         "//executorch/test/...",
         "@EXECUTORCH_CLIENTS",
     ],
     deps = [
         "fbsource//third-party/pypi/setuptools:setuptools",
+        "//executorch/devtools/bundled_program/schema:bundled_program_schema_py",
         "//executorch/exir/_serialize:lib",
-        "//executorch/sdk/bundled_program/schema:bundled_program_schema_py",
     ],
 )
diff --git a/sdk/bundled_program/serialize/__init__.py b/devtools/bundled_program/serialize/__init__.py
similarity index 97%
rename from sdk/bundled_program/serialize/__init__.py
rename to devtools/bundled_program/serialize/__init__.py
index e0c75574c9..075436e9c1 100644
--- a/sdk/bundled_program/serialize/__init__.py
+++ b/devtools/bundled_program/serialize/__init__.py
@@ -12,14 +12,14 @@
 import os
 import tempfile
 
-import executorch.sdk.bundled_program.schema as bp_schema
+import executorch.devtools.bundled_program.schema as bp_schema
 
 # @manual=fbsource//third-party/pypi/setuptools:setuptools
 import pkg_resources
+from executorch.devtools.bundled_program.core import BundledProgram
 
 from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass
 from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile
-from executorch.sdk.bundled_program.core import BundledProgram
 
 # The prefix of schema files used for bundled program
 BUNDLED_PROGRAM_SCHEMA_NAME = "bundled_program_schema"
diff --git a/sdk/bundled_program/serialize/test/TARGETS b/devtools/bundled_program/serialize/test/TARGETS
similarity index 51%
rename from sdk/bundled_program/serialize/test/TARGETS
rename to devtools/bundled_program/serialize/test/TARGETS
index 85f55c02f8..dd92f63f2d 100644
--- a/sdk/bundled_program/serialize/test/TARGETS
+++ b/devtools/bundled_program/serialize/test/TARGETS
@@ -10,9 +10,8 @@ python_unittest(
         "test_serialize.py",
     ],
     deps = [
-        "//executorch/exir:print_program",
-        "//executorch/sdk/bundled_program:core",
-        "//executorch/sdk/bundled_program/serialize:lib",
-        "//executorch/sdk/bundled_program/util:test_util",
+        "//executorch/devtools/bundled_program:core",
+        "//executorch/devtools/bundled_program/serialize:lib",
+        "//executorch/devtools/bundled_program/util:test_util",
     ],
 )
diff --git a/sdk/bundled_program/serialize/test/test_serialize.py b/devtools/bundled_program/serialize/test/test_serialize.py
similarity index 82%
rename from sdk/bundled_program/serialize/test/test_serialize.py
rename to devtools/bundled_program/serialize/test/test_serialize.py
index 1db6871fc0..48a914d144 100644
--- a/sdk/bundled_program/serialize/test/test_serialize.py
+++ b/devtools/bundled_program/serialize/test/test_serialize.py
@@ -8,13 +8,15 @@
 
 import unittest
 
-from executorch.sdk.bundled_program.core import BundledProgram
+from executorch.devtools.bundled_program.core import BundledProgram
 
-from executorch.sdk.bundled_program.serialize import (
+from executorch.devtools.bundled_program.serialize import (
     deserialize_from_flatbuffer_to_bundled_program,
     serialize_from_bundled_program_to_flatbuffer,
 )
-from executorch.sdk.bundled_program.util.test_util import get_common_executorch_program
+from executorch.devtools.bundled_program.util.test_util import (
+    get_common_executorch_program,
+)
 
 
 class TestSerialize(unittest.TestCase):
diff --git a/sdk/bundled_program/targets.bzl b/devtools/bundled_program/targets.bzl
similarity index 91%
rename from sdk/bundled_program/targets.bzl
rename to devtools/bundled_program/targets.bzl
index a3268dff2c..7035b3b31f 100644
--- a/sdk/bundled_program/targets.bzl
+++ b/devtools/bundled_program/targets.bzl
@@ -19,7 +19,7 @@ def define_common_targets():
             ],
             deps = [
                 "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix,
-                "//executorch/sdk/bundled_program/schema:bundled_program_schema_fbs",
+                "//executorch/devtools/bundled_program/schema:bundled_program_schema_fbs",
             ],
             exported_deps = [
                 "//executorch/runtime/core:memory_allocator",
diff --git a/sdk/bundled_program/test/TARGETS b/devtools/bundled_program/test/TARGETS
similarity index 68%
rename from sdk/bundled_program/test/TARGETS
rename to devtools/bundled_program/test/TARGETS
index caf69be60e..652c74b8f4 100644
--- a/sdk/bundled_program/test/TARGETS
+++ b/devtools/bundled_program/test/TARGETS
@@ -1,4 +1,5 @@
 # @noautodeps
+
 load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
 
 oncall("executorch")
@@ -10,11 +11,11 @@ python_unittest(
     ],
     deps = [
         "//caffe2:torch",
+        "//executorch/devtools/bundled_program:config",
+        "//executorch/devtools/bundled_program:core",
+        "//executorch/devtools/bundled_program/schema:bundled_program_schema_py",
+        "//executorch/devtools/bundled_program/util:test_util",
         "//executorch/exir/_serialize:lib",
-        "//executorch/sdk/bundled_program:config",
-        "//executorch/sdk/bundled_program:core",
-        "//executorch/sdk/bundled_program/schema:bundled_program_schema_py",
-        "//executorch/sdk/bundled_program/util:test_util",
     ],
 )
 
@@ -25,9 +26,9 @@ python_unittest(
     ],
     deps = [
         "//caffe2:torch",
+        "//executorch/devtools/bundled_program:config",
+        "//executorch/devtools/bundled_program/util:test_util",
         "//executorch/extension/pytree:pylib",
-        "//executorch/sdk/bundled_program:config",
-        "//executorch/sdk/bundled_program/util:test_util",
     ],
 )
 
@@ -38,6 +39,10 @@ python_unittest(
     ],
     deps = [
         "//caffe2:torch",
+        "//executorch/devtools/bundled_program:config",
+        "//executorch/devtools/bundled_program:core",
+        "//executorch/devtools/bundled_program/serialize:lib",
+        "//executorch/devtools/bundled_program/util:test_util",
         "//executorch/exir:dynamic_shape",
         "//executorch/exir:lib",
         "//executorch/exir:memory",
@@ -54,9 +59,5 @@ python_unittest(
         "//executorch/extension/pybindings:portable_lib",
         "//executorch/extension/pytree:pybindings",
         "//executorch/kernels/portable:custom_ops_generated_lib",
-        "//executorch/sdk/bundled_program:config",
-        "//executorch/sdk/bundled_program:core",
-        "//executorch/sdk/bundled_program/serialize:lib",
-        "//executorch/sdk/bundled_program/util:test_util",
     ],
 )
diff --git a/sdk/bundled_program/test/test_bundle_data.py b/devtools/bundled_program/test/test_bundle_data.py
similarity index 93%
rename from sdk/bundled_program/test/test_bundle_data.py
rename to devtools/bundled_program/test/test_bundle_data.py
index a8d9485c5f..565539cbf1 100644
--- a/sdk/bundled_program/test/test_bundle_data.py
+++ b/devtools/bundled_program/test/test_bundle_data.py
@@ -9,13 +9,15 @@
 import unittest
 from typing import List
 
-import executorch.sdk.bundled_program.schema as bp_schema
+import executorch.devtools.bundled_program.schema as bp_schema
 
 import torch
+from executorch.devtools.bundled_program.config import ConfigValue
+from executorch.devtools.bundled_program.core import BundledProgram
+from executorch.devtools.bundled_program.util.test_util import (
+    get_common_executorch_program,
+)
 from executorch.exir._serialize import _serialize_pte_binary
-from executorch.sdk.bundled_program.config import ConfigValue
-from executorch.sdk.bundled_program.core import BundledProgram
-from executorch.sdk.bundled_program.util.test_util import get_common_executorch_program
 
 
 class TestBundle(unittest.TestCase):
diff --git a/sdk/bundled_program/test/test_config.py b/devtools/bundled_program/test/test_config.py
similarity index 97%
rename from sdk/bundled_program/test/test_config.py
rename to devtools/bundled_program/test/test_config.py
index 3183ad907f..21f3d48042 100644
--- a/sdk/bundled_program/test/test_config.py
+++ b/devtools/bundled_program/test/test_config.py
@@ -10,14 +10,14 @@
 from typing import get_args, List, Union
 
 import torch
-from executorch.extension.pytree import tree_flatten
-from executorch.sdk.bundled_program.config import DataContainer
+from executorch.devtools.bundled_program.config import DataContainer
 
-from executorch.sdk.bundled_program.util.test_util import (
+from executorch.devtools.bundled_program.util.test_util import (
     get_random_test_suites,
     get_random_test_suites_with_eager_model,
     SampleModel,
 )
+from executorch.extension.pytree import tree_flatten
 
 
 class TestConfig(unittest.TestCase):
diff --git a/sdk/bundled_program/test/test_end2end.py b/devtools/bundled_program/test/test_end2end.py
similarity index 88%
rename from sdk/bundled_program/test/test_end2end.py
rename to devtools/bundled_program/test/test_end2end.py
index 99d58ee15c..7cee073be0 100644
--- a/sdk/bundled_program/test/test_end2end.py
+++ b/devtools/bundled_program/test/test_end2end.py
@@ -21,12 +21,12 @@
 
 import torch
 
-from executorch.sdk.bundled_program.core import BundledProgram
-from executorch.sdk.bundled_program.serialize import (
+from executorch.devtools.bundled_program.core import BundledProgram
+from executorch.devtools.bundled_program.serialize import (
     serialize_from_bundled_program_to_flatbuffer,
 )
 
-from executorch.sdk.bundled_program.util.test_util import (
+from executorch.devtools.bundled_program.util.test_util import (
     get_common_executorch_program,
     SampleModel,
 )
@@ -45,7 +45,7 @@
     pass
 
 try:
-    from executorch.extension.pybindings.aten_lib import (
+    from executorch.extension.pybindings.aten_lib import (  # @manual=//executorch/extension/pybindings:aten_lib
         _load_bundled_program_from_buffer,
         _load_for_executorch_from_buffer,
         _load_for_executorch_from_bundled_program,
diff --git a/sdk/bundled_program/util/TARGETS b/devtools/bundled_program/util/TARGETS
similarity index 68%
rename from sdk/bundled_program/util/TARGETS
rename to devtools/bundled_program/util/TARGETS
index 17d19dfb29..7d019ce30f 100644
--- a/sdk/bundled_program/util/TARGETS
+++ b/devtools/bundled_program/util/TARGETS
@@ -7,10 +7,10 @@ python_library(
     srcs = [
         "test_util.py",
     ],
-    visibility = ["//executorch/sdk/bundled_program/..."],
+    visibility = ["//executorch/devtools/bundled_program/..."],
     deps = [
         "//caffe2:torch",
+        "//executorch/devtools/bundled_program:config",
         "//executorch/exir:lib",
-        "//executorch/sdk/bundled_program:config",
     ],
 )
diff --git a/sdk/bundled_program/util/test_util.py b/devtools/bundled_program/util/test_util.py
similarity index 99%
rename from sdk/bundled_program/util/test_util.py
rename to devtools/bundled_program/util/test_util.py
index bfea8158ac..505186f3a0 100644
--- a/sdk/bundled_program/util/test_util.py
+++ b/devtools/bundled_program/util/test_util.py
@@ -10,14 +10,14 @@
 from typing import List, Tuple
 
 import torch
-
-from executorch.exir import ExecutorchProgramManager, to_edge
-from executorch.sdk.bundled_program.config import (
+from executorch.devtools.bundled_program.config import (
     MethodInputType,
     MethodOutputType,
     MethodTestCase,
     MethodTestSuite,
 )
+
+from executorch.exir import ExecutorchProgramManager, to_edge
 from torch.export import export
 from torch.export.unflatten import _assign_attr, _AttrKind
 
diff --git a/sdk/bundled_program/version.py b/devtools/bundled_program/version.py
similarity index 100%
rename from sdk/bundled_program/version.py
rename to devtools/bundled_program/version.py
diff --git a/sdk/debug_format/TARGETS b/devtools/debug_format/TARGETS
similarity index 100%
rename from sdk/debug_format/TARGETS
rename to devtools/debug_format/TARGETS
diff --git a/sdk/debug_format/base_schema.py b/devtools/debug_format/base_schema.py
similarity index 100%
rename from sdk/debug_format/base_schema.py
rename to devtools/debug_format/base_schema.py
diff --git a/sdk/debug_format/et_schema.py b/devtools/debug_format/et_schema.py
similarity index 99%
rename from sdk/debug_format/et_schema.py
rename to devtools/debug_format/et_schema.py
index 9a6af4edba..abe155233a 100644
--- a/sdk/debug_format/et_schema.py
+++ b/devtools/debug_format/et_schema.py
@@ -21,7 +21,7 @@
 
 import torch
 from executorch import exir
-from executorch.sdk.debug_format.base_schema import (
+from executorch.devtools.debug_format.base_schema import (
     Node,
     OperatorGraph,
     OperatorNode,
diff --git a/sdk/etdump/TARGETS b/devtools/etdump/TARGETS
similarity index 81%
rename from sdk/etdump/TARGETS
rename to devtools/etdump/TARGETS
index 22d07478cb..7dcc4c1e84 100644
--- a/sdk/etdump/TARGETS
+++ b/devtools/etdump/TARGETS
@@ -11,7 +11,7 @@ runtime.python_library(
         "schema_flatcc.py",
     ],
     visibility = [
-        "//executorch/sdk/...",
+        "//executorch/devtools/...",
     ],
     deps = [
         "//executorch/exir:scalar_type",
@@ -24,11 +24,11 @@ runtime.python_library(
         "serialize.py",
     ],
     resources = {
+        "//executorch/devtools/etdump:etdump_schema_flatcc.fbs": "etdump_schema_flatcc.fbs",
         "//executorch/schema:scalar_type.fbs": "scalar_type.fbs",
-        "//executorch/sdk/etdump:etdump_schema_flatcc.fbs": "etdump_schema_flatcc.fbs",
     },
     visibility = [
-        "//executorch/sdk/...",
+        "//executorch/devtools/...",
     ],
     deps = [
         "fbsource//third-party/pypi/setuptools:setuptools",
diff --git a/sdk/etdump/emitter.cpp b/devtools/etdump/emitter.cpp
similarity index 98%
rename from sdk/etdump/emitter.cpp
rename to devtools/etdump/emitter.cpp
index 1b3cba9d19..dfca629530 100644
--- a/sdk/etdump/emitter.cpp
+++ b/devtools/etdump/emitter.cpp
@@ -9,8 +9,8 @@
 #include <stdio.h>
 #include <cstdint>
 
+#include "executorch/devtools/etdump/emitter.h"
 #include "executorch/runtime/platform/assert.h"
-#include "executorch/sdk/etdump/emitter.h"
 
 namespace torch {
 namespace executor {
diff --git a/sdk/etdump/emitter.h b/devtools/etdump/emitter.h
similarity index 92%
rename from sdk/etdump/emitter.h
rename to devtools/etdump/emitter.h
index 3910d3bd27..bf8ab0b1e1 100644
--- a/sdk/etdump/emitter.h
+++ b/devtools/etdump/emitter.h
@@ -9,7 +9,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 
-#include <executorch/sdk/etdump/etdump_flatcc.h>
+#include <executorch/devtools/etdump/etdump_flatcc.h>
 #include <flatcc/flatcc_builder.h>
 
 #pragma once
diff --git a/sdk/etdump/etdump_flatcc.cpp b/devtools/etdump/etdump_flatcc.cpp
similarity index 98%
rename from sdk/etdump/etdump_flatcc.cpp
rename to devtools/etdump/etdump_flatcc.cpp
index dab1443b55..ca46c12f51 100644
--- a/sdk/etdump/etdump_flatcc.cpp
+++ b/devtools/etdump/etdump_flatcc.cpp
@@ -6,16 +6,16 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include "executorch/sdk/etdump/etdump_flatcc.h"
-#include <executorch/sdk/etdump/etdump_schema_flatcc_builder.h>
-#include <executorch/sdk/etdump/etdump_schema_flatcc_reader.h>
+#include "executorch/devtools/etdump/etdump_flatcc.h"
+#include <executorch/devtools/etdump/etdump_schema_flatcc_builder.h>
+#include <executorch/devtools/etdump/etdump_schema_flatcc_reader.h>
 #include <flatcc/flatcc_types.h>
 #include <stdio.h>
 #include <string.h>
+#include "executorch/devtools/etdump/emitter.h"
 #include "executorch/runtime/core/exec_aten/exec_aten.h"
 #include "executorch/runtime/core/exec_aten/util/scalar_type_util.h"
 #include "executorch/runtime/platform/assert.h"
-#include "executorch/sdk/etdump/emitter.h"
 
 namespace torch {
 namespace executor {
diff --git a/sdk/etdump/etdump_flatcc.h b/devtools/etdump/etdump_flatcc.h
similarity index 100%
rename from sdk/etdump/etdump_flatcc.h
rename to devtools/etdump/etdump_flatcc.h
diff --git a/sdk/etdump/etdump_schema_flatcc.fbs b/devtools/etdump/etdump_schema_flatcc.fbs
similarity index 100%
rename from sdk/etdump/etdump_schema_flatcc.fbs
rename to devtools/etdump/etdump_schema_flatcc.fbs
diff --git a/sdk/etdump/scalar_type.fbs b/devtools/etdump/scalar_type.fbs
similarity index 100%
rename from sdk/etdump/scalar_type.fbs
rename to devtools/etdump/scalar_type.fbs
diff --git a/sdk/etdump/schema_flatcc.py b/devtools/etdump/schema_flatcc.py
similarity index 97%
rename from sdk/etdump/schema_flatcc.py
rename to devtools/etdump/schema_flatcc.py
index eaad876a53..f19f328d3f 100644
--- a/sdk/etdump/schema_flatcc.py
+++ b/devtools/etdump/schema_flatcc.py
@@ -7,7 +7,7 @@
 # pyre-strict
 """
 This file is the python representation of the schema contained in
-executorch/sdk/etdump/etdump_schema.fbs. Any changes made to that
+executorch/devtools/etdump/etdump_schema_flatcc.fbs. Any changes made to that
 flatbuffer schema should accordingly be reflected here also.
 """
 
diff --git a/sdk/etdump/serialize.py b/devtools/etdump/serialize.py
similarity index 98%
rename from sdk/etdump/serialize.py
rename to devtools/etdump/serialize.py
index 0cc6682bfc..4ed63bc385 100644
--- a/sdk/etdump/serialize.py
+++ b/devtools/etdump/serialize.py
@@ -11,11 +11,11 @@
 import tempfile
 
 import pkg_resources
+from executorch.devtools.etdump.schema_flatcc import ETDumpFlatCC
 
 from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass
 
 from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile
-from executorch.sdk.etdump.schema_flatcc import ETDumpFlatCC
 
 # The prefix of schema files used for etdump
 ETDUMP_FLATCC_SCHEMA_NAME = "etdump_schema_flatcc"
diff --git a/sdk/etdump/targets.bzl b/devtools/etdump/targets.bzl
similarity index 100%
rename from sdk/etdump/targets.bzl
rename to devtools/etdump/targets.bzl
diff --git a/sdk/etdump/tests/CMakeLists.txt b/devtools/etdump/tests/CMakeLists.txt
similarity index 100%
rename from sdk/etdump/tests/CMakeLists.txt
rename to devtools/etdump/tests/CMakeLists.txt
diff --git a/sdk/etdump/tests/TARGETS b/devtools/etdump/tests/TARGETS
similarity index 75%
rename from sdk/etdump/tests/TARGETS
rename to devtools/etdump/tests/TARGETS
index ad48948c48..51e807891d 100644
--- a/sdk/etdump/tests/TARGETS
+++ b/devtools/etdump/tests/TARGETS
@@ -11,8 +11,8 @@ python_unittest(
         "serialize_test.py",
     ],
     deps = [
+        "//executorch/devtools/etdump:schema_flatcc",
+        "//executorch/devtools/etdump:serialize",
         "//executorch/exir/_serialize:lib",
-        "//executorch/sdk/etdump:schema_flatcc",
-        "//executorch/sdk/etdump:serialize",
     ],
 )
diff --git a/sdk/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp
similarity index 99%
rename from sdk/etdump/tests/etdump_test.cpp
rename to devtools/etdump/tests/etdump_test.cpp
index d30bd9a303..de8c0abc39 100644
--- a/sdk/etdump/tests/etdump_test.cpp
+++ b/devtools/etdump/tests/etdump_test.cpp
@@ -9,12 +9,12 @@
 #include <gtest/gtest.h>
 #include <cstdio>
 
+#include <executorch/devtools/etdump/etdump_flatcc.h>
+#include <executorch/devtools/etdump/etdump_schema_flatcc_builder.h>
+#include <executorch/devtools/etdump/etdump_schema_flatcc_reader.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/span.h>
 #include <executorch/runtime/platform/runtime.h>
-#include <executorch/sdk/etdump/etdump_flatcc.h>
-#include <executorch/sdk/etdump/etdump_schema_flatcc_builder.h>
-#include <executorch/sdk/etdump/etdump_schema_flatcc_reader.h>
 #include <executorch/test/utils/DeathTest.h>
 #include <cstdint>
 #include <cstring>
diff --git a/sdk/etdump/tests/serialize_test.py b/devtools/etdump/tests/serialize_test.py
similarity index 97%
rename from sdk/etdump/tests/serialize_test.py
rename to devtools/etdump/tests/serialize_test.py
index 2b1497f597..1a7f3bd93f 100644
--- a/sdk/etdump/tests/serialize_test.py
+++ b/devtools/etdump/tests/serialize_test.py
@@ -12,13 +12,13 @@
 from pprint import pformat
 from typing import List
 
-import executorch.sdk.etdump.schema_flatcc as flatcc
-from executorch.exir._serialize._dataclass import _DataclassEncoder
+import executorch.devtools.etdump.schema_flatcc as flatcc
 
-from executorch.sdk.etdump.serialize import (
+from executorch.devtools.etdump.serialize import (
     deserialize_from_etdump_flatcc,
     serialize_to_etdump_flatcc,
 )
+from executorch.exir._serialize._dataclass import _DataclassEncoder
 
 
 def diff_jsons(a: str, b: str) -> List[str]:
diff --git a/sdk/etdump/tests/targets.bzl b/devtools/etdump/tests/targets.bzl
similarity index 82%
rename from sdk/etdump/tests/targets.bzl
rename to devtools/etdump/tests/targets.bzl
index 41b19ca65e..5299b7c1cb 100644
--- a/sdk/etdump/tests/targets.bzl
+++ b/devtools/etdump/tests/targets.bzl
@@ -13,8 +13,8 @@ def define_common_targets():
             "etdump_test.cpp",
         ],
         deps = [
-            "//executorch/sdk/etdump:etdump_flatcc",
-            "//executorch/sdk/etdump:etdump_schema_flatcc",
+            "//executorch/devtools/etdump:etdump_flatcc",
+            "//executorch/devtools/etdump:etdump_schema_flatcc",
             "//executorch/runtime/platform:platform",
             "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
         ],
diff --git a/sdk/etrecord/TARGETS b/devtools/etrecord/TARGETS
similarity index 71%
rename from sdk/etrecord/TARGETS
rename to devtools/etrecord/TARGETS
index c7de63a81f..09fc3212bf 100644
--- a/sdk/etrecord/TARGETS
+++ b/devtools/etrecord/TARGETS
@@ -9,10 +9,10 @@ python_library(
         "_etrecord.py",
     ],
     deps = [
+        "//executorch/devtools/bundled_program:core",
+        "//executorch/devtools/bundled_program/schema:bundled_program_schema_py",
         "//executorch/exir:lib",
         "//executorch/exir/emit:emit",
         "//executorch/exir/serde:serialize",
-        "//executorch/sdk/bundled_program:core",
-        "//executorch/sdk/bundled_program/schema:bundled_program_schema_py",
     ],
 )
diff --git a/sdk/etrecord/__init__.py b/devtools/etrecord/__init__.py
similarity index 86%
rename from sdk/etrecord/__init__.py
rename to devtools/etrecord/__init__.py
index 29c29462a7..59ff4e44c2 100644
--- a/sdk/etrecord/__init__.py
+++ b/devtools/etrecord/__init__.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from executorch.sdk.etrecord._etrecord import (
+from executorch.devtools.etrecord._etrecord import (
     ETRecord,
     generate_etrecord,
     parse_etrecord,
diff --git a/sdk/etrecord/_etrecord.py b/devtools/etrecord/_etrecord.py
similarity index 98%
rename from sdk/etrecord/_etrecord.py
rename to devtools/etrecord/_etrecord.py
index 1ae46f27aa..cd21325498 100644
--- a/sdk/etrecord/_etrecord.py
+++ b/devtools/etrecord/_etrecord.py
@@ -12,6 +12,9 @@
 from zipfile import BadZipFile, ZipFile
 
 from executorch import exir
+from executorch.devtools.bundled_program.core import BundledProgram
+
+from executorch.devtools.bundled_program.schema.bundled_program_schema import Value
 from executorch.exir import (
     EdgeProgramManager,
     ExecutorchProgram,
@@ -23,9 +26,6 @@
 
 from executorch.exir.serde.export_serialize import SerializedArtifact
 from executorch.exir.serde.serialize import deserialize, serialize
-from executorch.sdk.bundled_program.core import BundledProgram
-
-from executorch.sdk.bundled_program.schema.bundled_program_schema import Value
 
 ProgramOutput = List[Value]
 
diff --git a/sdk/etrecord/tests/TARGETS b/devtools/etrecord/tests/TARGETS
similarity index 64%
rename from sdk/etrecord/tests/TARGETS
rename to devtools/etrecord/tests/TARGETS
index 0984c755a4..fffa7f1834 100644
--- a/sdk/etrecord/tests/TARGETS
+++ b/devtools/etrecord/tests/TARGETS
@@ -8,11 +8,11 @@ python_unittest(
     srcs = ["etrecord_test.py"],
     deps = [
         "//caffe2:torch",
+        "//executorch/devtools/bundled_program:config",
+        "//executorch/devtools/bundled_program:core",
+        "//executorch/devtools/etrecord:etrecord",
         "//executorch/exir:lib",
         "//executorch/exir/tests:models",
-        "//executorch/sdk/bundled_program:config",
-        "//executorch/sdk/bundled_program:core",
-        "//executorch/sdk/etrecord:etrecord",
     ],
 )
 
@@ -21,10 +21,10 @@ python_library(
     srcs = ["etrecord_test.py"],
     deps = [
         "//caffe2:torch",
+        "//executorch/devtools/bundled_program:config",
+        "//executorch/devtools/bundled_program:core",
+        "//executorch/devtools/etrecord:etrecord",
         "//executorch/exir:lib",
         "//executorch/exir/tests:models",
-        "//executorch/sdk/bundled_program:config",
-        "//executorch/sdk/bundled_program:core",
-        "//executorch/sdk/etrecord:etrecord",
     ],
 )
diff --git a/sdk/etrecord/tests/etrecord_test.py b/devtools/etrecord/tests/etrecord_test.py
similarity index 96%
rename from sdk/etrecord/tests/etrecord_test.py
rename to devtools/etrecord/tests/etrecord_test.py
index bc534fd487..b8e08dfe8c 100644
--- a/sdk/etrecord/tests/etrecord_test.py
+++ b/devtools/etrecord/tests/etrecord_test.py
@@ -12,14 +12,14 @@
 import executorch.exir.tests.models as models
 import torch
 from executorch import exir
-from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge
-from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite
-from executorch.sdk.bundled_program.core import BundledProgram
-from executorch.sdk.etrecord import generate_etrecord, parse_etrecord
-from executorch.sdk.etrecord._etrecord import (
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
+from executorch.devtools.bundled_program.core import BundledProgram
+from executorch.devtools.etrecord import generate_etrecord, parse_etrecord
+from executorch.devtools.etrecord._etrecord import (
     _get_reference_outputs,
     ETRecordReservedFileNames,
 )
+from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge
 from torch.export import export
 
 
diff --git a/sdk/inspector/TARGETS b/devtools/inspector/TARGETS
similarity index 70%
rename from sdk/inspector/TARGETS
rename to devtools/inspector/TARGETS
index bc53c90c11..2b1cbecff3 100644
--- a/sdk/inspector/TARGETS
+++ b/devtools/inspector/TARGETS
@@ -14,10 +14,10 @@ python_library(
         "fbsource//third-party/pypi/pandas:pandas",
         "fbsource//third-party/pypi/tabulate:tabulate",
         ":inspector_utils",
+        "//executorch/devtools/debug_format:et_schema",
+        "//executorch/devtools/etdump:schema_flatcc",
+        "//executorch/devtools/etrecord:etrecord",
         "//executorch/exir:lib",
-        "//executorch/sdk/debug_format:et_schema",
-        "//executorch/sdk/etdump:schema_flatcc",
-        "//executorch/sdk/etrecord:etrecord",
     ],
 )
 
@@ -27,7 +27,7 @@ python_binary(
     main_src = "inspector_cli.py",
     deps = [
         ":inspector_utils",
-        "//executorch/sdk:lib",
+        "//executorch/devtools:lib",
     ],
 )
 
@@ -40,11 +40,11 @@ python_library(
         "fbsource//third-party/pypi/matplotlib:matplotlib",
         "fbsource//third-party/pypi/numpy:numpy",
         "//caffe2:torch",
-        "//executorch/sdk/debug_format:base_schema",
-        "//executorch/sdk/debug_format:et_schema",
-        "//executorch/sdk/etdump:schema_flatcc",
-        "//executorch/sdk/etdump:serialize",
-        "//executorch/sdk/etrecord:etrecord",
+        "//executorch/devtools/debug_format:base_schema",
+        "//executorch/devtools/debug_format:et_schema",
+        "//executorch/devtools/etdump:schema_flatcc",
+        "//executorch/devtools/etdump:serialize",
+        "//executorch/devtools/etrecord:etrecord",
     ],
 )
 
diff --git a/sdk/inspector/__init__.py b/devtools/inspector/__init__.py
similarity index 60%
rename from sdk/inspector/__init__.py
rename to devtools/inspector/__init__.py
index bef3d363d5..ff9bb81479 100644
--- a/sdk/inspector/__init__.py
+++ b/devtools/inspector/__init__.py
@@ -4,7 +4,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from executorch.sdk.inspector._inspector import Event, EventBlock, Inspector, PerfData
-from executorch.sdk.inspector._inspector_utils import TimeScale
+from executorch.devtools.inspector._inspector import (
+    Event,
+    EventBlock,
+    Inspector,
+    PerfData,
+)
+from executorch.devtools.inspector._inspector_utils import TimeScale
 
 __all__ = ["Event", "EventBlock", "Inspector", "PerfData", "TimeScale"]
diff --git a/sdk/inspector/_inspector.py b/devtools/inspector/_inspector.py
similarity index 99%
rename from sdk/inspector/_inspector.py
rename to devtools/inspector/_inspector.py
index 5f9bfafee7..f98e3cd3a5 100644
--- a/sdk/inspector/_inspector.py
+++ b/devtools/inspector/_inspector.py
@@ -26,16 +26,19 @@
     Union,
 )
 
-import executorch.sdk.etdump.schema_flatcc as flatcc
+import executorch.devtools.etdump.schema_flatcc as flatcc
 
 import numpy as np
 import pandas as pd
-from executorch.exir import ExportedProgram
 
-from executorch.sdk.debug_format.et_schema import OperatorGraph, OperatorNode
-from executorch.sdk.etdump.schema_flatcc import DebugEvent, ETDumpFlatCC, ProfileEvent
-from executorch.sdk.etrecord import ETRecord, parse_etrecord
-from executorch.sdk.inspector._inspector_utils import (
+from executorch.devtools.debug_format.et_schema import OperatorGraph, OperatorNode
+from executorch.devtools.etdump.schema_flatcc import (
+    DebugEvent,
+    ETDumpFlatCC,
+    ProfileEvent,
+)
+from executorch.devtools.etrecord import ETRecord, parse_etrecord
+from executorch.devtools.inspector._inspector_utils import (
     create_debug_handle_to_op_node_mapping,
     EDGE_DIALECT_GRAPH_KEY,
     EXCLUDED_COLUMNS_WHEN_PRINTING,
@@ -53,6 +56,7 @@
     TimeScale,
     verify_debug_data_equivalence,
 )
+from executorch.exir import ExportedProgram
 
 from tabulate import tabulate
 
diff --git a/sdk/inspector/_inspector_utils.py b/devtools/inspector/_inspector_utils.py
similarity index 97%
rename from sdk/inspector/_inspector_utils.py
rename to devtools/inspector/_inspector_utils.py
index 6879e85505..98b5fdc722 100644
--- a/sdk/inspector/_inspector_utils.py
+++ b/devtools/inspector/_inspector_utils.py
@@ -8,14 +8,14 @@
 from enum import Enum
 from typing import Dict, List, Mapping, Optional, Tuple, TypeAlias, Union
 
-import executorch.sdk.etdump.schema_flatcc as flatcc
+import executorch.devtools.etdump.schema_flatcc as flatcc
 
 import torch
 
-from executorch.sdk.debug_format.base_schema import OperatorNode
+from executorch.devtools.debug_format.base_schema import OperatorNode
 
-from executorch.sdk.debug_format.et_schema import FXOperatorGraph, OperatorGraph
-from executorch.sdk.etdump.schema_flatcc import (
+from executorch.devtools.debug_format.et_schema import FXOperatorGraph, OperatorGraph
+from executorch.devtools.etdump.schema_flatcc import (
     DebugEvent,
     ETDumpFlatCC,
     ProfileEvent,
@@ -25,8 +25,8 @@
     ValueType,
 )
 
-from executorch.sdk.etdump.serialize import deserialize_from_etdump_flatcc
-from executorch.sdk.etrecord import ETRecord
+from executorch.devtools.etdump.serialize import deserialize_from_etdump_flatcc
+from executorch.devtools.etrecord import ETRecord
 
 FORWARD = "forward"
 EDGE_DIALECT_GRAPH_KEY = "edge_dialect_graph_module"
diff --git a/sdk/inspector/inspector_cli.py b/devtools/inspector/inspector_cli.py
similarity index 93%
rename from sdk/inspector/inspector_cli.py
rename to devtools/inspector/inspector_cli.py
index d6c8d5442f..bd76607a94 100644
--- a/sdk/inspector/inspector_cli.py
+++ b/devtools/inspector/inspector_cli.py
@@ -6,8 +6,8 @@
 
 import argparse
 
-from executorch.sdk import Inspector
-from executorch.sdk.inspector._inspector_utils import compare_results, TimeScale
+from executorch.devtools import Inspector
+from executorch.devtools.inspector._inspector_utils import compare_results, TimeScale
 
 
 def main() -> None:
diff --git a/devtools/inspector/tests/TARGETS b/devtools/inspector/tests/TARGETS
new file mode 100644
index 0000000000..eada6817bc
--- /dev/null
+++ b/devtools/inspector/tests/TARGETS
@@ -0,0 +1,41 @@
+load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
+
+oncall("executorch")
+
+python_unittest(
+    name = "inspector_test",
+    srcs = ["inspector_test.py"],
+    deps = [
+        "//executorch/devtools:lib",
+        "//executorch/devtools/debug_format:et_schema",
+        "//executorch/devtools/etdump:schema_flatcc",
+        "//executorch/devtools/etrecord/tests:etrecord_test_library",
+        "//executorch/devtools/inspector:inspector",
+        "//executorch/devtools/inspector:lib",
+        "//executorch/exir:lib",
+    ],
+)
+
+python_unittest(
+    name = "event_blocks_test",
+    srcs = ["event_blocks_test.py"],
+    deps = [
+        "//executorch/devtools/etdump:schema_flatcc",
+        "//executorch/devtools/inspector:inspector",
+        "//executorch/devtools/inspector:lib",
+    ],
+)
+
+python_unittest(
+    name = "inspector_utils_test",
+    srcs = ["inspector_utils_test.py"],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/devtools:lib",
+        "//executorch/devtools/debug_format:base_schema",
+        "//executorch/devtools/debug_format:et_schema",
+        "//executorch/devtools/etdump:schema_flatcc",
+        "//executorch/devtools/etrecord/tests:etrecord_test_library",
+        "//executorch/devtools/inspector:inspector_utils",
+    ],
+)
diff --git a/sdk/inspector/tests/event_blocks_test.py b/devtools/inspector/tests/event_blocks_test.py
similarity index 98%
rename from sdk/inspector/tests/event_blocks_test.py
rename to devtools/inspector/tests/event_blocks_test.py
index 7c7da00186..4101035f99 100644
--- a/sdk/inspector/tests/event_blocks_test.py
+++ b/devtools/inspector/tests/event_blocks_test.py
@@ -8,10 +8,10 @@
 import unittest
 from typing import List, Optional, Tuple, Union
 
-import executorch.sdk.etdump.schema_flatcc as flatcc
-from executorch.sdk.etdump.schema_flatcc import ETDumpFlatCC, ProfileEvent
-from executorch.sdk.inspector import Event, EventBlock, PerfData
-from executorch.sdk.inspector._inspector import (
+import executorch.devtools.etdump.schema_flatcc as flatcc
+from executorch.devtools.etdump.schema_flatcc import ETDumpFlatCC, ProfileEvent
+from executorch.devtools.inspector import Event, EventBlock, PerfData
+from executorch.devtools.inspector._inspector import (
     DelegateMetadata,
     EventSignature,
     InstructionEvent,
diff --git a/sdk/inspector/tests/inspector_test.py b/devtools/inspector/tests/inspector_test.py
similarity index 97%
rename from sdk/inspector/tests/inspector_test.py
rename to devtools/inspector/tests/inspector_test.py
index a372c7c569..55f0cd10ae 100644
--- a/sdk/inspector/tests/inspector_test.py
+++ b/devtools/inspector/tests/inspector_test.py
@@ -14,14 +14,19 @@
 
 from unittest.mock import patch
 
-from executorch.exir import ExportedProgram
-from executorch.sdk import generate_etrecord, parse_etrecord
-from executorch.sdk.debug_format.et_schema import OperatorNode
-from executorch.sdk.etdump.schema_flatcc import ProfileEvent
-from executorch.sdk.etrecord.tests.etrecord_test import TestETRecord
-
-from executorch.sdk.inspector import _inspector, Event, EventBlock, Inspector, PerfData
-from executorch.sdk.inspector._inspector import (
+from executorch.devtools import generate_etrecord, parse_etrecord
+from executorch.devtools.debug_format.et_schema import OperatorNode
+from executorch.devtools.etdump.schema_flatcc import ProfileEvent
+from executorch.devtools.etrecord.tests.etrecord_test import TestETRecord
+
+from executorch.devtools.inspector import (
+    _inspector,
+    Event,
+    EventBlock,
+    Inspector,
+    PerfData,
+)
+from executorch.devtools.inspector._inspector import (
     DebugEventSignature,
     flatcc,
     InstructionEvent,
@@ -29,6 +34,8 @@
     ProfileEventSignature,
 )
 
+from executorch.exir import ExportedProgram
+
 
 OP_TYPE = "aten::add"
 EVENT_BLOCK_NAME = "block_0"
diff --git a/sdk/inspector/tests/inspector_utils_test.py b/devtools/inspector/tests/inspector_utils_test.py
similarity index 94%
rename from sdk/inspector/tests/inspector_utils_test.py
rename to devtools/inspector/tests/inspector_utils_test.py
index b5b9b54d6c..d853732fcc 100644
--- a/sdk/inspector/tests/inspector_utils_test.py
+++ b/devtools/inspector/tests/inspector_utils_test.py
@@ -10,19 +10,19 @@
 
 import torch
 
-from executorch.sdk import generate_etrecord, parse_etrecord
+from executorch.devtools import generate_etrecord, parse_etrecord
 
-from executorch.sdk.debug_format.base_schema import (
+from executorch.devtools.debug_format.base_schema import (
     OperatorGraph,
     OperatorNode,
     ValueNode,
 )
 
-from executorch.sdk.debug_format.et_schema import FXOperatorGraph
-from executorch.sdk.etdump import schema_flatcc as flatcc
+from executorch.devtools.debug_format.et_schema import FXOperatorGraph
+from executorch.devtools.etdump import schema_flatcc as flatcc
 
-from executorch.sdk.etrecord.tests.etrecord_test import TestETRecord
-from executorch.sdk.inspector._inspector_utils import (
+from executorch.devtools.etrecord.tests.etrecord_test import TestETRecord
+from executorch.devtools.inspector._inspector_utils import (
     create_debug_handle_to_op_node_mapping,
     EDGE_DIALECT_GRAPH_KEY,
     find_populated_event,
diff --git a/sdk/size_analysis_tool/TARGETS b/devtools/size_analysis_tool/TARGETS
similarity index 86%
rename from sdk/size_analysis_tool/TARGETS
rename to devtools/size_analysis_tool/TARGETS
index 44ae0aa6f8..c365ba152d 100644
--- a/sdk/size_analysis_tool/TARGETS
+++ b/devtools/size_analysis_tool/TARGETS
@@ -12,9 +12,9 @@ python_library(
     visibility = ["PUBLIC"],
     deps = [
         "//caffe2:torch",
+        "//executorch/devtools:lib",
         "//executorch/exir:lib",
         "//executorch/exir/backend:backend_api",
-        "//executorch/sdk:lib",
     ],
 )
 
@@ -23,13 +23,13 @@ python_binary(
     srcs = [
         "size_analysis_tool.py",
     ],
-    main_function = "executorch.sdk.size_analysis_tool.size_analysis_tool.main",
+    main_function = "executorch.devtools.size_analysis_tool.size_analysis_tool.main",
     visibility = ["PUBLIC"],
     deps = [
         "//caffe2:torch",
+        "//executorch/devtools:lib",
         "//executorch/exir:lib",
         "//executorch/exir/backend:backend_api",
-        "//executorch/sdk:lib",
     ],
 )
 
@@ -43,9 +43,9 @@ python_unittest(
         "//caffe2:torch",
         "//executorch/backends/xnnpack/partition:xnnpack_partitioner",
         "//executorch/backends/xnnpack/utils:xnnpack_utils",
+        "//executorch/devtools:lib",
         "//executorch/exir:lib",
         "//executorch/exir/backend:backend_api",
         "//executorch/exir/passes:spec_prop_pass",
-        "//executorch/sdk:lib",
     ],
 )
diff --git a/sdk/size_analysis_tool/size_analysis_tool.py b/devtools/size_analysis_tool/size_analysis_tool.py
similarity index 99%
rename from sdk/size_analysis_tool/size_analysis_tool.py
rename to devtools/size_analysis_tool/size_analysis_tool.py
index d17ec5ac47..8ea8ddbbf4 100644
--- a/sdk/size_analysis_tool/size_analysis_tool.py
+++ b/devtools/size_analysis_tool/size_analysis_tool.py
@@ -9,10 +9,10 @@
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import torch
+from executorch.devtools import parse_etrecord
 
 from executorch.exir import ExportedProgram
 from executorch.exir.backend.backend_api import LoweredBackendModule
-from executorch.sdk import parse_etrecord
 
 
 def _get_tensor_data(node: torch.fx.Node, tensor: torch.Tensor) -> Dict[str, Any]:
diff --git a/sdk/size_analysis_tool/size_analysis_tool_test.py b/devtools/size_analysis_tool/size_analysis_tool_test.py
similarity index 98%
rename from sdk/size_analysis_tool/size_analysis_tool_test.py
rename to devtools/size_analysis_tool/size_analysis_tool_test.py
index 3e1efec77b..96feae7e42 100644
--- a/sdk/size_analysis_tool/size_analysis_tool_test.py
+++ b/devtools/size_analysis_tool/size_analysis_tool_test.py
@@ -14,12 +14,12 @@
     get_xnnpack_executorch_backend_config,
 )
 from executorch.backends.xnnpack.utils.utils import capture_graph_for_xnnpack
-from executorch.exir.backend.backend_api import to_backend, validation_disabled
-from executorch.exir.passes.spec_prop_pass import SpecPropPass
 
-from executorch.sdk.size_analysis_tool.size_analysis_tool import (
+from executorch.devtools.size_analysis_tool.size_analysis_tool import (
     generate_model_size_information,
 )
+from executorch.exir.backend.backend_api import to_backend, validation_disabled
+from executorch.exir.passes.spec_prop_pass import SpecPropPass
 
 
 class SizeAnalysisToolTest(unittest.TestCase):
diff --git a/sdk/targets.bzl b/devtools/targets.bzl
similarity index 100%
rename from sdk/targets.bzl
rename to devtools/targets.bzl
diff --git a/docs/source/extension-module.md b/docs/source/extension-module.md
index 9e236e8e48..97528c9540 100644
--- a/docs/source/extension-module.md
+++ b/docs/source/extension-module.md
@@ -132,7 +132,7 @@ Use [ExecuTorch Dump](sdk-etdump.md) to trace model execution. Create an instanc
 #include <fstream>
 #include <memory>
 #include <executorch/extension/module/module.h>
-#include <executorch/sdk/etdump/etdump_flatcc.h>
+#include <executorch/devtools/etdump/etdump_flatcc.h>
 
 using namespace ::torch::executor;
 
diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md
index 5fffb7e8ca..6d79e1e0fd 100644
--- a/docs/source/llm/getting-started.md
+++ b/docs/source/llm/getting-started.md
@@ -763,7 +763,7 @@ In your export script, after calling `to_edge()` and `to_executorch()`, call `ge
 
 ```
 import copy
-from executorch.sdk import generate_etrecord
+from executorch.devtools import generate_etrecord
 
 # Make the deep copy immediately after to to_edge()
 edge_manager_copy = copy.deepcopy(edge_manager)
@@ -784,7 +784,7 @@ Include the ETDump header in your code.
 ```cpp
 // main.cpp
 
-#include <executorch/sdk/etdump/etdump_flatcc.h>
+#include <executorch/devtools/etdump/etdump_flatcc.h>
 ```
 
 Create an Instance of the ETDumpGen class and pass it to the Module constructor.
@@ -835,7 +835,7 @@ Run the runner, you will see “etdump.etdp” generated.
 Once you’ve collected debug artifacts ETDump (and optionally an ETRecord), you can use the Inspector API to view performance information.
 
 ```python
-from executorch.sdk import Inspector
+from executorch.devtools import Inspector
 
 inspector = Inspector(etdump_path="etdump.etdp")
 # If you also generated an ETRecord, then pass that in as well: `inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")`
diff --git a/docs/source/sdk-bundled-io.md b/docs/source/sdk-bundled-io.md
index 33deae3904..288fce93df 100644
--- a/docs/source/sdk-bundled-io.md
+++ b/docs/source/sdk-bundled-io.md
@@ -28,7 +28,7 @@ In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTest
 :::{dropdown} `MethodTestCase`
 
 ```{eval-rst}
-.. autofunction:: executorch.sdk.bundled_program.config.MethodTestCase.__init__
+.. autofunction:: executorch.devtools.bundled_program.config.MethodTestCase.__init__
     :noindex:
 ```
 :::
@@ -38,7 +38,7 @@ In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTest
 :::{dropdown} `MethodTestSuite`
 
 ```{eval-rst}
-.. autofunction:: executorch.sdk.bundled_program.config.MethodTestSuite
+.. autofunction:: executorch.devtools.bundled_program.config.MethodTestSuite
     :noindex:
 ```
 :::
@@ -48,13 +48,13 @@ Since each model may have multiple inference methods, we need to generate `List[
 
 ### Step 3: Generate `BundledProgram`
 
-We provide `BundledProgram` class under `executorch/sdk/bundled_program/core.py` to bundled the `ExecutorchProgram`-like variable, including
+We provide the `BundledProgram` class under `executorch/devtools/bundled_program/core.py` to bundle the `ExecutorchProgram`-like variable, including
                             `ExecutorchProgram`, `MultiMethodExecutorchProgram` or `ExecutorchProgramManager`, with the `List[MethodTestSuite]`:
 
 :::{dropdown} `BundledProgram`
 
 ```{eval-rst}
-.. autofunction:: executorch.sdk.bundled_program.core.BundledProgram.__init__
+.. autofunction:: executorch.devtools.bundled_program.core.BundledProgram.__init__
     :noindex:
 ```
 :::
@@ -65,18 +65,18 @@ Construtor of `BundledProgram `will do sannity check internally to see if the gi
 
 ### Step 4: Serialize `BundledProgram` to Flatbuffer.
 
-To serialize `BundledProgram` to make runtime APIs use it, we provide two APIs, both under `executorch/sdk/bundled_program/serialize/__init__.py`.
+To serialize `BundledProgram` to make runtime APIs use it, we provide two APIs, both under `executorch/devtools/bundled_program/serialize/__init__.py`.
 
 :::{dropdown} Serialize and Deserialize
 
 ```{eval-rst}
-.. currentmodule:: executorch.sdk.bundled_program.serialize
+.. currentmodule:: executorch.devtools.bundled_program.serialize
 .. autofunction:: serialize_from_bundled_program_to_flatbuffer
     :noindex:
 ```
 
 ```{eval-rst}
-.. currentmodule:: executorch.sdk.bundled_program.serialize
+.. currentmodule:: executorch.devtools.bundled_program.serialize
 .. autofunction:: deserialize_from_flatbuffer_to_bundled_program
     :noindex:
 ```
@@ -90,10 +90,10 @@ Here is a flow highlighting how to generate a `BundledProgram` given a PyTorch m
 import torch
 
 from executorch.exir import to_edge
-from executorch.sdk import BundledProgram
+from executorch.devtools import BundledProgram
 
-from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite
-from executorch.sdk.bundled_program.serialize import (
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
+from executorch.devtools.bundled_program.serialize import (
     serialize_from_bundled_program_to_flatbuffer,
 )
 from torch._export import capture_pre_autograd_graph
@@ -187,7 +187,7 @@ with open(save_path, "wb") as f:
 We can also regenerate `BundledProgram` from flatbuffer file if needed:
 
 ```python
-from executorch.sdk.bundled_program.serialize import deserialize_from_flatbuffer_to_bundled_program
+from executorch.devtools.bundled_program.serialize import deserialize_from_flatbuffer_to_bundled_program
 save_path = "bundled_program.bpte"
 with open(save_path, "rb") as f:
     serialized_bundled_program = f.read()
@@ -313,9 +313,9 @@ Here's the example of the dtype of test input not meet model's requirement:
 import torch
 
 from executorch.exir import to_edge
-from executorch.sdk import BundledProgram
+from executorch.devtools import BundledProgram
 
-from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
 from torch.export import export
 
 
@@ -400,7 +400,7 @@ Cell In[1], line 72
      68 ]
      70 # Step 3: Generate BundledProgram
 ---> 72 bundled_program = create_bundled_program(program, method_test_suites)
-File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites)
+File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites)
     264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together.
     265
     266 Args:
@@ -411,7 +411,7 @@ File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(prog
 --> 276 assert_valid_bundle(program, method_test_suites)
     278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = []
     280 # Emit data and metadata of bundled tensor
-File /executorch/sdk/bundled_program/core.py:219, in assert_valid_bundle(program, method_test_suites)
+File /executorch/devtools/bundled_program/core.py:219, in assert_valid_bundle(program, method_test_suites)
     215 # type of tensor input should match execution plan
     216 if type(cur_plan_test_inputs[j]) == torch.Tensor:
     217     # pyre-fixme[16]: Undefined attribute [16]: Item `bool` of `typing.Union[bool, float, int, torch._tensor.Tensor]`
@@ -449,9 +449,9 @@ Another common error would be the method name in any `MethodTestSuite` does not
 import torch
 
 from executorch.exir import to_edge
-from executorch.sdk import BundledProgram
+from executorch.devtools import BundledProgram
 
-from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
 from torch.export import export
 
 
@@ -532,7 +532,7 @@ Cell In[3], line 73
      70 method_test_suites[0].method_name = "MISSING_METHOD_NAME"
      72 # Generate BundledProgram
 ---> 73 bundled_program = create_bundled_program(program, method_test_suites)
-File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites)
+File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites)
     264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together.
     265
     266 Args:
@@ -543,7 +543,7 @@ File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(prog
 --> 276 assert_valid_bundle(program, method_test_suites)
     278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = []
     280 # Emit data and metadata of bundled tensor
-File /executorch/sdk/bundled_program/core.py:141, in assert_valid_bundle(program, method_test_suites)
+File /executorch/devtools/bundled_program/core.py:141, in assert_valid_bundle(program, method_test_suites)
     138 method_name_of_program = {e.name for e in program.execution_plan}
     139 method_name_of_test_suites = {t.method_name for t in method_test_suites}
 --> 141 assert method_name_of_test_suites.issubset(
diff --git a/docs/source/sdk-debugging.md b/docs/source/sdk-debugging.md
index 45e50b44e8..14d4af0f15 100644
--- a/docs/source/sdk-debugging.md
+++ b/docs/source/sdk-debugging.md
@@ -38,7 +38,7 @@ For a real example reflecting the steps below, please refer to [sdk_example_runn
 Once a model has been run, using the generated ETDump and debug buffers, users can leverage the [Inspector API's](./sdk-inspector.rst) to inspect these debug outputs.
 
 ```python
-from executorch.sdk import Inspector
+from executorch.devtools import Inspector
 
 # Create an Inspector instance with etdump and the debug buffer.
 inspector = Inspector(etdump_path=etdump_path,
@@ -67,7 +67,7 @@ We've also provided a simple set of utilities that let users perform quality ana
 
 
 ```python
-from executorch.sdk.inspector._inspector_utils import compare_results
+from executorch.devtools.inspector._inspector_utils import compare_results
 
 # Run a simple quality analysis between the model outputs sourced from the
 # runtime and a set of reference outputs.
diff --git a/docs/source/sdk-etdump.md b/docs/source/sdk-etdump.md
index 4eacb18b14..aad623efc8 100644
--- a/docs/source/sdk-etdump.md
+++ b/docs/source/sdk-etdump.md
@@ -9,7 +9,7 @@ Generating an ETDump is a relatively straightforward process. Users can follow t
 
 1. ***Include*** the ETDump header in your code.
 ```C++
-#include <executorch/sdk/etdump/etdump_flatcc.h>
+#include <executorch/devtools/etdump/etdump_flatcc.h>
 ```
 
 2. ***Create*** an Instance of the ETDumpGen class and pass it into the `load_method` call that is invoked in the runtime.
diff --git a/docs/source/sdk-etrecord.rst b/docs/source/sdk-etrecord.rst
index 43ed5095c6..b3b7f042cc 100644
--- a/docs/source/sdk-etrecord.rst
+++ b/docs/source/sdk-etrecord.rst
@@ -31,7 +31,7 @@ they are interested in working with via our tooling.
 .. warning::
     Users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process.
 
-.. currentmodule:: executorch.sdk.etrecord._etrecord
+.. currentmodule:: executorch.devtools.etrecord._etrecord
 .. autofunction:: generate_etrecord
 
 Using an ``ETRecord``
diff --git a/docs/source/sdk-inspector.rst b/docs/source/sdk-inspector.rst
index e15c1f2a39..448f30cfb5 100644
--- a/docs/source/sdk-inspector.rst
+++ b/docs/source/sdk-inspector.rst
@@ -26,26 +26,26 @@ Inspector Methods
 Constructor
 ~~~~~~~~~~~
 
-.. autofunction:: executorch.sdk.Inspector.__init__
+.. autofunction:: executorch.devtools.Inspector.__init__
 
 **Example Usage:**
 
 .. code:: python
 
-    from executorch.sdk import Inspector
+    from executorch.devtools import Inspector
 
     inspector = Inspector(etdump_path="/path/to/etdump.etdp", etrecord="/path/to/etrecord.bin")
 
 to_dataframe
 ~~~~~~~~~~~~~~~~
 
-.. autofunction:: executorch.sdk.Inspector.to_dataframe
+.. autofunction:: executorch.devtools.Inspector.to_dataframe
 
 
 print_data_tabular
 ~~~~~~~~~~~~~~~~~~
 
-.. autofunction:: executorch.sdk.Inspector.print_data_tabular
+.. autofunction:: executorch.devtools.Inspector.print_data_tabular
 
 .. _example-usage-1:
 
@@ -62,7 +62,7 @@ Note that the unit of delegate profiling events is "cycles". We're working on pr
 find_total_for_module
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autofunction:: executorch.sdk.Inspector.find_total_for_module
+.. autofunction:: executorch.devtools.Inspector.find_total_for_module
 
 .. _example-usage-2:
 
@@ -80,7 +80,7 @@ find_total_for_module
 get_exported_program
 ~~~~~~~~~~~~~~~~~~~~
 
-.. autofunction:: executorch.sdk.Inspector.get_exported_program
+.. autofunction:: executorch.devtools.Inspector.get_exported_program
 
 .. _example-usage-3:
 
@@ -119,7 +119,7 @@ of an ``Inspector`` instance, for example:
 
     inspector.event_blocks
 
-.. autoclass:: executorch.sdk.inspector.EventBlock
+.. autoclass:: executorch.devtools.inspector.EventBlock
 
 ``Event`` Class
 ~~~~~~~~~~~~~~~
@@ -127,7 +127,7 @@ of an ``Inspector`` instance, for example:
 Access ``Event`` instances through the ``events`` attribute of an
 ``EventBlock`` instance.
 
-.. autoclass:: executorch.sdk.inspector.Event
+.. autoclass:: executorch.devtools.inspector.Event
 
 **Example Usage:**
 
@@ -152,7 +152,7 @@ table. This command produces the identical table output as calling the
 
 .. code:: bash
 
-    python3 -m sdk.inspector.inspector_cli --etdump_path <path_to_etdump> --etrecord_path <path_to_etrecord>
+    python3 -m devtools.inspector.inspector_cli --etdump_path <path_to_etdump> --etrecord_path <path_to_etrecord>
 
 Note that the `etrecord_path` argument is optional.
 
diff --git a/docs/source/tutorials_source/sdk-integration-tutorial.py b/docs/source/tutorials_source/sdk-integration-tutorial.py
index ccc2e480ad..35d200204c 100644
--- a/docs/source/tutorials_source/sdk-integration-tutorial.py
+++ b/docs/source/tutorials_source/sdk-integration-tutorial.py
@@ -38,9 +38,9 @@
 #
 # The first step is to generate an ``ETRecord``. ``ETRecord`` contains model
 # graphs and metadata for linking runtime results (such as profiling) to
-# the eager model. This is generated via ``executorch.sdk.generate_etrecord``.
+# the eager model. This is generated via ``executorch.devtools.generate_etrecord``.
 #
-# ``executorch.sdk.generate_etrecord`` takes in an output file path (str), the
+# ``executorch.devtools.generate_etrecord`` takes in an output file path (str), the
 # edge dialect model (``EdgeProgramManager``), the ExecuTorch dialect model
 # (``ExecutorchProgramManager``), and an optional dictionary of additional models.
 #
@@ -51,6 +51,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from executorch.devtools import generate_etrecord
 
 from executorch.exir import (
     EdgeCompileConfig,
@@ -58,7 +59,6 @@
     ExecutorchProgramManager,
     to_edge,
 )
-from executorch.sdk import generate_etrecord
 from torch.export import export, ExportedProgram
 
 
@@ -129,14 +129,14 @@ def forward(self, x):
 # In this tutorial, a `Bundled Program` is created from the example model above.
 
 import torch
+from executorch.devtools import BundledProgram
 
-from executorch.exir import to_edge
-from executorch.sdk import BundledProgram
-
-from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite
-from executorch.sdk.bundled_program.serialize import (
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
+from executorch.devtools.bundled_program.serialize import (
     serialize_from_bundled_program_to_flatbuffer,
 )
+
+from executorch.exir import to_edge
 from torch.export import export
 
 # Step 1: ExecuTorch Program Export
@@ -188,7 +188,7 @@ def forward(self, x):
 #
 # To visualize all runtime events, call Inspector's ``print_data_tabular``.
 
-from executorch.sdk import Inspector
+from executorch.devtools import Inspector
 
 # sphinx_gallery_start_ignore
 inspector_patch = patch.object(Inspector, "__init__", return_value=None)
diff --git a/docs/website/docs/tutorials/bundled_program.md b/docs/website/docs/tutorials/bundled_program.md
index ac67d6f628..fb119df731 100644
--- a/docs/website/docs/tutorials/bundled_program.md
+++ b/docs/website/docs/tutorials/bundled_program.md
@@ -122,7 +122,7 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput(
 
 ### Example
 
-Here we provide an example about how to run the bundled program step by step. Most of the code are borrowed from "fbcode/executorch/sdk/fb/runners/executor_runner.cpp" and please review that file if you need more info and context:
+Here we provide an example of how to run the bundled program step by step. Most of the code is borrowed from "fbcode/executorch/devtools/fb/runners/executor_runner.cpp"; please review that file if you need more info and context:
 
 ```c++
     // method_name is the name for the method we want to test
diff --git a/examples/apple/coreml/executor_runner/main.mm b/examples/apple/coreml/executor_runner/main.mm
index 4cc21ba30a..2475d68fa9 100644
--- a/examples/apple/coreml/executor_runner/main.mm
+++ b/examples/apple/coreml/executor_runner/main.mm
@@ -13,7 +13,7 @@
 #import <executorch/runtime/executor/program.h>
 #import <executorch/runtime/platform/log.h>
 #import <executorch/runtime/platform/runtime.h>
-#import <executorch/sdk/etdump/etdump_flatcc.h>
+#import <executorch/devtools/etdump/etdump_flatcc.h>
 #import <executorch/util/util.h>
 #import <memory>
 #import <numeric>
diff --git a/examples/apple/coreml/scripts/build_executor_runner.sh b/examples/apple/coreml/scripts/build_executor_runner.sh
index 16c5dea02a..b57a8f12e7 100755
--- a/examples/apple/coreml/scripts/build_executor_runner.sh
+++ b/examples/apple/coreml/scripts/build_executor_runner.sh
@@ -56,7 +56,7 @@ mkdir -p "$EXECUTORCH_INCLUDE_DIR_PATH"
 find extension \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \;
 find runtime \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \;
 find util \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \;
-find sdk \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \;
+find devtools \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \;
 cp -rf "$COREML_DIR_PATH/runtime/include/" "$INCLUDE_DIR_PATH"
 
 # Copy required libraries
diff --git a/examples/apple/coreml/scripts/export.py b/examples/apple/coreml/scripts/export.py
index 4bf26a7f3e..5a8c9b227f 100644
--- a/examples/apple/coreml/scripts/export.py
+++ b/examples/apple/coreml/scripts/export.py
@@ -17,10 +17,10 @@
 from executorch.backends.apple.coreml.compiler import CoreMLBackend
 
 from executorch.backends.apple.coreml.partition import CoreMLPartitioner
+from executorch.devtools.etrecord import generate_etrecord
 from executorch.exir import to_edge
 
 from executorch.exir.backend.backend_api import to_backend
-from executorch.sdk.etrecord import generate_etrecord
 from torch.export import export
 
 REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent.parent
diff --git a/examples/apple/coreml/scripts/inspector_cli.py b/examples/apple/coreml/scripts/inspector_cli.py
index 768465f770..e0b81d4aff 100644
--- a/examples/apple/coreml/scripts/inspector_cli.py
+++ b/examples/apple/coreml/scripts/inspector_cli.py
@@ -8,8 +8,8 @@
 
 from pathlib import Path
 
-from executorch.sdk import Inspector
-from executorch.sdk.inspector._inspector_utils import compare_results
+from executorch.devtools import Inspector
+from executorch.devtools.inspector._inspector_utils import compare_results
 
 
 def get_root_dir_path() -> Path:
diff --git a/examples/apple/coreml/scripts/inspector_utils.py b/examples/apple/coreml/scripts/inspector_utils.py
index 1736c2cefb..c5674ec520 100644
--- a/examples/apple/coreml/scripts/inspector_utils.py
+++ b/examples/apple/coreml/scripts/inspector_utils.py
@@ -20,6 +20,13 @@
 from executorch.backends.apple.coreml.compiler import CoreMLBackend
 from executorch.backends.apple.coreml.partition import CoreMLPartitioner
 
+from executorch.devtools import BundledProgram, generate_etrecord, Inspector
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
+from executorch.devtools.bundled_program.serialize import (
+    serialize_from_bundled_program_to_flatbuffer,
+)
+from executorch.devtools.inspector import Event
+
 from executorch.exir import (
     EdgeProgramManager,
     ExecutorchBackendConfig,
@@ -30,14 +37,6 @@
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.tracer import Value
 
-from executorch.sdk import BundledProgram, generate_etrecord, Inspector
-
-from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite
-from executorch.sdk.bundled_program.serialize import (
-    serialize_from_bundled_program_to_flatbuffer,
-)
-from executorch.sdk.inspector import Event
-
 from torch.export import export, ExportedProgram
 
 COREML_METADATA_KEYS: Final[List[Tuple[str, str]]] = [
diff --git a/examples/apple/mps/CMakeLists.txt b/examples/apple/mps/CMakeLists.txt
index d1dd8e93d7..319d8159ce 100644
--- a/examples/apple/mps/CMakeLists.txt
+++ b/examples/apple/mps/CMakeLists.txt
@@ -92,8 +92,8 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
   include(${EXECUTORCH_SRCS_FILE})
   target_include_directories(
     bundled_program
-    INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../../sdk/include
-              ${CMAKE_CURRENT_BINARY_DIR}/../../../sdk/bundled_program
+    INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../../devtools/include
+              ${CMAKE_CURRENT_BINARY_DIR}/../../../devtools/bundled_program
               ${EXECUTORCH_ROOT}/third-party/flatbuffers/include
               ${EXECUTORCH_ROOT}/third-party/flatcc/include
               ${_mps_schema_headers}
diff --git a/examples/apple/mps/executor_runner/mps_executor_runner.mm b/examples/apple/mps/executor_runner/mps_executor_runner.mm
index 604419a620..040b2fcd99 100644
--- a/examples/apple/mps/executor_runner/mps_executor_runner.mm
+++ b/examples/apple/mps/executor_runner/mps_executor_runner.mm
@@ -30,8 +30,8 @@
 #include <executorch/runtime/platform/profiler.h>
 #include <executorch/runtime/platform/runtime.h>
 #include <executorch/runtime/platform/runtime.h>
-#include <executorch/sdk/bundled_program/bundled_program.h>
-#include <executorch/sdk/etdump/etdump_flatcc.h>
+#include <executorch/devtools/bundled_program/bundled_program.h>
+#include <executorch/devtools/etdump/etdump_flatcc.h>
 
 #include <chrono>
 using namespace std::chrono;
diff --git a/examples/apple/mps/executor_runner/targets.bzl b/examples/apple/mps/executor_runner/targets.bzl
index fd0a7a5046..14399411ae 100644
--- a/examples/apple/mps/executor_runner/targets.bzl
+++ b/examples/apple/mps/executor_runner/targets.bzl
@@ -28,9 +28,9 @@ def define_common_targets():
                 "//executorch/extension/data_loader:file_data_loader",
                 "//executorch/kernels/portable:generated_lib",
                 "//executorch/extension/data_loader:file_data_loader",
-                "//executorch/sdk/etdump:etdump_flatcc",
+                "//executorch/devtools/etdump:etdump_flatcc",
                 "//executorch/extension/data_loader:buffer_data_loader",
-                "//executorch/sdk/bundled_program:runtime",
+                "//executorch/devtools/bundled_program:runtime",
             ],
             external_deps = [
                 "gflags",
diff --git a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py
index e561afb185..636444e2b7 100644
--- a/examples/apple/mps/scripts/mps_example.py
+++ b/examples/apple/mps/scripts/mps_example.py
@@ -14,6 +14,11 @@
 from executorch import exir
 from executorch.backends.apple.mps import MPSBackend
 from executorch.backends.apple.mps.partition import MPSPartitioner
+from executorch.devtools import BundledProgram, generate_etrecord
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
+from executorch.devtools.bundled_program.serialize import (
+    serialize_from_bundled_program_to_flatbuffer,
+)
 
 from executorch.exir import (
     EdgeCompileConfig,
@@ -24,11 +29,6 @@
 from executorch.exir.backend.backend_details import CompileSpec
 from executorch.exir.capture._config import ExecutorchBackendConfig
 from executorch.extension.export_util.utils import export_to_edge, save_pte_program
-from executorch.sdk import BundledProgram, generate_etrecord
-from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite
-from executorch.sdk.bundled_program.serialize import (
-    serialize_from_bundled_program_to_flatbuffer,
-)
 
 from ....models import MODEL_NAME_TO_MODEL
 from ....models.model_factory import EagerModelFactory
diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
index e3a74456b3..fd5cdc7117 100644
--- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
+++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
@@ -94,7 +94,7 @@
 		03729ED52BB1F8DE00152F2E /* LLaMARunner.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = LLaMARunner.framework; sourceTree = BUILT_PRODUCTS_DIR; };
 		03729F072BB203B300152F2E /* runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = runner.cpp; sourceTree = "<group>"; };
 		03729F082BB203B300152F2E /* runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = runner.h; sourceTree = "<group>"; };
-		03729F092BB203B300152F2E /* util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = util.h; sourceTree = "<group>"; };
+		03729F092BB203B300152F2E /* util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = util.h; path = ../../../../extension/llm/runner/util.h; sourceTree = "<group>"; };
 		03729F102BB2042B00152F2E /* sampler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sampler.h; sourceTree = "<group>"; };
 		03729F112BB2042B00152F2E /* sampler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sampler.cpp; sourceTree = "<group>"; };
 		03729F142BB2043600152F2E /* bpe_tokenizer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = bpe_tokenizer.cpp; path = ../../../../extension/llm/tokenizer/bpe_tokenizer.cpp; sourceTree = "<group>"; };
@@ -264,7 +264,7 @@
 				03729F102BB2042B00152F2E /* sampler.h */,
 			);
 			name = sampler;
-			path = ../../../../../models/llama2/sampler;
+			path = ../../../../../../extension/llm/sampler;
 			sourceTree = "<group>";
 		};
 /* End PBXGroup section */
diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask.cpp b/examples/models/flamingo/cross_attention/cross_attention_mask.cpp
new file mode 100644
index 0000000000..b2a2a6a806
--- /dev/null
+++ b/examples/models/flamingo/cross_attention/cross_attention_mask.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <algorithm>
+#include <string>
+
+#include <executorch/examples/models/flamingo/cross_attention/cross_attention_mask.h>
+#include <executorch/extension/runner_util/managed_tensor.h>
+
+namespace torch::executor {
+
+// Forward declarations needed for ARM compilers.
+int32_t safe_size_t_to_sizes_type(size_t value);
+std::vector<std::vector<int>> _get_image_attention_intervals(
+    const std::vector<int>& tokens,
+    int image_token_id);
+
+int32_t safe_size_t_to_sizes_type(size_t value) {
+  if (value >
+      static_cast<size_t>(std::numeric_limits<TensorImpl::SizesType>::max())) {
+    throw std::overflow_error(
+        "size_t value too large for TensorImpl::SizesType");
+  }
+  return static_cast<TensorImpl::SizesType>(value);
+}
+
+/**
+ * Returns a list of lists of the form [start, end) where start is the index
+ * of the current image token and end is the index of the next image token,
+ * exclusive.
+ *
+ * Example:
+ *     >>> text = "<img1><img2>These are two dogs. <img3>This is a cat."
+ *     >>> int image_token_id = 1;
+ *     >>> std::vector<int> tokens = {1, 1, 9673, 527, 1403, 12875, 13, 1, 1115,
+ * 374, 264, 8415};
+ *     >>> transform = VisionCrossAttentionMask(tile_size=400, patch_size=40,
+ * image_token_id=1)
+ *     >>> intervals = _get_image_attention_intervals(tokens, image_token_id)
+ *     [[0, 7], [1, 7], [7, 12]]
+ *
+ * @param tokens List of token IDs in the text sequence.
+ * @param image_token_id The value of the image token.
+ *
+ * @returns Vector of vectors of the form [start, end) indicating the range of
+ * positions in the text sequence that should attend to the image.
+ */
+std::vector<std::vector<int>> _get_image_attention_intervals(
+    const std::vector<int>& tokens,
+    int image_token_id) {
+  std::vector<std::vector<int>> vision_masks;
+  int end = tokens.size();
+  std::vector<int> vision_token_locations;
+
+  // Find all vision token locations.
+  for (int i = 0; i < tokens.size(); ++i) {
+    if (tokens[i] == image_token_id) {
+      vision_token_locations.push_back(i);
+    }
+  }
+
+  // Return empty vector if there are no images.
+  if (vision_token_locations.empty()) {
+    return vision_masks;
+  }
+
+  // If there is only one image, it will attend to subsequent text until end.
+  if (vision_token_locations.size() == 1) {
+    vision_masks.push_back({vision_token_locations[0], end});
+    return vision_masks;
+  }
+
+  // Construct intervals from previous image token to next image token.
+  for (int i = 0; i < vision_token_locations.size() - 1; ++i) {
+    vision_masks.push_back(
+        {vision_token_locations[i], vision_token_locations[i + 1]});
+  }
+
+  // Last image will attend to subsequent text until end.
+  vision_masks.push_back({vision_token_locations.back(), end});
+
+  // If there are consecutive vision tokens, they should all attend to the
+  // same subsequent text.
+  int last_mask_end = vision_masks.back()[1];
+  for (auto it = vision_masks.rbegin(); it != vision_masks.rend(); ++it) {
+    if ((*it)[0] == (*it)[1] - 1) {
+      (*it)[1] = last_mask_end;
+    }
+    last_mask_end = (*it)[1];
+  }
+
+  return vision_masks;
+}
+
+std::vector<ManagedTensor> cross_attention_mask(
+    const std::vector<int>& tokens,
+    const std::vector<Tensor>& images,
+    size_t tile_size,
+    size_t patch_size,
+    int image_token_id,
+    std::vector<std::vector<int>>& out) {
+  size_t patch_grid_size = tile_size / patch_size;
+  size_t patches_per_tile = patch_grid_size * patch_grid_size;
+
+  std::vector<std::vector<int>> image_intervals =
+      _get_image_attention_intervals(tokens, image_token_id);
+
+  if (image_intervals.size() != images.size()) {
+    throw std::runtime_error(
+        "The number of image tokens (" +
+        std::to_string(image_intervals.size()) +
+        ") does not match the number of images (" +
+        std::to_string(images.size()) + ")");
+  }
+
+  // Create mask for each individual image based on its number of tokens,
+  // which can vary based on number of tiles since they are not yet tile padded.
+  // The masks are padded and concatenated together in the batch collator.
+  std::vector<ManagedTensor> cross_attention_masks;
+  size_t text_seq_len = tokens.size();
+  for (size_t image_idx = 0; image_idx < image_intervals.size(); ++image_idx) {
+    size_t n_tiles = images[image_idx].size(0);
+    size_t image_seq_len =
+        n_tiles * (patches_per_tile + 1); // +1 for the CLS token.
+
+    // Mask will be block of 1s at the corresponding interval in the text.
+    // It is not a causal block because all the image tokens correspond
+    // to a single image, so text tokens attend to all the image's tokens.
+    std::vector<TensorImpl::SizesType> sizes = {
+        safe_size_t_to_sizes_type(text_seq_len),
+        safe_size_t_to_sizes_type(image_seq_len)};
+
+    // Allocate the underlying data to be handled by the managed tensor.
+    size_t num_elements = text_seq_len * image_seq_len;
+    size_t stride = image_seq_len;
+    std::vector<int> mask_data(num_elements);
+
+    ManagedTensor mask(mask_data.data(), sizes, ScalarType::Int);
+    cross_attention_masks.emplace_back(std::move(mask));
+
+    // Add the allocated data to the output vector.
+    out.emplace_back(std::move(mask_data));
+
+    // All rows of tensor in the text_seq_len dimension within the interval are
+    // set to 1 (true).
+    size_t start = image_intervals[image_idx][0];
+    size_t end = image_intervals[image_idx][1]; // End is exclusive.
+    for (size_t i = start; i < end; ++i) {
+      for (size_t j = 0; j < image_seq_len; ++j) {
+        size_t unrolled_index = i * image_seq_len + j;
+        if (unrolled_index >= out[image_idx].size()) {
+          throw std::out_of_range(
+              "Index " + std::to_string(unrolled_index) +
+              " out of range of output tensor.");
+        }
+        out[image_idx][i * stride + j] = 1;
+      }
+    }
+  }
+
+  return cross_attention_masks;
+}
+
+} // namespace torch::executor
diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask.h b/examples/models/flamingo/cross_attention/cross_attention_mask.h
new file mode 100644
index 0000000000..6998d91ad4
--- /dev/null
+++ b/examples/models/flamingo/cross_attention/cross_attention_mask.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/extension/runner_util/managed_tensor.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+
+#include <vector>
+
+namespace torch {
+namespace executor {
+
+/**
+ * Computes the cross-attention mask for text + image inputs. Text tokens that
+ * participate in cross-attention with an image token will show True in the mask
+ * and follow the interleaved structure laid out in Fig. 7 of the Flamingo paper
+ * (https://arxiv.org/pdf/2204.14198):
+ *
+ *     (1) Text tokens immediately following an image token, until the next one.
+ *     (2) Consecutive image tokens attend to subsequent text tokens.
+ *
+ * ::
+ *
+ *           ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐
+ *      img1 │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │   │ │   │ │   │ │   │ │   │
+ *           └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘
+ *           ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐
+ *      img2 │   │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │   │ │   │ │   │ │   │ │   │
+ *           └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘
+ *           ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐
+ *      img3 │   │ │   │ │   │ │   │ │   │ │   │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │
+ *           └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘
+ *         <img1> <img2>These  are   two  dogs. <img3> This   is    a    cat.
+ *
+ *
+ *
+ * Resultant mask is constructed per image and is of shape (text_seq_len,
+ * image_seq_len), where True indicates that the token outputted from the image
+ * encoder attends to the token in the text sequence in cross-attention. A list
+ * of these masks is returned, with length equal to the number of images in the
+ * sample.
+ *
+ * @param tokens Vector of tokens participating in the cross attention.
+ * @param images Vector of images participating in the cross attention.
+ * @param tile_size The size of the image tiles from the image transform.
+ * @param patch_size The size of each patch. Used to divide the tiles into
+ * patches. E.g. for patch_size = 40, a tile of shape (400, 400) will have a
+ * 10x10 grid of patches with shape (40, 40) each.
+ * @param image_token_id Token ID of the image special token, i.e. the value
+ * in tokens that marks an image.
+ * @param out Out vector holding the raw data wrapped by the returned cross
+ * attention masks.
+ *
+ * @returns A vector of cross attention masks, as Tensors, one for each image.
+ */
+std::vector<ManagedTensor> cross_attention_mask(
+    const std::vector<int>& tokens,
+    const std::vector<Tensor>& images,
+    size_t tile_size,
+    size_t patch_size,
+    int image_token_id,
+    std::vector<std::vector<int>>& out);
+
+} // namespace executor
+} // namespace torch
diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp b/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp
new file mode 100644
index 0000000000..5b9e58c216
--- /dev/null
+++ b/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/examples/models/flamingo/cross_attention/cross_attention_mask.h>
+#include <gtest/gtest.h>
+
+using namespace ::testing;
+using torch::executor::ManagedTensor;
+using torch::executor::ScalarType;
+using torch::executor::Tensor;
+using torch::executor::TensorImpl;
+
+TEST(CrossAttentxnMaskTest, TestCrossAttentionMask) {
+  std::vector<int> tokens = {
+      1, 1, 9673, 527, 1403, 12875, 13, 1, 1115, 374, 264, 8415};
+
+  // Initialize image tensors.
+  TensorImpl::SizesType sizes[2] = {2, 2};
+  TensorImpl::DimOrderType dim_order[2] = {0, 1};
+  TensorImpl::StridesType strides[2] = {2, 1};
+
+  int32_t a_data[4] = {1, 2, 3, 4};
+  auto a_impl =
+      TensorImpl(ScalarType::Int, 2, sizes, a_data, dim_order, strides);
+  Tensor a(&a_impl);
+
+  int32_t b_data[4] = {5, 6, 7, 8};
+  auto b_impl =
+      TensorImpl(ScalarType::Int, 2, sizes, b_data, dim_order, strides);
+  Tensor b(&b_impl);
+
+  int32_t c_data[4] = {9, 10, 11, 12};
+  auto c_impl =
+      TensorImpl(ScalarType::Int, 2, sizes, c_data, dim_order, strides);
+  Tensor c(&c_impl);
+
+  std::vector<Tensor> images = {a, b, c};
+  std::vector<std::vector<int>> mask_data;
+  std::vector<ManagedTensor> output_masks =
+      torch::executor::cross_attention_mask(
+          tokens,
+          images,
+          /*tile_size=*/1,
+          /*patch_size=*/1,
+          /*image_token_id=*/1,
+          /*out=*/mask_data);
+
+  // Check contents of the mask.
+  std::vector<std::vector<size_t>> expected_intervals = {
+      {0, 7}, {1, 7}, {7, 12}};
+  for (size_t mask_idx = 0; mask_idx < output_masks.size(); ++mask_idx) {
+    ManagedTensor& output_mask = output_masks[mask_idx];
+    Tensor output_tensor = output_mask.get_aliasing_tensor();
+    for (size_t i = 0; i < output_tensor.size(0); ++i) {
+      for (size_t j = 0; j < output_tensor.strides()[0]; ++j) {
+        size_t unrolled_index = i * output_tensor.strides()[0] + j;
+        if (i >= expected_intervals[mask_idx][0] &&
+            i < expected_intervals[mask_idx][1]) {
+          EXPECT_EQ(output_tensor.const_data_ptr<int>()[unrolled_index], 1);
+        } else {
+          EXPECT_EQ(output_tensor.const_data_ptr<int>()[unrolled_index], 0);
+        }
+      }
+    }
+  }
+}
diff --git a/examples/models/flamingo/cross_attention/targets.bzl b/examples/models/flamingo/cross_attention/targets.bzl
new file mode 100644
index 0000000000..7bc13270aa
--- /dev/null
+++ b/examples/models/flamingo/cross_attention/targets.bzl
@@ -0,0 +1,25 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    runtime.cxx_library(
+        name = "cross_attention_mask",
+        srcs = ["cross_attention_mask.cpp"],
+        exported_headers = ["cross_attention_mask.h"],
+        exported_deps = [
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/extension/runner_util:managed_tensor",
+            "//executorch/runtime/core/exec_aten/util:tensor_util",
+        ],
+    )
+
+    runtime.cxx_test(
+        name = "cross_attention_mask_test",
+        srcs = ["cross_attention_mask_test.cpp"],
+        deps = [":cross_attention_mask"],
+    )
diff --git a/examples/models/llama2/TARGETS b/examples/models/llama2/TARGETS
index 9bdbff5fbb..467949a5eb 100644
--- a/examples/models/llama2/TARGETS
+++ b/examples/models/llama2/TARGETS
@@ -93,7 +93,7 @@ runtime.python_library(
         # "//executorch/extension/pybindings:aten_lib",
         # "//executorch/extension/pybindings:portable_lib",
         # "//executorch/extension/pybindings:portable_lib_plus_custom",
-        "//executorch/sdk/etrecord:etrecord",
+        "//executorch/devtools/etrecord:etrecord",
         "//executorch/util:memory_profiler",
         "//executorch/util:python_profiler",
         "fbsource//third-party/pypi/coremltools:coremltools",
diff --git a/examples/models/llama2/eval_llama.py b/examples/models/llama2/eval_llama.py
index 0495c76bbf..4daeaf7afa 100644
--- a/examples/models/llama2/eval_llama.py
+++ b/examples/models/llama2/eval_llama.py
@@ -22,6 +22,8 @@ def main() -> None:
     modelname = "llama2"
     parser = build_args_parser()
     args = parser.parse_args()
+    # Overrides this arg, because evaluation requires full logits.
+    args.generate_full_logits = True
     eval_llama(modelname, args)
 
 
diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py
index c22c0a3c3c..221f2f75bc 100644
--- a/examples/models/llama2/export_llama_lib.py
+++ b/examples/models/llama2/export_llama_lib.py
@@ -22,6 +22,8 @@
 
 import torch
 
+from executorch.devtools.etrecord import generate_etrecord
+
 from executorch.examples.models.llama2.llama_transformer import ModelArgs
 
 from executorch.extension.llm.export.builder import DType, LLMEdgeManager
@@ -40,8 +42,6 @@
     get_pt2e_quantizers,
     get_qnn_quantizer,
 )
-
-from executorch.sdk.etrecord import generate_etrecord
 from executorch.util.activation_memory_profiler import generate_memory_trace
 
 from ..model_factory import EagerModelFactory
@@ -296,6 +296,13 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="Generate the ETRecord debug artifact.",
     )
 
+    parser.add_argument(
+        "--generate_full_logits",
+        action="store_true",
+        required=False,
+        default=True,
+        help="Generate logits for all inputs.",
+    )
     return parser
 
 
@@ -405,6 +412,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:
             params_path=params_path,
             use_kv_cache=args.use_kv_cache,
             use_sdpa_with_kv_cache=args.use_sdpa_with_kv_cache,
+            generate_full_logits=args.generate_full_logits,
             weight_type=weight_type,
             enable_dynamic_shape=args.enable_dynamic_shape,
             verbose=args.verbose,
@@ -590,6 +598,7 @@ def _load_llama_model(
     params_path: str,
     use_kv_cache: bool = False,
     use_sdpa_with_kv_cache: bool = False,
+    generate_full_logits: bool = True,
     weight_type: WeightType = WeightType.LLAMA,
     enable_dynamic_shape: bool = False,
     verbose: bool = False,
@@ -616,6 +625,7 @@ def _load_llama_model(
         params=params_path,
         use_kv_cache=use_kv_cache,
         use_sdpa_with_kv_cache=use_sdpa_with_kv_cache,
+        generate_full_logits=generate_full_logits,
         fairseq2=weight_type == WeightType.FAIRSEQ2,
         max_seq_len=max_seq_len,
         enable_dynamic_shape=enable_dynamic_shape,
diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py
index 4ae12b0f64..81b47a3a5d 100644
--- a/examples/models/llama2/llama_transformer.py
+++ b/examples/models/llama2/llama_transformer.py
@@ -96,6 +96,10 @@ class ModelArgs:
     use_sdpa_with_kv_cache_op: bool = (
         False  # Use custom sdpa op that updates kv cache in-place
     )
+    # Generate logits for all inputs. Setting this to True uses much more memory
+    # at runtime, so enable it only when necessary (e.g., for perplexity tools that
+    # require logits for all input tokens).
+    generate_full_logits: bool = True
     enable_dynamic_shape: bool = False  # export model with dynamic shape support
     use_hf_rope: bool = False  # Use HuggingFace's RoPE implementation
     rope_theta: Optional[float] = (
@@ -442,6 +446,7 @@ def __init__(self, params: ModelArgs):
         self.norm = RMSNorm(params.dim, eps=params.norm_eps)
         self.output = nn.Linear(params.dim, params.vocab_size, bias=False)
         self.use_kv_cache = params.use_kv_cache
+        self.generate_full_logits = params.generate_full_logits
         self.max_seq_len = params.max_seq_len
         if params.use_hf_rope:
             self.precompute_freqs_cis = hf_precompute_freqs_cis
@@ -512,6 +517,10 @@ def forward(
                 input_pos,
             )
 
+        if not self.generate_full_logits:
+            # Only the last logit is used for the new generated token
+            h = h[:, -1, :]
+
         h = self.norm(h)
 
         logits = self.output(h)
diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py
index fdf0dc707e..b375399f33 100644
--- a/examples/models/llama2/model.py
+++ b/examples/models/llama2/model.py
@@ -61,6 +61,7 @@ def __init__(self, **kwargs):
 
         self.use_kv_cache = kwargs.get("use_kv_cache", False)
         self.use_sdpa_with_kv_cache_op = kwargs.get("use_sdpa_with_kv_cache", False)
+        self.generate_full_logits = kwargs.get("generate_full_logits", True)
         self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", False)
 
         self.max_seq_len = kwargs.get("max_seq_len", 128)
@@ -145,6 +146,7 @@ def __init__(self, **kwargs):
             max_batch_size=max_batch_size,
             use_kv_cache=self.use_kv_cache,
             use_sdpa_with_kv_cache_op=self.use_sdpa_with_kv_cache_op,
+            generate_full_logits=self.generate_full_logits,
             enable_dynamic_shape=self.enable_dynamic_shape,
             **params,
         )
diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp
index c5ce03b88d..a58fdfd5e5 100644
--- a/examples/models/llava/runner/llava_runner.cpp
+++ b/examples/models/llava/runner/llava_runner.cpp
@@ -20,6 +20,8 @@
 #include <sstream>
 #include <vector>
 
+using ::executorch::extension::llm::Stats;
+
 namespace torch::executor {
 
 bool LlavaRunner::is_loaded() {
diff --git a/examples/models/llava/runner/llava_runner.h b/examples/models/llava/runner/llava_runner.h
index d9805a0c91..13d842e30f 100644
--- a/examples/models/llava/runner/llava_runner.h
+++ b/examples/models/llava/runner/llava_runner.h
@@ -35,7 +35,8 @@ class LlavaRunner : public MultimodalRunner {
       const std::string& prompt,
       int32_t seq_len = 1024,
       std::function<void(const std::string&)> token_callback = {},
-      std::function<void(const Stats&)> stats_callback = {});
+      std::function<void(const ::executorch::extension::llm::Stats&)>
+          stats_callback = {});
 
  private:
   inline static const std::string kPresetPrompt =
diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp
index 7cd3709b95..c2a6c2c46c 100644
--- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp
+++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp
@@ -18,6 +18,7 @@
  */
 
 #include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
+#include <executorch/devtools/etdump/etdump_flatcc.h>
 #include <executorch/extension/data_loader/file_data_loader.h>
 #include <executorch/extension/runner_util/inputs.h>
 #include <executorch/runtime/core/memory_allocator.h>
@@ -26,7 +27,6 @@
 #include <executorch/runtime/platform/log.h>
 #include <executorch/runtime/platform/profiler.h>
 #include <executorch/runtime/platform/runtime.h>
-#include <executorch/sdk/etdump/etdump_flatcc.h>
 #include <executorch/util/util.h>
 
 #include <gflags/gflags.h>
diff --git a/examples/qualcomm/scripts/export_example.py b/examples/qualcomm/scripts/export_example.py
index b12a44993d..8339b9f5b5 100644
--- a/examples/qualcomm/scripts/export_example.py
+++ b/examples/qualcomm/scripts/export_example.py
@@ -15,12 +15,12 @@
     generate_htp_compiler_spec,
     generate_qnn_executorch_compiler_spec,
 )
+from executorch.devtools import generate_etrecord
 from executorch.examples.models import MODEL_NAME_TO_MODEL
 from executorch.examples.models.model_factory import EagerModelFactory
 from executorch.exir.backend.backend_api import to_backend, validation_disabled
 from executorch.exir.capture._config import ExecutorchBackendConfig
 from executorch.extension.export_util.utils import save_pte_program
-from executorch.sdk import generate_etrecord
 
 from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
 
diff --git a/examples/sdk/CMakeLists.txt b/examples/sdk/CMakeLists.txt
index 76034b0760..af7e9b15bc 100644
--- a/examples/sdk/CMakeLists.txt
+++ b/examples/sdk/CMakeLists.txt
@@ -49,7 +49,7 @@ add_executable(sdk_example_runner sdk_example_runner/sdk_example_runner.cpp)
 target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED)
 
 target_include_directories(
-  etdump INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../sdk/include
+  etdump INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../devtools/include
                    ${EXECUTORCH_ROOT}/third-party/flatcc/include
 )
 target_link_libraries(
diff --git a/examples/sdk/README.md b/examples/sdk/README.md
index 68043517fb..096f90864e 100644
--- a/examples/sdk/README.md
+++ b/examples/sdk/README.md
@@ -59,11 +59,11 @@ Running the program will generate an `ETDump` file (`.etdp`) at the location spe
 Once an `ETDump` has been generated, it can be viewed using the CLI inspector. This will print a tabular view of the data recorded in the ETDump.
 
 ```bash
-   python3 -m sdk.inspector.inspector_cli --etdump_path mv2_etdump.etdp
+   python3 -m devtools.inspector.inspector_cli --etdump_path mv2_etdump.etdp
    ```
 ### ETDump C++ API
 
-ETDump profiling can also be used in a custom C++ program. `ETDumpGen` is an implementation of the abstract `EventTracer` class.  Include the header file located at `sdk/etdump/etdump_flatcc.h`. To initialize the ETDump generator, construct it before loading the method from the program.
+ETDump profiling can also be used in a custom C++ program. `ETDumpGen` is an implementation of the abstract `EventTracer` class.  Include the header file located at `devtools/etdump/etdump_flatcc.h`. To initialize the ETDump generator, construct it before loading the method from the program.
 
 ```cpp
    torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen();
diff --git a/examples/sdk/scripts/export_bundled_program.py b/examples/sdk/scripts/export_bundled_program.py
index a34a0ab4d3..052f5e9962 100644
--- a/examples/sdk/scripts/export_bundled_program.py
+++ b/examples/sdk/scripts/export_bundled_program.py
@@ -11,19 +11,19 @@
 from typing import List
 
 import torch
-
-from executorch.exir import ExecutorchProgramManager
-from executorch.extension.export_util.utils import export_to_exec_prog
-from executorch.sdk import BundledProgram
-from executorch.sdk.bundled_program.config import (
+from executorch.devtools import BundledProgram
+from executorch.devtools.bundled_program.config import (
     MethodInputType,
     MethodTestCase,
     MethodTestSuite,
 )
-from executorch.sdk.bundled_program.serialize import (
+from executorch.devtools.bundled_program.serialize import (
     serialize_from_bundled_program_to_flatbuffer,
 )
 
+from executorch.exir import ExecutorchProgramManager
+from executorch.extension.export_util.utils import export_to_exec_prog
+
 from ...models import MODEL_NAME_TO_MODEL
 from ...models.model_factory import EagerModelFactory
 
diff --git a/examples/sdk/scripts/gen_sample_etrecord.py b/examples/sdk/scripts/gen_sample_etrecord.py
index c219ed4094..d2c4913b03 100644
--- a/examples/sdk/scripts/gen_sample_etrecord.py
+++ b/examples/sdk/scripts/gen_sample_etrecord.py
@@ -10,6 +10,7 @@
 from typing import Any
 
 import torch
+from executorch.devtools import generate_etrecord
 from executorch.exir import (
     EdgeCompileConfig,
     EdgeProgramManager,
@@ -18,7 +19,6 @@
     to_edge,
 )
 from executorch.exir.capture._config import ExecutorchBackendConfig
-from executorch.sdk import generate_etrecord
 from torch.export import export
 
 from ...models import MODEL_NAME_TO_MODEL
diff --git a/examples/sdk/sdk_example_runner/sdk_example_runner.cpp b/examples/sdk/sdk_example_runner/sdk_example_runner.cpp
index e2e42ab670..7e979937d1 100644
--- a/examples/sdk/sdk_example_runner/sdk_example_runner.cpp
+++ b/examples/sdk/sdk_example_runner/sdk_example_runner.cpp
@@ -22,13 +22,13 @@
 
 #include <gflags/gflags.h>
 
+#include <executorch/devtools/bundled_program/bundled_program.h>
+#include <executorch/devtools/etdump/etdump_flatcc.h>
 #include <executorch/extension/data_loader/buffer_data_loader.h>
 #include <executorch/runtime/executor/method.h>
 #include <executorch/runtime/executor/program.h>
 #include <executorch/runtime/platform/log.h>
 #include <executorch/runtime/platform/runtime.h>
-#include <executorch/sdk/bundled_program/bundled_program.h>
-#include <executorch/sdk/etdump/etdump_flatcc.h>
 #include <executorch/util/util.h>
 
 static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB
diff --git a/examples/sdk/sdk_example_runner/targets.bzl b/examples/sdk/sdk_example_runner/targets.bzl
index a5e8feb33c..025d42fee1 100644
--- a/examples/sdk/sdk_example_runner/targets.bzl
+++ b/examples/sdk/sdk_example_runner/targets.bzl
@@ -20,8 +20,8 @@ def define_common_targets():
             "//executorch/extension/data_loader:file_data_loader",
             "//executorch/extension/data_loader:buffer_data_loader",
             "//executorch/util:util",
-            "//executorch/sdk/etdump:etdump_flatcc",
-            "//executorch/sdk/bundled_program:runtime",
+            "//executorch/devtools/etdump:etdump_flatcc",
+            "//executorch/devtools/bundled_program:runtime",
         ],
         external_deps = [
             "gflags",
diff --git a/examples/xnnpack/aot_compiler.py b/examples/xnnpack/aot_compiler.py
index a816c4f0e7..32d67e0cd4 100644
--- a/examples/xnnpack/aot_compiler.py
+++ b/examples/xnnpack/aot_compiler.py
@@ -12,9 +12,9 @@
 
 import torch
 from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.devtools import generate_etrecord
 from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig
 from executorch.extension.export_util.utils import export_to_edge, save_pte_program
-from executorch.sdk import generate_etrecord
 
 from ..models import MODEL_NAME_TO_MODEL
 from ..models.model_factory import EagerModelFactory
diff --git a/examples/xnnpack/targets.bzl b/examples/xnnpack/targets.bzl
index 30cafa56fa..35df8999b4 100644
--- a/examples/xnnpack/targets.bzl
+++ b/examples/xnnpack/targets.bzl
@@ -32,7 +32,7 @@ def define_common_targets():
             "//executorch/examples/xnnpack/quantization:quant_utils",
             "//executorch/exir:lib",
             "//executorch/exir/backend:backend_api",
-            "//executorch/sdk:lib",
+            "//executorch/devtools:lib",
         ],
     )
 
diff --git a/exir/_serialize/TARGETS b/exir/_serialize/TARGETS
index 8ddf830039..49419a4159 100644
--- a/exir/_serialize/TARGETS
+++ b/exir/_serialize/TARGETS
@@ -14,8 +14,8 @@ cpp_python_extension(
         "//executorch/backends/fb/qnnpack/...",
         "//executorch/backends/vulkan/...",
         "//executorch/backends/xnnpack/...",
-        "//executorch/sdk/bundled_program/...",
-        "//executorch/sdk/etdump/...",
+        "//executorch/devtools/bundled_program/...",
+        "//executorch/devtools/etdump/...",
     ],
     deps = [
         "fbsource//third-party/flatbuffers:flatc_library",
@@ -45,6 +45,10 @@ runtime.python_library(
     visibility = [
         "//executorch/backends/...",
         "//executorch/codegen/...",
+        "//executorch/devtools:lib",
+        "//executorch/devtools/bundled_program/serialize:lib",
+        "//executorch/devtools/bundled_program/tests/...",
+        "//executorch/devtools/experimental/...",
         "//executorch/examples/async_exec:emit_program_lib",
         "//executorch/exir/...",
         "//executorch/exir/tests/...",
@@ -52,10 +56,6 @@ runtime.python_library(
         "//executorch/extension/pybindings/test:test",
         "//executorch/extension/pybindings/test:test-library",
         "//executorch/profiler/...",
-        "//executorch/sdk:lib",
-        "//executorch/sdk/bundled_program/serialize:lib",
-        "//executorch/sdk/bundled_program/tests/...",
-        "//executorch/sdk/experimental/...",
         "//executorch/test/...",
         "@EXECUTORCH_CLIENTS",
     ],
diff --git a/exir/emit/_emit_program.py b/exir/emit/_emit_program.py
index 0aebab649e..bf40a78bb6 100644
--- a/exir/emit/_emit_program.py
+++ b/exir/emit/_emit_program.py
@@ -78,6 +78,29 @@ def _remove_non_user_outputs(exported_program: ExportedProgram) -> torch.fx.Grap
     return gm
 
 
+# For each entry point in the model, determine if it's a joint graph,
+# and if it is, return a map of the indices in the model output at which
+# the gradient outputs and the parameter outputs start.
+def _get_training_metadata(methods: Dict[str, ExportedProgram]) -> Dict[str, int]:
+    gradients_method_prefix = "__et_training_gradients_index_"
+    parameters_method_prefix = "__et_training_parameters_index_"
+    training_metadata = {}
+    for name, method in methods.items():
+        found_grad = False
+        found_param = False
+        i = 0
+        for output_spec in method.graph_signature.output_specs:
+            if output_spec.kind == OutputKind.GRADIENT_TO_PARAMETER and not found_grad:
+                training_metadata[gradients_method_prefix + name] = i
+                found_grad = True
+            elif output_spec.kind == OutputKind.TOKEN and not found_param:
+                assert found_grad  # Params must come after gradients
+                training_metadata[parameters_method_prefix + name] = i
+                found_param = True
+            i += 1
+    return training_metadata
+
+
 def emit_program(
     methods: Union[ExportedProgram, Dict[str, ExportedProgram]],
     emit_stacktrace: bool = False,
@@ -143,6 +166,10 @@ def emit_program(
             emitter.instr_id_to_delegate_debug_id_map
         )
 
+    training_metadata = _get_training_metadata(methods)
+    if len(training_metadata) > 0:
+        plans.extend(emitter._emit_prim_getters(training_metadata))
+
     # emit any primitive getters
     if prim_getters is not None:
         plans.extend(emitter._emit_prim_getters(prim_getters))
diff --git a/exir/tests/test_joint_graph.py b/exir/tests/test_joint_graph.py
index 7c80439610..0e5a322397 100644
--- a/exir/tests/test_joint_graph.py
+++ b/exir/tests/test_joint_graph.py
@@ -108,3 +108,23 @@ def forward(self, x, y):
         self.assertTrue(torch.allclose(m.linear.bias.grad, et_outputs[2]))
         self.assertTrue(torch.allclose(m.linear.weight, et_outputs[3]))
         self.assertTrue(torch.allclose(m.linear.bias, et_outputs[4]))
+
+        self.assertEqual(
+            len(et.executorch_program.execution_plan), 3
+        )  # forward + 2 training metadata functions
+
+        # gradient outputs start at index 1
+        self.assertEqual(
+            et.executorch_program.execution_plan[1]  # pyre-ignore
+            .values[0]
+            .val.int_val,
+            1,
+        )
+
+        # parameter outputs start at index 3
+        self.assertEqual(
+            et.executorch_program.execution_plan[2]  # pyre-ignore
+            .values[0]
+            .val.int_val,
+            3,
+        )
diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp
index 727c04774b..d31cbaf369 100644
--- a/extension/llm/custom_ops/op_sdpa.cpp
+++ b/extension/llm/custom_ops/op_sdpa.cpp
@@ -158,7 +158,7 @@ static inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) {
 template <
     typename scalar_t,
     typename std::enable_if_t<
-        torch::executor::is_reduced_floating_point<scalar_t>::value,
+        ::executorch::runtime::is_reduced_floating_point<scalar_t>::value,
         int> = 0>
 static inline scalar_t* conditional_data_ptr(float* ptr, scalar_t* ptr2) {
   (void)ptr;
@@ -247,7 +247,7 @@ void cpu_flash_attention(
       "KV_split_size must be greater than q_split_size");
 
   constexpr bool is_reduced_type =
-      torch::executor::is_reduced_floating_point<scalar_t>::value;
+      ::executorch::runtime::is_reduced_floating_point<scalar_t>::value;
 
   ET_CHECK_MSG(
       !is_reduced_type, "FlashAttention does not support reduced types.");
diff --git a/extension/llm/custom_ops/op_sdpa_test.cpp b/extension/llm/custom_ops/op_sdpa_test.cpp
index 116be2508d..43f2022917 100644
--- a/extension/llm/custom_ops/op_sdpa_test.cpp
+++ b/extension/llm/custom_ops/op_sdpa_test.cpp
@@ -17,6 +17,7 @@
 #include <gtest/gtest.h>
 
 using namespace ::testing;
+using executorch::runtime::testing::TensorFactory;
 
 exec_aten::Tensor op_scaled_dot_product_attention(
     const exec_aten::Tensor& query,
@@ -37,7 +38,7 @@ Most tests are generated by FACTO
 */
 
 TEST(OpScaledDotProductAttentionTest, CorrectnessTest_105) {
-  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
+  TensorFactory<exec_aten::ScalarType::Float> tfFloat;
 
   exec_aten::Tensor query = tfFloat.make(
       {1, 1, 4, 4},
@@ -123,7 +124,7 @@ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_105) {
 }
 
 TEST(OpScaledDotProductAttentionTest, CorrectnessTest_11) {
-  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
+  TensorFactory<exec_aten::ScalarType::Float> tfFloat;
 
   exec_aten::Tensor query = tfFloat.make(
       {1, 1, 1, 8},
@@ -152,7 +153,7 @@ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_11) {
 }
 
 TEST(OpScaledDotProductAttentionTest, CorrectnessTest_13) {
-  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
+  TensorFactory<exec_aten::ScalarType::Float> tfFloat;
 
   exec_aten::Tensor query = tfFloat.make(
       {1, 8, 1, 1}, {-47.0, 21.25, 74.75, 46.375, 21.0, -29.0, 2.625, 83.125});
@@ -181,7 +182,7 @@ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_13) {
 }
 
 TEST(OpScaledDotProductAttentionTest, CorrectnessTest_17) {
-  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
+  TensorFactory<exec_aten::ScalarType::Float> tfFloat;
 
   exec_aten::Tensor query = tfFloat.make(
       {3, 2, 2, 6},
@@ -257,7 +258,7 @@ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_17) {
 }
 
 TEST(OpScaledDotProductAttentionTest, CorrectnessTest_18) {
-  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
+  TensorFactory<exec_aten::ScalarType::Float> tfFloat;
 
   exec_aten::Tensor query = tfFloat.make(
       {3, 2, 2, 6},
@@ -333,7 +334,7 @@ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_18) {
 // Disabling this test because right now we are enforcing that
 // attention mask must be 2D
 TEST(OpScaledDotProductAttentionTest, CorrectnessTest_19) {
-  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
+  TensorFactory<exec_aten::ScalarType::Float> tfFloat;
 
   exec_aten::Tensor query = tfFloat.make(
       {3, 2, 2, 6},
@@ -479,7 +480,7 @@ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_19) {
 */
 
 TEST(OpScaledDotProductAttentionTest, CorrectnessTest_51) {
-  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
+  TensorFactory<exec_aten::ScalarType::Float> tfFloat;
 
   exec_aten::Tensor query = tfFloat.make(
       {1, 1, 8, 3},
diff --git a/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp b/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp
index 819dd70217..2a8124bc1e 100644
--- a/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp
+++ b/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp
@@ -16,6 +16,7 @@
 #include <gtest/gtest.h>
 
 using namespace ::testing;
+using executorch::runtime::testing::TensorFactory;
 
 exec_aten::Tensor op_sdpa_with_kv_cache(
     const exec_aten::Tensor& query,
@@ -79,7 +80,7 @@ Missing tests:
 5. Different dtypes, fp16, bf16, double (or expect throw)
 */
 TEST(OpScaledDotProductAttentionTest, BasicTest) {
-  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
+  TensorFactory<exec_aten::ScalarType::Float> tfFloat;
 
   exec_aten::Tensor query = tfFloat.make(
       {1, 1, 4, 4},
@@ -360,7 +361,7 @@ TEST(OpScaledDotProductAttentionTest, BasicTest) {
 }
 
 TEST(OpScaledDotProductAttentionTest, LargerTest) {
-  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
+  TensorFactory<exec_aten::ScalarType::Float> tfFloat;
 
   exec_aten::Tensor query = tfFloat.make(
       {1, 1, 7, 4}, {0.8823, 0.9150, 0.3829, 0.9593, 0.3904, 0.6009, 0.2566,
@@ -524,7 +525,7 @@ TEST(OpScaledDotProductAttentionTest, LargerTest) {
 }
 
 TEST(OpScaledDotProductAttentionTest, BasicTestWithAttnMask) {
-  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
+  TensorFactory<exec_aten::ScalarType::Float> tfFloat;
 
   exec_aten::Tensor query = tfFloat.make(
       {1, 1, 4, 4},
@@ -807,7 +808,7 @@ TEST(OpScaledDotProductAttentionTest, BasicTestWithAttnMask) {
 }
 
 TEST(OpScaledDotProductAttentionTest, SequenceTest) {
-  torch::executor::testing::TensorFactory<exec_aten::ScalarType::Float> tfFloat;
+  TensorFactory<exec_aten::ScalarType::Float> tfFloat;
 
   exec_aten::Tensor query = tfFloat.make(
       {1, 1, 8, 4},
diff --git a/extension/llm/custom_ops/op_tile_crop_test.cpp b/extension/llm/custom_ops/op_tile_crop_test.cpp
index 565f510913..36841b80f1 100644
--- a/extension/llm/custom_ops/op_tile_crop_test.cpp
+++ b/extension/llm/custom_ops/op_tile_crop_test.cpp
@@ -15,7 +15,7 @@
 using namespace ::testing;
 using exec_aten::ScalarType;
 using exec_aten::Tensor;
-using torch::executor::testing::TensorFactory;
+using executorch::runtime::testing::TensorFactory;
 
 class OpTileCropOutTest : public OperatorTest {
  protected:
diff --git a/extension/llm/runner/image.h b/extension/llm/runner/image.h
index e18353dda9..32a9f87818 100644
--- a/extension/llm/runner/image.h
+++ b/extension/llm/runner/image.h
@@ -13,7 +13,9 @@
 // patternlint-disable-next-line executorch-cpp-nostdinc
 #include <vector>
 
-namespace torch::executor {
+namespace executorch {
+namespace extension {
+namespace llm {
 
 struct Image {
   // Assuming NCHW format
@@ -23,4 +25,14 @@ struct Image {
   int32_t channels;
 };
 
-} // namespace torch::executor
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::Image;
+} // namespace executor
+} // namespace torch
diff --git a/extension/llm/runner/image_prefiller.h b/extension/llm/runner/image_prefiller.h
index 64b623be36..879b0a6e21 100644
--- a/extension/llm/runner/image_prefiller.h
+++ b/extension/llm/runner/image_prefiller.h
@@ -13,23 +13,27 @@
 #include <executorch/extension/llm/runner/image.h>
 #include <executorch/extension/module/module.h>
 
-namespace torch::executor {
+namespace executorch {
+namespace extension {
+namespace llm {
 
 // Assuming kv cache and parallel prefill are enabled.
 class ImagePrefiller {
  public:
-  explicit ImagePrefiller(Module* module) : module_(module) {}
+  explicit ImagePrefiller(::executorch::extension::Module* module)
+      : module_(module) {}
+
   /**
    * Prefill an LLM Module with the given image input.
    * @param image The image input to the multimodal LLM.
    * @param start_pos The starting position in KV cache of the input in the LLM
    * @return The next token of the LLM Module after prefill.
    */
-  virtual Result<exec_aten::Tensor> prefill(
+  virtual ::executorch::runtime::Result<exec_aten::Tensor> prefill(
       Image& image,
       int64_t start_pos = 0) = 0;
 
-  virtual Error load() = 0;
+  virtual ::executorch::runtime::Error load() = 0;
   virtual bool is_method_loaded() = 0;
 
   virtual ~ImagePrefiller() = default;
@@ -38,4 +42,14 @@ class ImagePrefiller {
   Module* module_;
 };
 
-} // namespace torch::executor
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::ImagePrefiller;
+} // namespace executor
+} // namespace torch
diff --git a/extension/llm/runner/metadata_util.h b/extension/llm/runner/metadata_util.h
index 4ea2d9eebd..5f55dad538 100644
--- a/extension/llm/runner/metadata_util.h
+++ b/extension/llm/runner/metadata_util.h
@@ -14,7 +14,10 @@
 
 #include <executorch/extension/module/module.h>
 
-namespace torch::executor {
+namespace executorch {
+namespace extension {
+namespace llm {
+
 template <typename T>
 T get_module_metadata(
     Module* module,
@@ -26,9 +29,10 @@ T get_module_metadata(
 
   T res = default_val;
   if (model_methods.count(method_name)) {
-    Result<std::vector<EValue>> outputs = module->execute(method_name);
+    ::executorch::runtime::Result<std::vector<::executorch::runtime::EValue>>
+        outputs = module->execute(method_name);
     if (outputs.ok()) {
-      std::vector<EValue> outs = outputs.get();
+      std::vector<::executorch::runtime::EValue> outs = outputs.get();
       if (outs.size() > 0) {
         res = outs[0].to<T>();
       }
@@ -43,4 +47,7 @@ T get_module_metadata(
   ET_LOG(Info, "%s: %lld", method_name.c_str(), (long long)res);
   return res;
 }
-} // namespace torch::executor
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h
index ac38085be4..745f086f80 100644
--- a/extension/llm/runner/multimodal_runner.h
+++ b/extension/llm/runner/multimodal_runner.h
@@ -33,8 +33,9 @@
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/runner_util/managed_tensor.h>
 
-namespace torch::executor {
-using Stats = ::executorch::llm::Stats;
+namespace executorch {
+namespace extension {
+namespace llm {
 
 class MultimodalRunner {
  public:
@@ -53,8 +54,8 @@ class MultimodalRunner {
   }
 
   virtual bool is_loaded() = 0;
-  virtual Error load() = 0;
-  virtual Error generate(
+  virtual ::executorch::runtime::Error load() = 0;
+  virtual ::executorch::runtime::Error generate(
       std::vector<Image>& images,
       const std::string& prompt,
       int32_t seq_len = 1024,
@@ -91,4 +92,14 @@ class MultimodalRunner {
   Stats stats_;
 };
 
-} // namespace torch::executor
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::MultimodalRunner;
+} // namespace executor
+} // namespace torch
diff --git a/extension/llm/runner/stats.h b/extension/llm/runner/stats.h
index f62be0940c..902ba89296 100644
--- a/extension/llm/runner/stats.h
+++ b/extension/llm/runner/stats.h
@@ -14,7 +14,10 @@
 #include <sstream>
 // patternlint-disable-next-line executorch-cpp-nostdinc
 #include <string>
-namespace executorch::llm {
+
+namespace executorch {
+namespace extension {
+namespace llm {
 
 struct Stats {
   // Scaling factor for timestamps - in this case, we use ms.
@@ -41,12 +44,11 @@ struct Stats {
   // Token count from generated (total - prompt)
   int64_t num_generated_tokens;
   inline void on_sampling_begin() {
-    aggregate_sampling_timer_start_timestamp =
-        ::torch::executor::util::time_in_ms();
+    aggregate_sampling_timer_start_timestamp = time_in_ms();
   }
   inline void on_sampling_end() {
-    aggregate_sampling_time_ms += ::torch::executor::util::time_in_ms() -
-        aggregate_sampling_timer_start_timestamp;
+    aggregate_sampling_time_ms +=
+        time_in_ms() - aggregate_sampling_timer_start_timestamp;
     aggregate_sampling_timer_start_timestamp = 0;
   }
 
@@ -132,4 +134,16 @@ inline void print_report(const Stats& stats) {
           stats.SCALING_FACTOR_UNITS_PER_SECOND);
 }
 
-} // namespace executorch::llm
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace executorch {
+namespace llm {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::kTopp;
+using ::executorch::extension::llm::print_report;
+using ::executorch::extension::llm::Stats;
+} // namespace llm
+} // namespace executorch
diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp
index 3de75ceccb..a0963769ea 100644
--- a/extension/llm/runner/text_decoder_runner.cpp
+++ b/extension/llm/runner/text_decoder_runner.cpp
@@ -8,11 +8,15 @@
 
 // Given inputs, run a text decoder and return logits.
 
-#include <executorch/extension/llm/runner/stats.h>
 #include <executorch/extension/llm/runner/text_decoder_runner.h>
+
 #include <ctime>
 
-namespace torch::executor {
+#include <executorch/extension/llm/runner/stats.h>
+
+namespace executorch {
+namespace extension {
+namespace llm {
 
 // NOTE: we observed ~2x loading performance increase on iPhone 15
 // and a ~5% improvement on Galaxy S22 by switching to
@@ -26,22 +30,22 @@ TextDecoderRunner::TextDecoderRunner(
       sampler_(std::make_unique<Sampler>(
           vocab_size,
           temperature,
-          ::executorch::llm::kTopp,
+          kTopp,
           static_cast<unsigned long long>(std::time(nullptr)))),
       use_kv_cache_(use_kv_cache) {}
 
 // This function is functional, meaning it shouldn't modify any state of the
 // input. It should be safe to call multiple times with the same inputs. The
 // outer loop (call site) is responsible for managing state.
-Result<exec_aten::Tensor> TextDecoderRunner::step(
+::executorch::runtime::Result<exec_aten::Tensor> TextDecoderRunner::step(
     ManagedTensor& managed_tokens,
     ManagedTensor& managed_start_pos) {
   auto tokens = managed_tokens.get_aliasing_tensor();
   // ET_LOG(Info, "Input token %" PRIu64, input_token);
   if (use_kv_cache_) {
     auto start_pos = managed_start_pos.get_aliasing_tensor();
-    Result<std::vector<EValue>> outputs_res =
-        module_->forward({tokens, start_pos});
+    ::executorch::runtime::Result<std::vector<::executorch::runtime::EValue>>
+        outputs_res = module_->forward({tokens, start_pos});
     ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
     ET_CHECK_MSG(
         outputs_res.get().size() == 1,
@@ -55,7 +59,8 @@ Result<exec_aten::Tensor> TextDecoderRunner::step(
   } else { // no kv cache
     (void)managed_start_pos; // unused
 
-    Result<std::vector<EValue>> outputs_res = module_->forward({tokens});
+    ::executorch::runtime::Result<std::vector<::executorch::runtime::EValue>>
+        outputs_res = module_->forward({tokens});
     ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
     ET_CHECK_MSG(
         outputs_res.get().size() == 1,
@@ -69,4 +74,6 @@ Result<exec_aten::Tensor> TextDecoderRunner::step(
   }
 }
 
-} // namespace torch::executor
+} // namespace llm
+} // namespace extension
+} // namespace executorch
diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h
index 49ddea6629..6a8e3396fe 100644
--- a/extension/llm/runner/text_decoder_runner.h
+++ b/extension/llm/runner/text_decoder_runner.h
@@ -16,7 +16,9 @@
 // patternlint-disable-next-line executorch-cpp-nostdinc
 #include <functional>
 
-namespace torch::executor {
+namespace executorch {
+namespace extension {
+namespace llm {
 
 class TextDecoderRunner {
  public:
@@ -35,7 +37,7 @@ class TextDecoderRunner {
    * Module.
    * @return The output of the LLM Module. This will be a tensor of logits.
    */
-  virtual Result<exec_aten::Tensor> step(
+  virtual ::executorch::runtime::Result<exec_aten::Tensor> step(
       ManagedTensor& input,
       ManagedTensor& start_pos);
 
@@ -43,7 +45,7 @@ class TextDecoderRunner {
    * Load the Module for text decode purpose.
    * @return The error code.
    */
-  virtual Error load() {
+  virtual ::executorch::runtime::Error load() {
     return module_->load_method("forward");
   }
 
@@ -70,13 +72,13 @@ class TextDecoderRunner {
     auto vocab_size = logits_tensor.size(2);
 
     switch (logits_tensor.scalar_type()) {
-      case ScalarType::Float: {
+      case exec_aten::ScalarType::Float: {
         float* logits = logits_tensor.mutable_data_ptr<float>();
         float* logits_last = logits;
         logits_last += (num_tokens - 1) * vocab_size;
         return sampler_->sample(logits_last);
       }
-      case ScalarType::Half: {
+      case exec_aten::ScalarType::Half: {
         exec_aten::Half* logits =
             logits_tensor.mutable_data_ptr<exec_aten::Half>();
         exec_aten::Half* logits_last = logits;
@@ -99,4 +101,14 @@ class TextDecoderRunner {
   bool should_stop_{false};
 };
 
-} // namespace torch::executor
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::TextDecoderRunner;
+} // namespace executor
+} // namespace torch
diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp
index beafb21434..19fc2d5936 100644
--- a/extension/llm/runner/text_prefiller.cpp
+++ b/extension/llm/runner/text_prefiller.cpp
@@ -11,7 +11,9 @@
 
 #include <executorch/extension/llm/runner/text_prefiller.h>
 
-namespace torch::executor {
+namespace executorch {
+namespace extension {
+namespace llm {
 
 TextPrefiller::TextPrefiller(
     Tokenizer* tokenizer,
@@ -23,7 +25,7 @@ TextPrefiller::TextPrefiller(
       use_kv_cache_(use_kv_cache),
       enable_parallel_prefill_(enable_parallel_prefill) {}
 
-Result<uint64_t> TextPrefiller::prefill(
+::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
     std::vector<uint64_t>& prompt_tokens,
     int64_t start_pos,
     std::function<void(const std::string&)> token_callback) {
@@ -40,11 +42,14 @@ Result<uint64_t> TextPrefiller::prefill(
   if (enable_parallel_prefill_ || !use_kv_cache_) {
     // initialize tensor wrappers
     ManagedTensor managed_tokens(
-        prompt_tokens.data(), {1, num_prompt_tokens}, ScalarType::Long);
+        prompt_tokens.data(),
+        {1, num_prompt_tokens},
+        exec_aten::ScalarType::Long);
 
-    ManagedTensor managed_start_pos(&start_pos, {1}, ScalarType::Long);
+    ManagedTensor managed_start_pos(
+        &start_pos, {1}, exec_aten::ScalarType::Long);
 
-    Result<exec_aten::Tensor> outputs_res =
+    ::executorch::runtime::Result<exec_aten::Tensor> outputs_res =
         text_decoder_runner_->step(managed_tokens, managed_start_pos);
 
     ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
@@ -76,9 +81,11 @@ Result<uint64_t> TextPrefiller::prefill(
     cur_token = prompt_tokens[0];
 
     // initialize tensor wrappers
-    ManagedTensor managed_tokens(&cur_token, {1, 1}, ScalarType::Long);
+    ManagedTensor managed_tokens(
+        &cur_token, {1, 1}, exec_aten::ScalarType::Long);
 
-    ManagedTensor managed_start_pos(&pos_data, {1}, ScalarType::Long);
+    ManagedTensor managed_start_pos(
+        &pos_data, {1}, exec_aten::ScalarType::Long);
 
     // run the first token and get back logits tensor. Assuming the first token
     // is bos so don't callback.
@@ -114,4 +121,6 @@ Result<uint64_t> TextPrefiller::prefill(
   return cur_token;
 }
 
-} // namespace torch::executor
+} // namespace llm
+} // namespace extension
+} // namespace executorch
diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h
index 7293fdca2a..bcec2b895f 100644
--- a/extension/llm/runner/text_prefiller.h
+++ b/extension/llm/runner/text_prefiller.h
@@ -16,7 +16,9 @@
 // patternlint-disable-next-line executorch-cpp-nostdinc
 #include <functional>
 
-namespace torch::executor {
+namespace executorch {
+namespace extension {
+namespace llm {
 
 class TextPrefiller {
  public:
@@ -35,7 +37,7 @@ class TextPrefiller {
    * token in the prompt.
    * @return The next token of the LLM Module after prefill.
    */
-  Result<uint64_t> prefill(
+  ::executorch::runtime::Result<uint64_t> prefill(
       std::vector<uint64_t>& prompt_tokens,
       int64_t start_pos = 0,
       std::function<void(const std::string&)> token_callback = {});
@@ -47,4 +49,14 @@ class TextPrefiller {
   bool enable_parallel_prefill_;
 };
 
-} // namespace torch::executor
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::TextPrefiller;
+} // namespace executor
+} // namespace torch
diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h
index 9b3a31f3f7..46d682a4e4 100644
--- a/extension/llm/runner/text_token_generator.h
+++ b/extension/llm/runner/text_token_generator.h
@@ -13,8 +13,9 @@
 #include <executorch/extension/llm/runner/text_decoder_runner.h>
 #include <executorch/extension/llm/tokenizer/tokenizer.h>
 
-namespace torch::executor {
-using Stats = ::executorch::llm::Stats;
+namespace executorch {
+namespace extension {
+namespace llm {
 
 class TextTokenGenerator {
  public:
@@ -41,7 +42,7 @@ class TextTokenGenerator {
    * @param token_callback what to do after a token is generated.
    * @return how many tokens are generated.
    */
-  inline Result<int64_t> generate(
+  inline ::executorch::runtime::Result<int64_t> generate(
       std::vector<uint64_t> tokens,
       int64_t start_pos,
       int32_t seq_len,
@@ -69,14 +70,14 @@ class TextTokenGenerator {
 
     // initialize tensor wrappers
     ManagedTensor tokens_managed(
-        token_data.data(), token_shape, ScalarType::Long);
+        token_data.data(), token_shape, exec_aten::ScalarType::Long);
 
-    ManagedTensor start_pos_managed(&pos, {1}, ScalarType::Long);
+    ManagedTensor start_pos_managed(&pos, {1}, exec_aten::ScalarType::Long);
 
     // Generate our tokens
     while (pos < seq_len - 1) {
       // Run the model
-      Result<exec_aten::Tensor> logits_res =
+      ::executorch::runtime::Result<exec_aten::Tensor> logits_res =
           text_decoder_runner_->step(tokens_managed, start_pos_managed);
 
       ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error());
@@ -136,4 +137,15 @@ class TextTokenGenerator {
   // stats
   Stats* stats_;
 };
-} // namespace torch::executor
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::TextTokenGenerator;
+} // namespace executor
+} // namespace torch
diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h
index 5d4792b641..baf6af328b 100644
--- a/extension/llm/runner/util.h
+++ b/extension/llm/runner/util.h
@@ -11,9 +11,9 @@
 #include <time.h>
 #include <cctype>
 
-namespace torch {
-namespace executor {
-namespace util {
+namespace executorch {
+namespace extension {
+namespace llm {
 
 void inline safe_printf(const char* piece) {
   // piece might be a raw byte token, and we only want to print printable chars
@@ -44,6 +44,17 @@ long inline time_in_ms() {
   return time.tv_sec * 1000 + time.tv_nsec / 1000000;
 }
 
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+namespace util {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::safe_printf;
+using ::executorch::extension::llm::time_in_ms;
 } // namespace util
 } // namespace executor
 } // namespace torch
diff --git a/extension/llm/sampler/sampler.cpp b/extension/llm/sampler/sampler.cpp
index 6b0f155f12..64e1307d26 100644
--- a/extension/llm/sampler/sampler.cpp
+++ b/extension/llm/sampler/sampler.cpp
@@ -35,8 +35,9 @@
 #include <executorch/extension/llm/sampler/sampler.h>
 #include <algorithm>
 
-namespace torch {
-namespace executor {
+namespace executorch {
+namespace extension {
+namespace llm {
 
 // sampler stuff
 template <typename T>
@@ -192,5 +193,6 @@ int32_t Sampler::sample(T* logits) {
 template int32_t Sampler::sample<float>(float* logits);
 template int32_t Sampler::sample<exec_aten::Half>(exec_aten::Half* logits);
 
-} // namespace executor
-} // namespace torch
+} // namespace llm
+} // namespace extension
+} // namespace executorch
diff --git a/extension/llm/sampler/sampler.h b/extension/llm/sampler/sampler.h
index 584a010bba..9d6d742e59 100644
--- a/extension/llm/sampler/sampler.h
+++ b/extension/llm/sampler/sampler.h
@@ -20,8 +20,9 @@
 
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 
-namespace torch {
-namespace executor {
+namespace executorch {
+namespace extension {
+namespace llm {
 // A simple llama2 sampler.
 
 template <typename T>
@@ -57,5 +58,15 @@ class Sampler {
   unsigned long long rng_state_;
 };
 
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::ProbIndex;
+using ::executorch::extension::llm::Sampler;
 } // namespace executor
 } // namespace torch
diff --git a/extension/llm/sampler/test/test_sampler.cpp b/extension/llm/sampler/test/test_sampler.cpp
index 2dac03d976..044a39458e 100644
--- a/extension/llm/sampler/test/test_sampler.cpp
+++ b/extension/llm/sampler/test/test_sampler.cpp
@@ -12,14 +12,10 @@
 #include <torch/torch.h>
 
 using namespace ::testing;
+using ::executorch::extension::llm::Sampler;
 
-namespace torch {
-namespace executor {
-
-class SamplerTest : public Test {};
-
-TEST_F(SamplerTest, TestArgMax) {
-  torch::executor::Sampler sampler{
+TEST(SamplerTest, TestArgMax) {
+  Sampler sampler{
       /*vocab_size*/ 32000,
       /*temperature*/ 0.0f,
       /*topp*/ 0.9f,
@@ -31,8 +27,8 @@ TEST_F(SamplerTest, TestArgMax) {
   EXPECT_EQ(sampler.sample(input.data_ptr<float>()), 396);
 }
 
-TEST_F(SamplerTest, TestArgMaxWithFP16) {
-  torch::executor::Sampler sampler{
+TEST(SamplerTest, TestArgMaxWithFP16) {
+  Sampler sampler{
       /*vocab_size*/ 32000,
       /*temperature*/ 0.0f,
       /*topp*/ 0.9f,
@@ -43,6 +39,3 @@ TEST_F(SamplerTest, TestArgMaxWithFP16) {
   input[0][0][396] = 1.0f;
   EXPECT_EQ(sampler.sample(input.data_ptr<c10::Half>()), 396);
 }
-
-} // namespace executor
-} // namespace torch
diff --git a/extension/llm/tokenizer/base64.h b/extension/llm/tokenizer/base64.h
index 9fb1b5129b..7337ecead4 100644
--- a/extension/llm/tokenizer/base64.h
+++ b/extension/llm/tokenizer/base64.h
@@ -29,8 +29,10 @@
 #include <string>
 #include <string_view>
 
-namespace torch {
-namespace executor {
+namespace executorch {
+namespace extension {
+namespace llm {
+
 namespace base64 {
 
 std::string decode(const std::string_view& input);
@@ -176,5 +178,16 @@ inline std::string decode(const std::string_view& input) {
 
 } // namespace base64
 
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+namespace base64 {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::base64::decode;
+} // namespace base64
 } // namespace executor
 } // namespace torch
diff --git a/extension/llm/tokenizer/bpe_tokenizer.cpp b/extension/llm/tokenizer/bpe_tokenizer.cpp
index 07d138548d..1548f000a5 100644
--- a/extension/llm/tokenizer/bpe_tokenizer.cpp
+++ b/extension/llm/tokenizer/bpe_tokenizer.cpp
@@ -10,8 +10,12 @@
 
 #include <cstring>
 
-namespace torch {
-namespace executor {
+using ::executorch::runtime::Error;
+using ::executorch::runtime::Result;
+
+namespace executorch {
+namespace extension {
+namespace llm {
 
 static int compare_tokens(const void* a, const void* b) {
   if (((TokenIndex*)a)->str == nullptr) {
@@ -311,5 +315,6 @@ BPETokenizer::encode(const std::string& text, int8_t bos, int8_t eos) const {
   return Result(tokens);
 }
 
-} // namespace executor
-} // namespace torch
+} // namespace llm
+} // namespace extension
+} // namespace executorch
diff --git a/extension/llm/tokenizer/bpe_tokenizer.h b/extension/llm/tokenizer/bpe_tokenizer.h
index 7ea8402583..7fc7306c10 100644
--- a/extension/llm/tokenizer/bpe_tokenizer.h
+++ b/extension/llm/tokenizer/bpe_tokenizer.h
@@ -11,8 +11,9 @@
 #include <executorch/extension/llm/tokenizer/tokenizer.h>
 #include <memory>
 
-namespace torch {
-namespace executor {
+namespace executorch {
+namespace extension {
+namespace llm {
 
 struct TokenIndex {
   const char* str;
@@ -26,13 +27,14 @@ class BPETokenizer : public Tokenizer {
   explicit BPETokenizer();
   ~BPETokenizer() override;
 
-  Error load(const std::string& tokenizer_path) override;
+  ::executorch::runtime::Error load(const std::string& tokenizer_path) override;
 
-  Result<std::vector<uint64_t>>
+  ::executorch::runtime::Result<std::vector<uint64_t>>
   encode(const std::string& input, int8_t bos, int8_t eos) const override;
 
-  Result<std::string> decode(uint64_t prev_token, uint64_t token)
-      const override;
+  ::executorch::runtime::Result<std::string> decode(
+      uint64_t prev_token,
+      uint64_t token) const override;
 
  private:
   std::unique_ptr<char*[]> vocab_ = nullptr;
@@ -41,5 +43,16 @@ class BPETokenizer : public Tokenizer {
   unsigned int max_token_length_ = 0;
   unsigned char byte_pieces_[512]; // stores all single-byte strings
 };
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::BPETokenizer;
+using ::executorch::extension::llm::TokenIndex;
 } // namespace executor
 } // namespace torch
diff --git a/extension/llm/tokenizer/test/test_bpe_tokenizer.cpp b/extension/llm/tokenizer/test/test_bpe_tokenizer.cpp
index 17bb83e2f4..c553fe59f9 100644
--- a/extension/llm/tokenizer/test/test_bpe_tokenizer.cpp
+++ b/extension/llm/tokenizer/test/test_bpe_tokenizer.cpp
@@ -13,13 +13,15 @@
 
 using namespace ::testing;
 
-namespace torch {
-namespace executor {
+using ::executorch::extension::llm::BPETokenizer;
+using ::executorch::extension::llm::Tokenizer;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::Result;
 
 class TokenizerExtensionTest : public Test {
  public:
   void SetUp() override {
-    torch::executor::runtime_init();
+    executorch::runtime::runtime_init();
     tokenizer_ = std::make_unique<BPETokenizer>();
     modelPath_ =
         std::getenv("RESOURCES_PATH") + std::string("/test_bpe_tokenizer.bin");
@@ -65,6 +67,3 @@ TEST_F(TokenizerExtensionTest, SafeToDestruct) {
   tokenizer_ = std::make_unique<BPETokenizer>();
   tokenizer_.reset();
 }
-
-} // namespace executor
-} // namespace torch
diff --git a/extension/llm/tokenizer/test/test_tiktoken.cpp b/extension/llm/tokenizer/test/test_tiktoken.cpp
index f423183b8a..a81b20bcf8 100644
--- a/extension/llm/tokenizer/test/test_tiktoken.cpp
+++ b/extension/llm/tokenizer/test/test_tiktoken.cpp
@@ -12,9 +12,10 @@
 #include <vector>
 
 using namespace ::testing;
-
-namespace torch {
-namespace executor {
+using ::executorch::extension::llm::Tiktoken;
+using ::executorch::extension::llm::Tokenizer;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::Result;
 
 namespace {
 // Test case based on Llama 2
@@ -49,7 +50,7 @@ static inline std::unique_ptr<std::vector<std::string>> _get_special_tokens() {
 class TiktokenExtensionTest : public Test {
  public:
   void SetUp() override {
-    torch::executor::runtime_init();
+    executorch::runtime::runtime_init();
     tokenizer_ = std::make_unique<Tiktoken>(
         _get_special_tokens(), kBOSTokenIndex, kEOSTokenIndex);
     modelPath_ = std::getenv("RESOURCES_PATH") +
@@ -139,5 +140,3 @@ TEST_F(TiktokenExtensionTest, ConstructionWithInvalidEOSIndex) {
       "");
 #endif
 }
-} // namespace executor
-} // namespace torch
diff --git a/extension/llm/tokenizer/tiktoken.cpp b/extension/llm/tokenizer/tiktoken.cpp
index 67d1f916f2..7b15d25f0d 100644
--- a/extension/llm/tokenizer/tiktoken.cpp
+++ b/extension/llm/tokenizer/tiktoken.cpp
@@ -30,8 +30,12 @@
 #include <fstream>
 #include <limits>
 
-namespace torch {
-namespace executor {
+using ::executorch::runtime::Error;
+using ::executorch::runtime::Result;
+
+namespace executorch {
+namespace extension {
+namespace llm {
 
 // ------------------------------Util start------------------------------------
 
@@ -415,5 +419,6 @@ Result<std::string> Tiktoken::decode(uint64_t prev, uint64_t cur) const {
 }
 // -------------------------public method end-------------------------------
 
-} // namespace executor
-} // namespace torch
+} // namespace llm
+} // namespace extension
+} // namespace executorch
diff --git a/extension/llm/tokenizer/tiktoken.h b/extension/llm/tokenizer/tiktoken.h
index 0b1b1fa61e..7d78f8b60d 100644
--- a/extension/llm/tokenizer/tiktoken.h
+++ b/extension/llm/tokenizer/tiktoken.h
@@ -14,8 +14,9 @@
 #include <optional>
 #include <unordered_map>
 
-namespace torch {
-namespace executor {
+namespace executorch {
+namespace extension {
+namespace llm {
 
 using Encoder = std::unordered_map<std::string, uint64_t>;
 using Decoder = std::unordered_map<uint64_t, std::string>;
@@ -33,13 +34,14 @@ class Tiktoken : public Tokenizer {
       size_t bos_token_index,
       size_t eos_token_index);
 
-  Error load(const std::string& tokenizer_path) override;
+  ::executorch::runtime::Error load(const std::string& tokenizer_path) override;
 
-  Result<std::vector<uint64_t>>
+  ::executorch::runtime::Result<std::vector<uint64_t>>
   encode(const std::string& input, int8_t bos, int8_t eos) const override;
 
-  Result<std::string> decode(uint64_t prev_token, uint64_t token)
-      const override;
+  ::executorch::runtime::Result<std::string> decode(
+      uint64_t prev_token,
+      uint64_t token) const override;
 
  private:
   template <typename T>
@@ -74,5 +76,18 @@ class Tiktoken : public Tokenizer {
   Re2UPtr _regex;
   Re2UPtr _special_token_regex;
 };
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::Decoder;
+using ::executorch::extension::llm::Encoder;
+using ::executorch::extension::llm::Re2UPtr;
+using ::executorch::extension::llm::Tiktoken;
 } // namespace executor
 } // namespace torch
diff --git a/extension/llm/tokenizer/tokenizer.h b/extension/llm/tokenizer/tokenizer.h
index b49dc245eb..3115cbdff7 100644
--- a/extension/llm/tokenizer/tokenizer.h
+++ b/extension/llm/tokenizer/tokenizer.h
@@ -17,8 +17,9 @@
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/result.h>
 
-namespace torch {
-namespace executor {
+namespace executorch {
+namespace extension {
+namespace llm {
 
 // A tokenizer interface.
 class Tokenizer {
@@ -26,15 +27,16 @@ class Tokenizer {
   explicit Tokenizer() {}
   virtual ~Tokenizer() {}
 
-  virtual Error load(const std::string& tokenizer_path) = 0;
+  virtual ::executorch::runtime::Error load(
+      const std::string& tokenizer_path) = 0;
 
-  virtual Result<std::vector<uint64_t>>
+  virtual ::executorch::runtime::Result<std::vector<uint64_t>>
   encode(const std::string& input, int8_t bos, int8_t eos) const = 0;
 
-  Error decode_verify(uint64_t token) const {
+  ::executorch::runtime::Error decode_verify(uint64_t token) const {
     if (!initialized_) {
       ET_LOG(Error, "Tokenizer not initialized");
-      return Error::NotSupported;
+      return ::executorch::runtime::Error::NotSupported;
     }
     if (token >= vocab_size_) {
       ET_LOG(
@@ -42,13 +44,14 @@ class Tokenizer {
           "token  %" PRIu64 " is out side of vacab range %d",
           token,
           vocab_size_);
-      return Error::NotSupported;
+      return ::executorch::runtime::Error::NotSupported;
     }
-    return Error::Ok;
+    return ::executorch::runtime::Error::Ok;
   }
 
-  virtual Result<std::string> decode(uint64_t prev_token, uint64_t token)
-      const = 0;
+  virtual ::executorch::runtime::Result<std::string> decode(
+      uint64_t prev_token,
+      uint64_t token) const = 0;
 
   // getters
   int32_t vocab_size() const {
@@ -70,5 +73,14 @@ class Tokenizer {
   uint64_t eos_tok_ = 0;
 };
 
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::Tokenizer;
 } // namespace executor
 } // namespace torch
diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp
index 83cec280b8..7c98ee4aa0 100644
--- a/extension/pybindings/pybindings.cpp
+++ b/extension/pybindings/pybindings.cpp
@@ -17,6 +17,9 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include <executorch/devtools/bundled_program/bundled_program.h>
+#include <executorch/devtools/bundled_program/schema/bundled_program_schema_generated.h>
+#include <executorch/devtools/etdump/etdump_flatcc.h>
 #include <executorch/extension/data_loader/buffer_data_loader.h>
 #include <executorch/extension/data_loader/mmap_data_loader.h>
 #include <executorch/extension/memory_allocator/malloc_memory_allocator.h>
@@ -28,9 +31,6 @@
 #include <executorch/runtime/platform/platform.h>
 #include <executorch/runtime/platform/profiler.h>
 #include <executorch/runtime/platform/runtime.h>
-#include <executorch/sdk/bundled_program/bundled_program.h>
-#include <executorch/sdk/bundled_program/schema/bundled_program_schema_generated.h>
-#include <executorch/sdk/etdump/etdump_flatcc.h>
 #include <executorch/util/read_file.h>
 
 #include <ATen/Functions.h>
diff --git a/extension/training/test/training_loop_test.cpp b/extension/training/test/training_loop_test.cpp
index 28931fbfc0..8e62663c9f 100644
--- a/extension/training/test/training_loop_test.cpp
+++ b/extension/training/test/training_loop_test.cpp
@@ -23,7 +23,7 @@
 // @lint-ignore-every CLANGTIDY facebook-hte-CArray
 
 using namespace ::testing;
-using namespace torch::executor::training::optimizer;
+using namespace executorch::extension::training::optimizer;
 using namespace torch::executor::testing;
 using exec_aten::ScalarType;
 using exec_aten::Tensor;
diff --git a/pytest.ini b/pytest.ini
index 5ed1780e61..7298773255 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -13,8 +13,7 @@ addopts =
     --ignore-glob=backends/arm/**/*
     # explicitly list out tests that are running successfully in oss
     examples/models/test
-    # sdk/
-    sdk/
+    devtools/
     # examples
     examples/models/llama2/tests
     # examples/models/llava/test TODO: enable this
diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl
index 401581421d..5ba989ef86 100644
--- a/runtime/executor/test/targets.bzl
+++ b/runtime/executor/test/targets.bzl
@@ -19,7 +19,7 @@ def define_common_targets(is_fbcode = False):
                 "//executorch/exir/backend/test/...",
                 "//executorch/runtime/backend/...",
                 "//executorch/extension/pybindings/...",
-                "//executorch/sdk/fb/runners/...",
+                "//executorch/devtools/fb/runners/...",
                 "//executorch/test/...",
                 "//executorch/examples/...",
             ],
@@ -43,7 +43,7 @@ def define_common_targets(is_fbcode = False):
                 "//executorch/exir/backend/test/...",
                 "//executorch/runtime/backend/...",
                 "//executorch/extension/pybindings/...",
-                "//executorch/sdk/fb/runners/...",
+                "//executorch/devtools/fb/runners/...",
                 "//executorch/test/...",
                 "//executorch/examples/...",
             ],
diff --git a/schema/targets.bzl b/schema/targets.bzl
index 2c797baa16..40c6d8d5c8 100644
--- a/schema/targets.bzl
+++ b/schema/targets.bzl
@@ -57,7 +57,7 @@ def define_common_targets():
         name = INPUT_SCALAR_TYPE,
         visibility = [
             "//executorch/exir/_serialize/...",
-            "//executorch/sdk/etdump/...",
+            "//executorch/devtools/etdump/...",
         ],
     )
 
diff --git a/sdk/inspector/tests/TARGETS b/sdk/inspector/tests/TARGETS
deleted file mode 100644
index 374d2ea753..0000000000
--- a/sdk/inspector/tests/TARGETS
+++ /dev/null
@@ -1,40 +0,0 @@
-load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
-
-oncall("executorch")
-
-python_unittest(
-    name = "inspector_test",
-    srcs = ["inspector_test.py"],
-    deps = [
-        "//executorch/exir:lib",
-        "//executorch/sdk:lib",
-        "//executorch/sdk/debug_format:et_schema",
-        "//executorch/sdk/etdump:schema_flatcc",
-        "//executorch/sdk/etrecord/tests:etrecord_test_library",
-        "//executorch/sdk/inspector:inspector",
-        "//executorch/sdk/inspector:lib",
-    ],
-)
-
-python_unittest(
-    name = "event_blocks_test",
-    srcs = ["event_blocks_test.py"],
-    deps = [
-        "//executorch/sdk/etdump:schema_flatcc",
-        "//executorch/sdk/inspector:inspector",
-        "//executorch/sdk/inspector:lib",
-    ],
-)
-
-python_unittest(
-    name = "inspector_utils_test",
-    srcs = ["inspector_utils_test.py"],
-    deps = [
-        "//executorch/sdk:lib",
-        "//executorch/sdk/debug_format:base_schema",
-        "//executorch/sdk/debug_format:et_schema",
-        "//executorch/sdk/etdump:schema_flatcc",
-        "//executorch/sdk/etrecord/tests:etrecord_test_library",
-        "//executorch/sdk/inspector:inspector_utils",
-    ],
-)
diff --git a/setup.py b/setup.py
index 58a9973c9f..75b3ece526 100644
--- a/setup.py
+++ b/setup.py
@@ -360,12 +360,12 @@ def run(self):
             ("schema/scalar_type.fbs", "exir/_serialize/scalar_type.fbs"),
             ("schema/program.fbs", "exir/_serialize/program.fbs"),
             (
-                "sdk/bundled_program/schema/bundled_program_schema.fbs",
-                "sdk/bundled_program/serialize/bundled_program_schema.fbs",
+                "devtools/bundled_program/schema/bundled_program_schema.fbs",
+                "devtools/bundled_program/serialize/bundled_program_schema.fbs",
             ),
             (
-                "sdk/bundled_program/schema/scalar_type.fbs",
-                "sdk/bundled_program/serialize/scalar_type.fbs",
+                "devtools/bundled_program/schema/scalar_type.fbs",
+                "devtools/bundled_program/serialize/scalar_type.fbs",
             ),
         ]
         for src, dst in src_to_dst:
@@ -606,8 +606,8 @@ def get_ext_modules() -> List[Extension]:
         "executorch/extension": "extension",
         "executorch/kernels/quantized": "kernels/quantized",
         "executorch/schema": "schema",
-        "executorch/sdk": "sdk",
-        "executorch/sdk/bundled_program": "sdk/bundled_program",
+        "executorch/devtools": "devtools",
+        "executorch/devtools/bundled_program": "devtools/bundled_program",
         "executorch/util": "util",
         # Note: This will install a top-level module called "serializer",
         # which seems too generic and might conflict with other pip packages.
diff --git a/shim/xplat/executorch/extension/pybindings/pybindings.bzl b/shim/xplat/executorch/extension/pybindings/pybindings.bzl
index f62c567ba4..813b420dba 100644
--- a/shim/xplat/executorch/extension/pybindings/pybindings.bzl
+++ b/shim/xplat/executorch/extension/pybindings/pybindings.bzl
@@ -10,29 +10,29 @@ MODELS_ATEN_OPS_LEAN_MODE_GENERATED_LIB = [
 PORTABLE_MODULE_DEPS = [
     "//executorch/runtime/kernel:operator_registry",
     "//executorch/runtime/executor:program",
-    "//executorch/sdk/bundled_program/schema:bundled_program_schema_fbs",
+    "//executorch/devtools/bundled_program/schema:bundled_program_schema_fbs",
     "//executorch/extension/aten_util:aten_bridge",
-    "//executorch/sdk/bundled_program:runtime",
+    "//executorch/devtools/bundled_program:runtime",
     "//executorch/extension/data_loader:buffer_data_loader",
     "//executorch/extension/data_loader:mmap_data_loader",
     "//executorch/extension/memory_allocator:malloc_memory_allocator",
     "//executorch/util:util",
     "//executorch/runtime/executor/test:test_backend_compiler_lib",
-    "//executorch/sdk/etdump:etdump_flatcc",
+    "//executorch/devtools/etdump:etdump_flatcc",
 ] + get_all_cpu_backend_targets()
 
 ATEN_MODULE_DEPS = [
     "//executorch/runtime/kernel:operator_registry",
     "//executorch/runtime/executor:program_aten",
     "//executorch/runtime/core/exec_aten:lib",
-    "//executorch/sdk/bundled_program/schema:bundled_program_schema_fbs",
+    "//executorch/devtools/bundled_program/schema:bundled_program_schema_fbs",
     "//executorch/extension/data_loader:buffer_data_loader",
     "//executorch/extension/data_loader:mmap_data_loader",
     "//executorch/extension/memory_allocator:malloc_memory_allocator",
     "//executorch/util:read_file",
-    "//executorch/sdk/bundled_program:runtime_aten",
+    "//executorch/devtools/bundled_program:runtime_aten",
     "//executorch/runtime/executor/test:test_backend_compiler_lib_aten",
-    "//executorch/sdk/etdump:etdump_flatcc",
+    "//executorch/devtools/etdump:etdump_flatcc",
 ]
 
 # Generated lib for all ATen ops with aten kernel used by models in model inventory
diff --git a/test/end2end/TARGETS b/test/end2end/TARGETS
index 8c0885e32e..fdac0e4887 100644
--- a/test/end2end/TARGETS
+++ b/test/end2end/TARGETS
@@ -42,6 +42,9 @@ python_unittest(
         ":exported_module",
         ":register_scratch_meta_fns",
         "//caffe2:torch",
+        "//executorch/devtools:lib",
+        "//executorch/devtools/bundled_program:config",
+        "//executorch/devtools/bundled_program/serialize:lib",
         "//executorch/exir:dynamic_shape",
         "//executorch/exir:lib",
         "//executorch/exir:memory",
@@ -57,9 +60,6 @@ python_unittest(
         "//executorch/exir/tests:transformer",
         "//executorch/extension/pybindings:aten_lib",
         "//executorch/extension/pytree:pybindings",
-        "//executorch/sdk:lib",
-        "//executorch/sdk/bundled_program:config",
-        "//executorch/sdk/bundled_program/serialize:lib",
     ],
 )
 
@@ -73,6 +73,9 @@ python_unittest(
         ":exported_module",
         ":register_scratch_meta_fns",
         "//caffe2:torch",
+        "//executorch/devtools:lib",
+        "//executorch/devtools/bundled_program:config",
+        "//executorch/devtools/bundled_program/serialize:lib",
         "//executorch/exir:dynamic_shape",
         "//executorch/exir:lib",
         "//executorch/exir:memory",
@@ -88,8 +91,5 @@ python_unittest(
         "//executorch/exir/tests:transformer",
         "//executorch/extension/pybindings:portable_lib",
         "//executorch/extension/pytree:pybindings",
-        "//executorch/sdk:lib",
-        "//executorch/sdk/bundled_program:config",
-        "//executorch/sdk/bundled_program/serialize:lib",
     ],
 )
diff --git a/test/models/generate_linear_out_bundled_program.py b/test/models/generate_linear_out_bundled_program.py
index 9201e43adf..93fd1445ef 100644
--- a/test/models/generate_linear_out_bundled_program.py
+++ b/test/models/generate_linear_out_bundled_program.py
@@ -17,15 +17,15 @@
 from typing import List
 
 import torch
+from executorch.devtools import BundledProgram
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
+from executorch.devtools.bundled_program.serialize import (
+    serialize_from_bundled_program_to_flatbuffer,
+)
 from executorch.exir import ExecutorchBackendConfig, to_edge
 
 from executorch.exir.passes import MemoryPlanningPass, ToOutVarPass
 from executorch.exir.print_program import pretty_print
-from executorch.sdk import BundledProgram
-from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite
-from executorch.sdk.bundled_program.serialize import (
-    serialize_from_bundled_program_to_flatbuffer,
-)
 
 from executorch.test.models.linear_model import LinearModel
 from torch.export import export
diff --git a/test/models/targets.bzl b/test/models/targets.bzl
index ad907304ed..aea47c9e03 100644
--- a/test/models/targets.bzl
+++ b/test/models/targets.bzl
@@ -22,9 +22,9 @@ def define_common_targets():
         deps = [
             ":linear_model",
             "//caffe2:torch",
-            "//executorch/sdk/bundled_program:config",
-            "//executorch/sdk:lib",
-            "//executorch/sdk/bundled_program/serialize:lib",
+            "//executorch/devtools/bundled_program:config",
+            "//executorch/devtools:lib",
+            "//executorch/devtools/bundled_program/serialize:lib",
             "//executorch/exir:lib",
             "//executorch/exir/_serialize:lib",
         ],
diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh
index 2d2f816209..3f17a9ead6 100755
--- a/test/run_oss_cpp_tests.sh
+++ b/test/run_oss_cpp_tests.sh
@@ -121,7 +121,7 @@ probe_tests() {
     kernels
     runtime
     schema
-    sdk
+    devtools
     test
   )