From 0601dd618099af9956e56a236c7ed3ac098fc402 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Sat, 7 Sep 2024 11:35:10 +0000 Subject: [PATCH] 2024-09-07 nightly release (1cc8503056eab95eaf2f753c5a1bf237102a26ba) --- .ci/scripts/build-qnn-sdk.sh | 1 + .ci/scripts/build_llama_android.sh | 3 +- .ci/scripts/test_llama.sh | 3 +- .ci/scripts/test_llava.sh | 145 +++-- .github/workflows/android-perf.yml | 4 +- .github/workflows/android.yml | 2 + .github/workflows/apple-perf.yml | 1 + .../workflows/upload-android-test-specs.yml | 2 +- backends/qualcomm/scripts/build.sh | 2 + backends/vulkan/docs/android_demo.md | 3 +- .../vulkan/runtime/api/containers/Tensor.cpp | 129 ++++- .../vulkan/runtime/api/containers/Tensor.h | 65 ++- backends/vulkan/runtime/graph/ComputeGraph.h | 4 + .../runtime/graph/ops/glsl/image_to_nchw.glsl | 9 +- .../runtime/graph/ops/glsl/indexing_utils.h | 93 +++ .../ops/glsl/int8_image_to_nchw_noint8.glsl | 9 +- .../runtime/graph/ops/glsl/nchw_to_image.glsl | 9 +- .../ops/glsl/nchw_to_int8_image_noint8.glsl | 15 +- .../runtime/graph/ops/impl/Convolution.cpp | 2 +- .../vulkan/runtime/graph/ops/impl/Staging.cpp | 8 +- .../runtime/graph/ops/impl/utils/DimUtils.h | 3 +- backends/vulkan/test/utils/test_utils.cpp | 7 +- .../vulkan/test/vulkan_compute_api_test.cpp | 25 +- backends/xnnpack/README.md | 3 +- build/build_android_llm_demo.sh | 5 +- build/build_apple_frameworks.sh | 1 + ...d-run-qualcomm-ai-engine-direct-backend.md | 2 + docs/source/llm/getting-started.md | 14 +- .../tutorial-xnnpack-delegate-lowering.md | 3 +- .../android/ExecuTorchDemo/README.md | 2 + .../demo-apps/android/ExecuTorchDemo/setup.sh | 1 + .../android-llm-device-farm-test-spec.yml | 22 + .../LlmBenchmarkRunner.java | 22 +- .../executorchllamademo/MainActivity.java | 16 +- .../android/LlamaDemo/setup-with-qnn.sh | 1 + examples/demo-apps/android/LlamaDemo/setup.sh | 1 + examples/llm_manual/CMakeLists.txt | 2 + examples/llm_manual/main.cpp | 7 +- examples/llm_manual/managed_tensor.h | 44 -- 
.../cross_attention/cross_attention_mask.cpp | 12 +- .../cross_attention/cross_attention_mask.h | 8 +- .../cross_attention_mask_test.cpp | 30 +- .../flamingo/cross_attention/targets.bzl | 2 +- examples/models/llama2/README.md | 6 +- examples/models/llama2/export_llama_lib.py | 4 +- examples/models/llama2/runner/CMakeLists.txt | 4 +- examples/models/llama2/runner/runner.cpp | 25 +- examples/models/llama2/runner/runner.h | 3 +- examples/models/llama2/runner/targets.bzl | 2 +- examples/models/llava/CMakeLists.txt | 18 +- examples/models/llava/README.md | 3 +- examples/models/llava/export_llava.py | 9 +- examples/models/llava/install_requirements.sh | 2 +- examples/models/llava/main.cpp | 15 + examples/models/llava/runner/CMakeLists.txt | 4 +- .../llava/runner/llava_image_prefiller.h | 14 +- examples/models/llava/runner/llava_runner.cpp | 96 +++- examples/models/llava/runner/llava_runner.h | 42 ++ .../llava/runner/llava_text_decoder_runner.h | 11 +- examples/models/llava/runner/targets.bzl | 2 +- examples/models/phi-3-mini/CMakeLists.txt | 3 +- examples/models/phi-3-mini/README.md | 3 +- examples/models/phi-3-mini/runner.cpp | 14 +- examples/models/test/test_export.py | 2 +- .../oss_scripts/llama2/CMakeLists.txt | 1 + .../oss_scripts/llama2/qnn_llama_runner.cpp | 1 - .../oss_scripts/llama2/runner/runner.cpp | 77 ++- .../oss_scripts/llama2/runner/runner.h | 14 +- .../qaihub_scripts/llama/CMakeLists.txt | 2 + .../llama/llama2/qaihub_llama2_7b_runner.cpp | 1 - .../llama/llama3/qaihub_llama3_8b_runner.cpp | 1 - .../qaihub_scripts/llama/runner/runner.cpp | 1 - .../qaihub_scripts/llama/runner/runner.h | 1 - .../stable_diffusion/CMakeLists.txt | 1 + .../stable_diffusion/runner/runner.cpp | 42 +- examples/xnnpack/README.md | 6 +- exir/_serialize/_program.py | 31 +- exir/_serialize/test/test_program.py | 27 + exir/capture/_config.py | 6 - exir/program/_program.py | 5 - extension/android/CMakeLists.txt | 1 + extension/android/jni/BUCK | 4 +- extension/android/jni/jni_layer.cpp | 
23 +- extension/android/jni/jni_layer_llama.cpp | 84 ++- .../org/pytorch/executorch/LlamaModule.java | 87 ++- .../apple/Benchmark/App/App.entitlements | 12 + extension/apple/Benchmark/App/App.swift | 16 + .../Benchmark.xcodeproj/project.pbxproj | 535 ++++++++++++++++++ .../xcshareddata/xcschemes/Benchmark.xcscheme | 107 ++++ extension/apple/Benchmark/Tests/Tests.mm | 105 ++++ .../apple/Benchmark/Tests/Tests.xcconfig | 26 + .../apple/Benchmark/Tests/Tests.xctestplan | 28 + extension/aten_util/test/targets.bzl | 1 - extension/llm/export/builder.py | 25 +- extension/llm/export/partitioner_lib.py | 2 +- extension/llm/runner/CMakeLists.txt | 4 +- extension/llm/runner/multimodal_runner.h | 45 +- extension/llm/runner/targets.bzl | 6 +- extension/llm/runner/text_decoder_runner.cpp | 14 +- extension/llm/runner/text_decoder_runner.h | 6 +- extension/llm/runner/text_prefiller.cpp | 35 +- extension/llm/runner/text_prefiller.h | 2 +- extension/llm/runner/text_token_generator.h | 15 +- extension/llm/runner/util.h | 25 + extension/llm/tokenizer/tiktoken.cpp | 10 + extension/module/test/module_test.cpp | 105 ++-- extension/module/test/resources/README.md | 4 + extension/module/test/resources/add.pte | Bin 0 -> 728 bytes extension/module/test/resources/model.pte | Bin 1600 -> 0 bytes extension/runner_util/managed_tensor.h | 107 ---- extension/runner_util/targets.bzl | 15 - extension/runner_util/test/CMakeLists.txt | 2 +- .../runner_util/test/managed_tensor_test.cpp | 86 --- extension/runner_util/test/targets.bzl | 12 - kernels/README.md | 2 +- kernels/optimized/cpu/binary_ops.h | 3 +- kernels/optimized/cpu/op_mul.cpp | 17 +- kernels/portable/cpu/op_masked_fill.cpp | 3 + kernels/portable/cpu/op_max.cpp | 18 + kernels/portable/cpu/op_maximum.cpp | 3 + kernels/portable/cpu/op_mean.cpp | 5 + kernels/portable/cpu/op_min.cpp | 18 + kernels/portable/cpu/op_minimum.cpp | 3 + kernels/portable/cpu/op_mm.cpp | 5 + kernels/portable/cpu/op_mul.cpp | 24 +- 
kernels/portable/cpu/op_native_batch_norm.cpp | 22 + kernels/portable/cpu/op_native_group_norm.cpp | 25 + kernels/portable/cpu/op_native_layer_norm.cpp | 27 + kernels/portable/cpu/op_ne.cpp | 6 + kernels/portable/cpu/op_neg.cpp | 3 + kernels/portable/cpu/op_pdist_forward.cpp | 5 + kernels/portable/cpu/op_permute_copy.cpp | 3 + kernels/portable/cpu/op_pixel_shuffle.cpp | 4 + kernels/portable/cpu/op_to_copy.cpp | 9 +- kernels/portable/cpu/scalar_utils.h | 18 +- kernels/test/op_mul_test.cpp | 158 ++++-- kernels/test/op_to_copy_test.cpp | 27 +- runtime/core/exec_aten/exec_aten.h | 2 + .../exec_aten/testing_util/tensor_util.cpp | 22 +- .../core/exec_aten/util/genScalarTypeTable.py | 41 +- .../core/exec_aten/util/scalar_type_util.h | 206 +++++-- runtime/core/exec_aten/util/tensor_util.h | 9 + .../util/test/scalar_type_util_test.cpp | 54 +- runtime/core/portable_type/bfloat16.h | 311 ++++++++++ runtime/core/portable_type/bfloat16_math.h | 290 ++++++++++ runtime/core/portable_type/targets.bzl | 1 + .../core/portable_type/test/CMakeLists.txt | 2 +- .../core/portable_type/test/bfloat16_test.cpp | 191 +++++++ runtime/core/portable_type/test/targets.bzl | 8 + runtime/executor/test/method_test.cpp | 8 +- runtime/executor/test/program_test.cpp | 37 +- runtime/executor/test/targets.bzl | 5 +- schema/program.fbs | 1 + test/end2end/exported_module.py | 2 - .../ModuleLinear-no-constant-segment.pte | Bin 0 -> 1040 bytes test/models/deprecated/README.md | 14 + test/models/deprecated/TARGETS | 12 + test/models/export_program.py | 21 +- test/run_oss_cpp_tests.sh | 8 +- test/utils/OSSTestConfig.json | 3 +- 160 files changed, 3501 insertions(+), 945 deletions(-) delete mode 100644 examples/llm_manual/managed_tensor.h create mode 100644 extension/apple/Benchmark/App/App.entitlements create mode 100644 extension/apple/Benchmark/App/App.swift create mode 100644 extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj create mode 100644 
extension/apple/Benchmark/Benchmark.xcodeproj/xcshareddata/xcschemes/Benchmark.xcscheme create mode 100644 extension/apple/Benchmark/Tests/Tests.mm create mode 100644 extension/apple/Benchmark/Tests/Tests.xcconfig create mode 100644 extension/apple/Benchmark/Tests/Tests.xctestplan create mode 100644 extension/module/test/resources/README.md create mode 100644 extension/module/test/resources/add.pte delete mode 100644 extension/module/test/resources/model.pte delete mode 100644 extension/runner_util/managed_tensor.h delete mode 100644 extension/runner_util/test/managed_tensor_test.cpp create mode 100644 runtime/core/portable_type/bfloat16_math.h create mode 100644 runtime/core/portable_type/test/bfloat16_test.cpp create mode 100644 test/models/deprecated/ModuleLinear-no-constant-segment.pte create mode 100644 test/models/deprecated/README.md create mode 100644 test/models/deprecated/TARGETS diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh index ec3a8a39e3..c48ac2056a 100644 --- a/.ci/scripts/build-qnn-sdk.sh +++ b/.ci/scripts/build-qnn-sdk.sh @@ -29,6 +29,7 @@ set_up_aot() { -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh index 644fc4c2bb..7d3370ee56 100644 --- a/.ci/scripts/build_llama_android.sh +++ b/.ci/scripts/build_llama_android.sh @@ -22,8 +22,9 @@ install_executorch_and_backend_lib() { -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ 
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 4fa8c94905..290ece7b8e 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -107,8 +107,9 @@ cmake_install_executorch_libraries() { retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 3543ea3fa5..7dc6d15e40 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -9,47 +9,97 @@ set -exu # shellcheck source=/dev/null BUILD_TYPE=${1:-Debug} +TARGET_OS=${2:-Native} +BUILD_DIR=${3:-cmake-out} -echo "Building with BUILD_TYPE: $BUILD_TYPE" +echo "Building with BUILD_TYPE: $BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR" if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 + PYTHON_EXECUTABLE=python3 fi +TARGET_OS_lower="$(echo "${TARGET_OS}" | awk '{print tolower($0)}')" +if [[ "${TARGET_OS_lower}" == "android" ]]; then + if [[ -z "${ANDROID_NDK}" ]]; then + echo "Set ANDROID_NDK environment variable to build for Android." 
+ exit 1 + fi +fi + +# Number of processes for a parallel build +NPROC=8 +if hash nproc &> /dev/null; then NPROC=$(nproc); fi + +EXECUTORCH_COMMON_CMAKE_ARGS=" \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON" + cmake_install_executorch_libraries() { - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \ - -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ - -Bcmake-out . - - - cmake --build cmake-out -j9 --target install --config ${BUILD_TYPE} + cmake \ + ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + -B${BUILD_DIR} . + + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} +} + +cmake_install_executorch_libraries_for_android() { + cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + -B${BUILD_DIR} . 
+ + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} } + +LLAVA_COMMON_CMAKE_ARGS=" \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON" + cmake_build_llava_runner() { dir=examples/models/llava python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_PREFIX_PATH="$python_lib" \ - -Bcmake-out/${dir} \ + cmake \ + ${LLAVA_COMMON_CMAKE_ARGS} \ + -DCMAKE_PREFIX_PATH="$python_lib" \ + -B${BUILD_DIR}/${dir} \ ${dir} + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} +} + - cmake --build cmake-out/${dir} -j9 --config ${BUILD_TYPE} +cmake_build_llava_runner_for_android() { + dir=examples/models/llava + python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') + + cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + ${LLAVA_COMMON_CMAKE_ARGS} \ + -DCMAKE_PREFIX_PATH="$python_lib" \ + -DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON \ + -B${BUILD_DIR}/${dir} \ + ${dir} + + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} } # only export the one without custom op for now since it's @@ -61,7 +111,7 @@ export_llava() { # Download a new image with different size, to test if the model can handle different image sizes prepare_image_tensor() { echo "Downloading image" - curl -o basketball.jpg https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg + curl -o basketball.jpg 
https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg $PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util --image-path basketball.jpg --output-path image.pt } @@ -80,13 +130,24 @@ run_and_verify() { echo "tokenizer.bin is missing." exit 1 fi - RUNTIME_ARGS="--model_path=llava.pte \ - --tokenizer_path=tokenizer.bin \ - --image_path=image.pt \ - --prompt=ASSISTANT: \ - --temperature=0 \ - --seq_len=650" - cmake-out/examples/models/llava/llava_main ${RUNTIME_ARGS} > result.txt + + + + RUNTIME_ARGS="--model_path=llava.pte \ + --tokenizer_path=tokenizer.bin \ + --image_path=image.pt \ + --prompt=ASSISTANT: \ + --temperature=0 \ + --seq_len=650" + + if [[ "${TARGET_OS_lower}" == "android" ]]; then + echo "Transfer relevant files to the phone via ADB and run llava_main with following args," + echo "$ llava_main ${RUNTIME_ARGS} " + exit 0; + fi + + ${BUILD_DIR}/examples/models/llava/llava_main ${RUNTIME_ARGS} > result.txt + # verify result.txt RESULT=$(cat result.txt) # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. @@ -109,8 +170,20 @@ run_and_verify() { fi } -cmake_install_executorch_libraries -cmake_build_llava_runner +# Step1. Build stuff +if [[ "${TARGET_OS_lower}" == "android" ]]; then + cmake_install_executorch_libraries_for_android + cmake_build_llava_runner_for_android +elif [[ "${TARGET_OS_lower}" == "native" ]]; then + cmake_install_executorch_libraries + cmake_build_llava_runner +else + echo "Invalid TARGET_OS ($2): ${TARGET_OS}" +fi + +# Step2. Generate the PTE export_llava + +# Step3. 
Run prepare_image_tensor run_and_verify diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index c44de95533..11950623ea 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -230,9 +230,10 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh fi - + # TODO: This needs to be replaced with a generic loader .apk # Build LLM Demo for Android + export ANDROID_ABIS="arm64-v8a" bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} # Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat @@ -278,6 +279,7 @@ jobs: model: ${{ fromJson(needs.set-parameters.outputs.models) }} delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + fail-fast: false with: device-type: android runner: linux.2xlarge diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index 4c693a90e6..1ea7f398ce 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -5,6 +5,8 @@ on: branches: - main - release/* + tags: + - ciflow/android/* pull_request: paths: - .ci/docker/** diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 41e2868bfb..8da58653a8 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -290,6 +290,7 @@ jobs: model: ${{ fromJson(needs.set-parameters.outputs.models) }} delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + fail-fast: false with: device-type: ios # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS diff --git a/.github/workflows/upload-android-test-specs.yml b/.github/workflows/upload-android-test-specs.yml index 5a468da44f..04f7cf40d7 100644 --- 
a/.github/workflows/upload-android-test-specs.yml +++ b/.github/workflows/upload-android-test-specs.yml @@ -41,7 +41,7 @@ jobs: with: # Just use a small model here with a minimal amount of configuration to test the spec models: stories110M - devices: samsung_galaxy_s2x + devices: samsung_galaxy_s22 delegates: xnnpack test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/android-llm-device-farm-test-spec.yml diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index 61b363f1a7..5f77a74740 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -81,6 +81,7 @@ if [ "$BUILD_AARCH64" = true ]; then -DEXECUTORCH_BUILD_QNN=ON \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ @@ -124,6 +125,7 @@ if [ "$BUILD_X86_64" = true ]; then -DEXECUTORCH_BUILD_QNN=ON \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ -S $PRJ_ROOT \ diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index aaff7a7a72..8570859ed3 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -94,8 +94,9 @@ binary using the Android NDK toolchain. cmake . 
-DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=$ANDROID_ABI \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_VULKAN=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DPYTHON_EXECUTABLE=python \ diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 7b9d30ef65..6fe6746ec0 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -80,6 +80,42 @@ std::vector calculate_strides( return strides; } +/* + * Axis mapping is somewhat analogous to strides for texture backed tensors. + * + * The axis mapping is normalized to 4 dimensions, similar to the padded sizes. + * The first 3 values of the axis mapping indicate the (X,Y,Z) image texture + * axis that corresponds to the width, height, and channels dimension of the + * tensor. Thus the axis mapping can be considered to be in WHCN dimension + * order. + * + * The last value `axis_mapping.at(3)` indicates the WHCN index of the tensor + * dimension along which batches will be concatenated. This dimension can be + * referred to as the "inner dimension" To determine which image texture axis is + * used for the concatenation, a double lookup will need to be performed + * (axis_mapping.at(axis_mapping.at(3))). + * + * The reason for strucuring axis mapping this way is because for the batch dim, + * two things need to be easily derived: + * + * 1. The dim idx of the inner dimension, so that the size of the inner + * dimension can be easily determined. + * 2. The texture axis used to concatenate batches + * + * By storing the dim index of the inner dimension instead of the texture axis + * it maps to, both pieces of information are readily available. 
+ * + * The axis mapping allows for permuted views of texture-backed tensors. + */ +std::vector default_axis_mapping() { + // Currently, all compute shaders have an assumption that the channels dim is + // used to combine with the batch dim of a tensor. However, once dim mapping + // is integrated into the tensor indexing logic for each compute shader, we + // can be more flexible with mapping the batch dim to different texture axes + // in order to improve performance or memory footprint. + return {0, 1, 2, 2}; +} + bool dim_order_is_valid(const std::vector& dim_order) { int64_t sum = 0; for (size_t i = 0; i < dim_order.size(); ++i) { @@ -137,30 +173,44 @@ std::vector calculate_padded_sizes( utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, + const std::vector& axis_mapping, const utils::GPUMemoryLayout memory_layout) { VK_CHECK_COND(padded_sizes.size() == 4); + VK_CHECK_COND(axis_mapping.size() == 4); + + utils::uvec3 extents({1, 1, 1}); + // First three elements of axis_mapping indicate which (X,Y,Z) image axis the + // width, height, and channels dim of the tensor maps to. + for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) { + const int64_t axis = axis_mapping.at(whcn_dim); + const int64_t dim = padded_sizes.size() - 1 - whcn_dim; + extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); + } - uint32_t N = utils::safe_downcast(padded_sizes.at(0)); - uint32_t C = utils::safe_downcast(padded_sizes.at(1)); - uint32_t H = utils::safe_downcast(padded_sizes.at(2)); - uint32_t W = utils::safe_downcast(padded_sizes.at(3)); + // axis_mapping[3] indicates the WHCN index of the dimension used for batch + // concatenation. Thus a double lookup is required to determine the image axis + // used for batch concatenation. + const int64_t concatted_whcn_dim = axis_mapping.at(3); + const int64_t batch_axis = axis_mapping.at(concatted_whcn_dim); + // Multiply the extents of the batch axis by the batch size. 
+ extents[batch_axis] *= padded_sizes.at(0); switch (memory_layout) { case utils::kWidthPacked: - VK_CHECK_COND(W % 4 == 0); - W /= 4; + VK_CHECK_COND(extents[0] % 4 == 0); + extents[0] /= 4; break; case utils::kHeightPacked: - VK_CHECK_COND(H % 4 == 0); - H /= 4; + VK_CHECK_COND(extents[1] % 4 == 0); + extents[1] /= 4; break; case utils::kChannelsPacked: - VK_CHECK_COND(C % 4 == 0); - C /= 4; + VK_CHECK_COND(extents[2] % 4 == 0); + extents[2] /= 4; break; } - return {W, H, C * N}; + return extents; } // @@ -176,9 +226,10 @@ vTensor::vTensor( const bool allocate_memory) : dtype_(dtype), memory_layout_(memory_layout), - // Calculate tensor size metadata + // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)), + axis_mapping_(default_axis_mapping()), strides_(calculate_strides(sizes, dim_order_)), numel_(utils::multiply_integers(sizes_)), padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, @@ -189,12 +240,14 @@ vTensor::vTensor( sizes_uniform_(), strides_uniform_(), numel_uniform_(), + axis_mapping_uniform_(), texture_limits_uniform_(), // Construct Tensor storage storage_( context, storage_type, memory_layout_, + axis_mapping_, padded_sizes_, dtype_, allocate_memory) { @@ -222,6 +275,7 @@ vTensor::vTensor(const vTensor& other) // Copy tensor size metadata sizes_(other.sizes_.begin(), other.sizes_.end()), dim_order_(other.dim_order_.begin(), other.dim_order_.end()), + axis_mapping_(other.axis_mapping_.begin(), other.axis_mapping_.end()), strides_(other.strides_.begin(), other.strides_.end()), numel_(other.numel_), padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, @@ -234,6 +288,7 @@ vTensor::vTensor(const vTensor& other) sizes_uniform_(), strides_uniform_(), numel_uniform_(), + axis_mapping_uniform_(), texture_limits_uniform_(), // Copy Tensor storage storage_(other.storage_) {} @@ -248,6 +303,7 @@ vTensor::vTensor( // Copy tensor size metadata 
sizes_(sizes.begin(), sizes.end()), dim_order_(dim_order.begin(), dim_order.end()), + axis_mapping_(default_axis_mapping()), strides_(calculate_strides(sizes_, dim_order_)), numel_(utils::multiply_integers(sizes_)), padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, @@ -258,6 +314,7 @@ vTensor::vTensor( sizes_uniform_(), strides_uniform_(), numel_uniform_(), + axis_mapping_uniform_(), texture_limits_uniform_(), // Copy Tensor storage storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) { @@ -315,6 +372,14 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() { return vkapi::BufferBindInfo(strides_uniform_.buffer()); } +const vkapi::BufferBindInfo vTensor::axis_mapping_ubo() { + if (!axis_mapping_uniform_.buffer()) { + axis_mapping_uniform_ = + ParamsBuffer(storage_.context_, utils::make_ivec4(axis_mapping_)); + } + return vkapi::BufferBindInfo(axis_mapping_uniform_.buffer()); +} + const vkapi::BufferBindInfo vTensor::texture_limits_ubo() { if (!texture_limits_uniform_.buffer()) { texture_limits_uniform_ = ParamsBuffer(storage_.context_, texture_limits_); @@ -376,11 +441,7 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { } } -void vTensor::update_metadata( - const std::vector& new_sizes, - const std::vector& new_dim_order) { - sizes_ = new_sizes; - dim_order_ = new_dim_order; +void vTensor::update_metadata() { strides_ = calculate_strides(sizes_, dim_order_); // Only update the memory layout for buffer-backed tensors. Strides are // meaningless for texture-backed tensors and do not impact the memory layout. @@ -396,7 +457,7 @@ void vTensor::update_metadata( // Calculate the extents of the image texture that would have been required // for a tensor of the new sizes. utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, memory_layout_); + calculate_image_extents(padded_sizes_, axis_mapping_, memory_layout_); // Update the texture limits to reflect the new virtual extents. 
texture_limits_.limits = utils::ivec3{ @@ -407,15 +468,18 @@ void vTensor::update_metadata( if (sizes_uniform_.buffer()) { sizes_uniform_.update(utils::make_whcn_ivec4(sizes_)); } - if (texture_limits_uniform_.buffer()) { - texture_limits_uniform_.update(texture_limits_); - } if (strides_uniform_.buffer()) { strides_uniform_.update(utils::make_whcn_ivec4(unsqueezed_strides_)); } if (numel_uniform_.buffer()) { numel_uniform_.update(numel_); } + if (axis_mapping_uniform_.buffer()) { + axis_mapping_uniform_.update(utils::make_ivec4(axis_mapping_)); + } + if (texture_limits_uniform_.buffer()) { + texture_limits_uniform_.update(texture_limits_); + } } void vTensor::check_sizes(const std::vector& sizes) const { @@ -423,7 +487,7 @@ void vTensor::check_sizes(const std::vector& sizes) const { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, memory_layout_); + calculate_image_extents(padded_sizes_, axis_mapping_, memory_layout_); bool valid_resize = virtual_extents[0] <= image_extents()[0]; valid_resize = valid_resize && virtual_extents[1] <= image_extents()[1]; @@ -454,7 +518,9 @@ void vTensor::virtual_reconfigure( VK_CHECK_COND(dim_order_is_valid(new_dim_order)); check_sizes(new_sizes); - update_metadata(new_sizes, new_dim_order); + sizes_ = new_sizes; + dim_order_ = new_dim_order; + update_metadata(); } void vTensor::virtual_resize(const std::vector& new_sizes) { @@ -463,13 +529,16 @@ void vTensor::virtual_resize(const std::vector& new_sizes) { "new sizes cannot modify the dimensionality of the tensor "); check_sizes(new_sizes); - update_metadata(new_sizes, dim_order_); + sizes_ = new_sizes; + update_metadata(); } void vTensor::reallocate(const std::vector& new_sizes) { - update_metadata(new_sizes, dim_order_); + sizes_ = new_sizes; + update_metadata(); storage_.discard_and_reallocate( calculate_padded_sizes(new_sizes, memory_layout_), + 
axis_mapping_, memory_layout_, dtype_); } @@ -547,12 +616,16 @@ vTensorStorage::vTensorStorage( Context* const context, const utils::StorageType storage_type, const utils::GPUMemoryLayout gpu_memory_layout, + const std::vector& axis_mapping, const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory) : context_(context), storage_type_{storage_type}, - image_extents_(calculate_image_extents(padded_sizes, gpu_memory_layout)), + image_extents_(calculate_image_extents( + padded_sizes, + axis_mapping, + gpu_memory_layout)), buffer_length_{utils::multiply_integers(padded_sizes)}, buffer_offset_{0}, image_(allocate_image( @@ -665,6 +738,7 @@ bool vTensorStorage::is_copy_of(const vTensorStorage& other) const { void vTensorStorage::discard_and_reallocate( const std::vector& padded_sizes, + const std::vector& axis_mapping, const utils::GPUMemoryLayout gpu_memory_layout, const vkapi::ScalarType dtype) { const bool image_owns_memory = image_.owns_memory(); @@ -672,7 +746,8 @@ void vTensorStorage::discard_and_reallocate( flush(); - image_extents_ = calculate_image_extents(padded_sizes, gpu_memory_layout); + image_extents_ = + calculate_image_extents(padded_sizes, axis_mapping, gpu_memory_layout); image_ = allocate_image( context_, image_extents_, diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index d37628e4ad..70f363796f 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -60,11 +60,11 @@ std::vector calculate_padded_sizes( const utils::GPUMemoryLayout memory_layout); /* - * Given the padded sizes of a tensor and the GPU memory layout, calculate the - * 3D image extents required to store the tensor data as an image texture. + * Calculate the image extents required of a texture backed tensor. 
*/ utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, + const std::vector& axis_mapping, const utils::GPUMemoryLayout memory_layout); struct LastAccess { @@ -90,7 +90,8 @@ class vTensorStorage final { Context* context, const utils::StorageType storage_type, const utils::GPUMemoryLayout gpu_memory_layout, - const std::vector& sizes, + const std::vector& axis_mapping, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); @@ -159,6 +160,7 @@ class vTensorStorage final { void discard_and_reallocate( const std::vector& padded_sizes, + const std::vector& axis_mapping, const utils::GPUMemoryLayout gpu_memory_layout, const vkapi::ScalarType dtype); }; @@ -218,21 +220,58 @@ class vTensor final { vTensor& operator=(vTensor&& other) = default; private: + /* + * "Core" tensor metadata. They are the minimum amount of information required + * to construct a tensor. + */ + + // Whether the tensor has elements of type float, int, etc. vkapi::ScalarType dtype_; + // Describes which dimension is "tightly packed". For texture backed tensors, + // this describes which dimension is packed along a texel. For buffer backed + // tensors, this describes which dimension has a stride of 1 (i.e. is last in + // the dim order). utils::GPUMemoryLayout memory_layout_; - // sizes of the tensor in NCHW dimension order std::vector sizes_; + + /* + * "Layout" metadata. These describe with further detail how tensor data is + * laid out in memory. However, they are considered secondary to the "core" + * metadata members above because defaults can be assumed based on a given + * memory layout. When permuting the tensor without performing a copy, these + * metadata members are the ones that will be changed. All other metadata is + * derived from a combination of sizes, memory layout, and the below members. + */ + // dim order of the tensor; dimension indices are in NCHW dimension order // i.e. 
0 is N, 1 is C, 2 is H, 3 is W for a 4D tensor. The dims with larger
   // strides precede the dims with smaller strides in the dim order. The last
   // dim is always the fastest moving dim with a stride of 1.
   std::vector dim_order_;
 
+  // Describes which axis of an image texture each dimension of the tensor maps
+  // to. The axis mapping allows texture based tensors to be permuted and
+  // transposed without modifying the underlying texture storage. For a more in
+  // depth explanation of axis mapping, see the `default_axis_mapping()`
+  // function.
+  std::vector axis_mapping_;
+
+  /*
+   * The below can be considered "layout" metadata as well, but are derived
+   * from the above data members.
+   */
+
   // strides of the tensor in NCHW dimension order
   std::vector strides_;
   // Contains the number of elements in the tensor according to the canonical
   // sizes.
   size_t numel_;
+
+  /*
+   * The below metadata members are derived from the above, and are typically
+   * used to, e.g., pass tensor metadata to compute shaders.
+   */
+
   // padded sizes of the tensor in NCHW dimension order. See the
   // calculate_padded_sizes() function for more context. Note that padded sizes
   // are only used for texture storage, and not for buffer storage.
@@ -260,6 +299,7 @@ class vTensor final {
   ParamsBuffer sizes_uniform_;
   ParamsBuffer strides_uniform_;
   ParamsBuffer numel_uniform_;
+  ParamsBuffer axis_mapping_uniform_;
   ParamsBuffer texture_limits_uniform_;
 
   vTensorStorage storage_;
@@ -365,14 +405,18 @@ class vTensor final {
    */
   const vkapi::BufferBindInfo strides_ubo();
 
+  /*
+   * Returns a GPU buffer containing the texture axis mapping for each dimension
+   * of the tensor, in WHCN dimension order.
+   */
+  const vkapi::BufferBindInfo axis_mapping_ubo();
+
   /*
    * Returns a GPU buffer containing the virtual image extents of the tensor.
    * Since a tensor can be resized with the virtual_resize() function, this
    * GPU buffer contains the image extents of the tensor calculated using the
    * virtual_resize() function. 
This allows shaders to exit early if they are * working outside the limits of the texture. - * - * This buffer should only be used to */ const vkapi::BufferBindInfo texture_limits_ubo(); @@ -423,13 +467,10 @@ class vTensor final { private: /* - * Given new sizes and new strides of the dim order, update the sizes and dim - * order metadata of the vTensor. New strides are computed using the new sizes - * and new dim order. + * Assuming sizes, dim order, or axis mapping was modified, recompute all + * derived metadata and update metadata UBO with new values. */ - void update_metadata( - const std::vector& new_sizes, - const std::vector& new_dim_order); + void update_metadata(); /* * Check that tensor sizes are valid given the current storage resource's diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 210b03c4ca..afdc8290cd 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -327,6 +327,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().numel_ubo(); } + inline vkapi::BufferBindInfo axis_mapping_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().axis_mapping_ubo(); + } + inline vkapi::BufferBindInfo texture_limits_ubo(const ValueRef idx) { return values_.at(idx).toTensor().texture_limits_ubo(); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl index b51d5a3f6e..8f113bd2cc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_buffer(0, "w", "nchw_out", DTYPE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec4", "sizes")} +${layout_declare_buffer(B, "w", "nchw_out", DTYPE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} 
+${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_mapping")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -51,7 +52,7 @@ void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); + const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim); if (any(greaterThanEqual(tensor_idx, sizes))) { return; diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 21eadff0b3..9dc06bd855 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -183,6 +183,42 @@ ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) { return tensor_idx; } +/* + * Derive (w,h,c,n) tensor indices from (x,y,z) texture position using axis + * mapping. + */ +ivec4 to_tensor_idx( + ivec3 pos, + ivec4 sizes, + const ivec4 axis_mapping, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + // Packed dim contains 4 elements per texel, so moving 1 unit traverses 4 + // elements in the tensor. + pos[axis_mapping[packed_dim]] *= 4; + + ivec4 tensor_idx; + for (int dim = 0; dim < 3; ++dim) { + tensor_idx[dim] = pos[axis_mapping[dim]]; + } + + // Early return if batch is 1. Batch index will be 0. + if (sizes.w == 1) { + tensor_idx.w = 0; + return tensor_idx; + } + + // Else, adjust the dim that's concatenated with batch. Note that the axis + // mapping for the batch dim indicates WHCN dim index of the dim that it is + // concatenated with, not a texture axis. 
+ tensor_idx.w = tensor_idx[axis_mapping[3]] / sizes[axis_mapping[3]]; + tensor_idx[axis_mapping[3]] %= sizes[axis_mapping[3]]; + + return tensor_idx; +} + /* * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim * is packed along a texel @@ -199,6 +235,34 @@ ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) { return pos; } +/* + * Derive (x,y,z) texture position from (w,h,c,n) tensor indices using axis + * mapping. + */ +ivec3 to_texture_pos( + const ivec4 idx, + ivec4 sizes, + const ivec4 axis_mapping, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec3 pos; + for (int dim = 0; dim < 3; ++dim) { + pos[axis_mapping[dim]] = idx[dim]; + } + + // Adjust batch dim if needed + if (sizes.w > 1) { + pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes.w; + } + + // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4 + // tensor elements in that dim. + pos[axis_mapping[packed_dim]] /= 4; + return pos; +} + /* * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which dim * is packed along a texel @@ -218,6 +282,35 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) { return pos; } +/* + * Derive (x,y,z,i) texel element position from the (w,h,c,n) tensor index using + * the axis mapping. + */ +ivec4 to_texture_elem_pos( + const ivec4 idx, + ivec4 sizes, + const ivec4 axis_mapping, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec4 pos; + for (int dim = 0; dim < 3; ++dim) { + pos[axis_mapping[dim]] = idx[dim]; + } + + // Adjust batch dim if needed + if (sizes.w > 1) { + pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes.w; + } + + // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4 + // tensor elements in that dim. 
+ pos[axis_mapping[packed_dim]] /= 4; + pos.w = idx[packed_dim] % 4; + return pos; +} + // // Texel Access and Storage // diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl index b1e3a0abdf..3ef984bfc9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl @@ -16,10 +16,11 @@ layout(std430) buffer; #extension GL_EXT_control_flow_attributes : require -${layout_declare_buffer(0, "w", "nchw_out", "int")} -${layout_declare_tensor(1, "r", "t_in", "int8", "texture3d")} -${layout_declare_ubo(2, "ivec4", "tensor_sizes")} -${layout_declare_ubo(3, "int", "out_numel")} +${layout_declare_buffer(B, "w", "nchw_out", "int")} +${layout_declare_tensor(B, "r", "t_in", "int8", "texture3d")} +${layout_declare_ubo(B, "ivec4", "tensor_sizes")} +${layout_declare_ubo(B, "ivec4", "axis_mapping")} +${layout_declare_ubo(B, "int", "out_numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl index abe9390480..04b6a26cc4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_buffer(1, "r", "nchw_in", DTYPE)} -${layout_declare_ubo(2, "ivec4", "sizes")} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_buffer(B, "r", "nchw_in", DTYPE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_mapping")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -53,7 +54,7 @@ VEC4_T read_texel(ivec4 tensor_idx) { void main() { const 
ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); + const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim); if (any(greaterThanEqual(tensor_idx, sizes))) { return; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl index 378cf09d12..813a174d2a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl @@ -16,9 +16,10 @@ layout(std430) buffer; #extension GL_EXT_control_flow_attributes : require -${layout_declare_tensor(0, "w", "t_out", "int8", "texture3d")} -${layout_declare_buffer(1, "r", "nchw_in", "int")} -${layout_declare_ubo(2, "ivec4", "tensor_sizes")} +${layout_declare_tensor(B, "w", "t_out", "int8", "texture3d")} +${layout_declare_buffer(B, "r", "nchw_in", "int")} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_mapping")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -36,7 +37,7 @@ int extend_sign(int x) { ivec4 read_texel(ivec4 tensor_idx) { const ivec4 buf_indices = get_texel_nchw_buffer_ixs( - tensor_idx, tensor_sizes, packed_dim); + tensor_idx, sizes, packed_dim); int shift = (1 << 8) - 1; ivec4 masks; @@ -51,7 +52,7 @@ ivec4 read_texel(ivec4 tensor_idx) { ivec4 out_tex = ivec4(0); [[unroll]] for (int i = 0; i < 4; ++i) { - if (tensor_idx[packed_dim] + i < tensor_sizes[packed_dim]) { + if (tensor_idx[packed_dim] + i < sizes[packed_dim]) { int in_texel = nchw_in[buf_indices[i] / 4]; int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4)); extracted_val = extend_sign(extracted_val); @@ -64,9 +65,9 @@ ivec4 read_texel(ivec4 tensor_idx) { void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, tensor_sizes, packed_dim); + const ivec4 tensor_idx 
= to_tensor_idx(pos, sizes, axis_mapping, packed_dim); - if (any(greaterThanEqual(tensor_idx, tensor_sizes))) { + if (any(greaterThanEqual(tensor_idx, sizes))) { return; } diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 74113197d4..dcdd2dccfa 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -106,7 +106,7 @@ ValueRef prepack_biases( graph.create_local_wg_size(v), vref, v, - {t->sizes_ubo()}, + {t->sizes_ubo(), t->axis_mapping_ubo()}, // Specialization constants {SV(t->packed_dim_whcn_idx())})); diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 9df5b73c1a..6a759e0fd2 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -31,7 +31,8 @@ void add_staging_to_tensor_node( graph.strides_ubo(out_tensor), graph.numel_ubo(out_tensor)}); } else { - ubos.append(graph.sizes_ubo(out_tensor)); + ubos.append( + {graph.sizes_ubo(out_tensor), graph.axis_mapping_ubo(out_tensor)}); } graph.execute_nodes().emplace_back(new ExecuteNode( @@ -69,7 +70,8 @@ void add_tensor_to_staging_node( graph.strides_ubo(in_tensor), graph.numel_ubo(in_tensor)}); } else { - ubos.append(graph.sizes_ubo(in_tensor)); + ubos.append( + {graph.sizes_ubo(in_tensor), graph.axis_mapping_ubo(in_tensor)}); } // Normally, the image_to_nchw shader is structured so that each thread reads @@ -113,7 +115,7 @@ ValueRef prepack( if (graph.is_buffer_storage(v)) { ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)}); } else { - ubos.append(graph.sizes_ubo(v)); + ubos.append({graph.sizes_ubo(v), graph.axis_mapping_ubo(v)}); } graph.prepack_nodes().emplace_back(new PrepackNode( diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h 
b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h index 45dfceb3f0..4bd8e9b900 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h +++ b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h @@ -32,7 +32,8 @@ constexpr DimIndex kChannel4D = DimIndex::DIM_3RD_LAST; constexpr DimIndex kBatch4D = DimIndex::DIM_4TH_LAST; inline DimIndex normalize_to_dim_index(const api::vTensor& v_in, int32_t dim) { - return static_cast(dim - v_in.dim()); + return dim < 0 ? static_cast(dim) + : static_cast(dim - v_in.dim()); } /* diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index a469a44dc1..4feaecced5 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -85,7 +85,8 @@ void record_nchw_to_image_op( vkapi::PipelineStage::COMPUTE, vkapi::MemoryAccessType::WRITE), src_buffer, - v_dst.sizes_ubo()); + v_dst.sizes_ubo(), + v_dst.axis_mapping_ubo()); } void record_image_to_nchw_op( @@ -106,7 +107,8 @@ void record_image_to_nchw_op( 0, dst_buffer, v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_src.sizes_ubo()); + v_src.sizes_ubo(), + v_src.axis_mapping_ubo()); } void record_int8_image_to_nchw_noint8_op( @@ -127,6 +129,7 @@ void record_int8_image_to_nchw_noint8_op( dst_buffer.buffer(), v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), v_src.sizes_ubo(), + v_src.axis_mapping_ubo(), v_src.numel_ubo()); } diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 1112548b85..53d0c820f4 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1007,10 +1007,16 @@ TEST_F(VulkanComputeAPITest, print_object_sizes) { // The actual sizes of each object is dependent on the platform. 
However, we // can alert ourselves to any significant changes in the sizes of these // objects by checking the `sizeof()` the class against some loose thresholds. - EXPECT_TRUE(sizeof(vTensor) < 1800); - EXPECT_TRUE(sizeof(Value) < 2400); + + // Current known size on 64 bit system: 1824 B + EXPECT_TRUE(sizeof(vTensor) < 2000); + // Current known size on 64 bit system: 1840 B + EXPECT_TRUE(sizeof(Value) < 2200); + // Current known size on 64 bit system: 240 B EXPECT_TRUE(sizeof(StagingBuffer) < 500); + // Current known size on 64 bit system: 384 B EXPECT_TRUE(sizeof(ComputeGraph) < 500); + // Current known size on 64 bit system: 248 B EXPECT_TRUE(sizeof(ExecuteNode) < 500); } @@ -1227,8 +1233,8 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { GraphConfig config; ComputeGraph graph(config); - std::vector size_big = {8, 64, 124}; - std::vector size_small = {8, 1, 124}; + std::vector size_big = {1, 8, 8}; + std::vector size_small = {1, 1, 8}; // Build graph @@ -1409,8 +1415,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { /*shared_object_idx = */ 4); // +2: t.sizes_ubo() for each staging shader + // +2: t.axis_mapping_ubo() for each staging shader // +2: staging buffer for each input tensor - EXPECT_TRUE(get_vma_allocation_count() == 4); + EXPECT_TRUE(get_vma_allocation_count() == 6); ValueRef c = graph.add_tensor( size_big, @@ -1427,8 +1434,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { // +2: alpha UBO, broadcast UBO for arithmetic shader // +1: t.sizes_ubo() uniform buffer for staging shader + // +1: t.axis_mapping_ubo() uniform buffer for staging shader // +1: staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 9); + EXPECT_TRUE(get_vma_allocation_count() == 12); ValueRef e = graph.add_tensor( size_big, @@ -1444,14 +1452,15 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { // +2: alpha UBO, broadcast UBO for arithmetic shader // +1: t.sizes_ubo() for 
staging shader + // +1: t.axis_mapping_ubo() for staging shader // +1 staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 13); + EXPECT_TRUE(get_vma_allocation_count() == 17); graph.prepare(); graph.encode_execute(); // +3: shared memory allocations for tensors - EXPECT_TRUE(get_vma_allocation_count() == 16); + EXPECT_TRUE(get_vma_allocation_count() == 20); // Run graph diff --git a/backends/xnnpack/README.md b/backends/xnnpack/README.md index 33a0bfaf30..0c3d7e1442 100644 --- a/backends/xnnpack/README.md +++ b/backends/xnnpack/README.md @@ -105,9 +105,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 5a17c8745d..4d1a0ac123 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -38,6 +38,7 @@ build_android_native_library() { -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ @@ -139,7 +140,9 @@ collect_artifacts_to_be_uploaded() { BUILD_AAR_DIR="$(mktemp -d)" export BUILD_AAR_DIR -ANDROID_ABIS=("arm64-v8a" "x86_64") +if [ -z "$ANDROID_ABIS" ]; then + ANDROID_ABIS=("arm64-v8a" "x86_64") +fi export ANDROID_ABIS ARTIFACTS_DIR_NAME="$1" diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh index 8bd9e0539f..348111e2b4 100755 --- a/build/build_apple_frameworks.sh +++ b/build/build_apple_frameworks.sh @@ -163,6 +163,7 @@ cmake_build() { 
-DEXECUTORCH_BUILD_COREML=$COREML \ -DEXECUTORCH_BUILD_MPS=$MPS \ -DEXECUTORCH_BUILD_XNNPACK=$XNNPACK \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ -DEXECUTORCH_BUILD_EXTENSION_APPLE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index 5abaaeb7ce..230f007d3f 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -136,6 +136,7 @@ cmake .. \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF @@ -167,6 +168,7 @@ cmake .. \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index a086581146..9c03399444 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -201,9 +201,9 @@ Create a file called main.cpp with the following contents: #include "basic_sampler.h" #include "basic_tokenizer.h" -#include "managed_tensor.h" #include +#include #include #include #include @@ -244,14 +244,13 @@ std::string generate( for (auto i = 0u; i < max_output_length; i++) { // Convert the input_tokens from a vector of int64_t to EValue. // EValue is a unified data type in the ExecuTorch runtime. 
- ManagedTensor tensor_tokens( + auto inputs = from_blob( input_tokens.data(), {1, static_cast(input_tokens.size())}, ScalarType::Long); - std::vector inputs = {tensor_tokens.get_tensor()}; // Run the model. It will return a tensor of logits (log-probabilities). - Result> logits_evalue = llm_model.forward(inputs); + auto logits_evalue = llm_model.forward(inputs); // Convert the output logits from EValue to std::vector, which is what // the sampler expects. @@ -339,7 +338,6 @@ Finally, download the following files into the same directory as main.h: ``` curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_sampler.h curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_tokenizer.h -curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/managed_tensor.h ``` To learn more, see the [Runtime APIs Tutorial](../extension-module.md). @@ -364,6 +362,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) # Include the executorch subdirectory. @@ -377,6 +376,7 @@ target_link_libraries( PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib) # Provides baseline cross-platform kernels ``` @@ -386,7 +386,6 @@ At this point, the working directory should contain the following files: - main.cpp - basic_tokenizer.h - basic_sampler.h -- managed_tensor.h - export_nanogpt.py - model.py - vocab.json @@ -518,6 +517,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. 
option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend @@ -534,6 +534,7 @@ target_link_libraries( PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels xnnpack_backend) # Provides the XNNPACK CPU acceleration backend ``` @@ -548,7 +549,6 @@ At this point, the working directory should contain the following files: - main.cpp - basic_tokenizer.h - basic_sampler.h -- managed_tensor.h - export_nanogpt.py - model.py - vocab.json diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index 4491a6e8c8..8afa6d6fe7 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -149,9 +149,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 807561f44b..9af1f5266e 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -78,6 +78,7 @@ cmake . 
-DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DEXECUTORCH_BUILD_XNNPACK=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -Bcmake-android-out

 cmake --build cmake-android-out -j16 --target install
@@ -119,6 +120,7 @@ cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -Bcmake-android-out

 cmake --build cmake-android-out -j16 --target install
diff --git a/examples/demo-apps/android/ExecuTorchDemo/setup.sh b/examples/demo-apps/android/ExecuTorchDemo/setup.sh
index 05dc3e4492..00d9201b09 100644
--- a/examples/demo-apps/android/ExecuTorchDemo/setup.sh
+++ b/examples/demo-apps/android/ExecuTorchDemo/setup.sh
@@ -15,6 +15,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
     -DEXECUTORCH_BUILD_XNNPACK=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DCMAKE_BUILD_TYPE=Release \
     -B"${CMAKE_OUT}"
diff --git a/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml b/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml
index cac83b8e6f..896e7b73fb 100644
--- a/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml
+++ b/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml
@@ -73,8 +73,30 @@ phases:
           fi
         fi;

+      # Run the new generic benchmark activity https://developer.android.com/tools/adb#am
+      - echo "Run LLM benchmark"
+      - |
+        adb -s $DEVICEFARM_DEVICE_UDID shell am start -W -n com.example.executorchllamademo/.LlmBenchmarkRunner \
+          --es "model_dir" "/data/local/tmp/llama" \
+          --es "tokenizer_path" "/data/local/tmp/llama/tokenizer.bin"
+
   post_test:
     commands:
+      - echo "Gather LLM benchmark results"
+      - |
+        BENCHMARK_RESULTS=""
        
ATTEMPT=0 + MAX_ATTEMPT=10 + while [ -z "${BENCHMARK_RESULTS}" ] && [ $ATTEMPT -lt $MAX_ATTEMPT ]; do + echo "Waiting for benchmark results..." + BENCHMARK_RESULTS=$(adb -s $DEVICEFARM_DEVICE_UDID shell run-as com.example.executorchllamademo cat files/benchmark_results.json) + sleep 30 + ((ATTEMPT++)) + done + + adb -s $DEVICEFARM_DEVICE_UDID shell run-as com.example.executorchllamademo ls -la files/ + # Trying to pull the file using adb ends up with permission error, but this works too, so why not + echo "${BENCHMARK_RESULTS}" > $DEVICEFARM_LOG_DIR/benchmark_results.json artifacts: # By default, Device Farm will collect your artifacts from the $DEVICEFARM_LOG_DIR directory. diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java index 33b230b1df..cee623507f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java @@ -14,8 +14,11 @@ import android.util.Log; import android.widget.TextView; import androidx.annotation.NonNull; +import com.google.gson.Gson; +import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.util.Arrays; public class LlmBenchmarkRunner extends Activity implements ModelRunnerCallback { ModelRunner mModelRunner; @@ -32,7 +35,12 @@ protected void onCreate(Bundle savedInstanceState) { Intent intent = getIntent(); - String modelPath = intent.getStringExtra("model_path"); + File modelDir = new File(intent.getStringExtra("model_dir")); + File model = + Arrays.stream(modelDir.listFiles()) + .filter(file -> file.getName().endsWith(".pte")) + .findFirst() + .get(); String tokenizerPath = intent.getStringExtra("tokenizer_path"); float temperature = 
intent.getFloatExtra("temperature", 0.8f); @@ -42,7 +50,7 @@ protected void onCreate(Bundle savedInstanceState) { } mStatsDump = new StatsDump(); - mModelRunner = new ModelRunner(modelPath, tokenizerPath, temperature, this); + mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this); mStatsDump.loadStart = System.currentTimeMillis(); } @@ -79,11 +87,21 @@ public void onGenerationStopped() { mTextView.append(mStatsDump.toString()); }); + // TODO (huydhn): Remove txt files here once the JSON format is ready try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) { writer.write(mStatsDump.toString()); } catch (IOException e) { e.printStackTrace(); } + + // TODO (huydhn): Figure out on what the final JSON results looks like, we need something + // with the same number of fields as https://github.com/pytorch/pytorch/pull/135042 + try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { + Gson gson = new Gson(); + writer.write(gson.toJson(mStatsDump)); + } catch (IOException e) { + e.printStackTrace(); + } } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index f24254efb3..96b200303c 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -73,8 +73,15 @@ public class MainActivity extends AppCompatActivity implements Runnable, LlamaCa @Override public void onResult(String result) { - mResultMessage.appendText(result); - run(); + if (result.equals("\n\n")) { + if (!mResultMessage.getText().isEmpty()) { + mResultMessage.appendText(result); + run(); + } + } else { + mResultMessage.appendText(result); + run(); + } } @Override @@ -614,6 +621,7 @@ 
public void run() { ModelUtils.VISION_MODEL_IMAGE_CHANNELS, prompt, ModelUtils.VISION_MODEL_SEQ_LEN, + false, MainActivity.this); } else { // no image selected, we pass in empty int array @@ -624,10 +632,12 @@ public void run() { ModelUtils.VISION_MODEL_IMAGE_CHANNELS, prompt, ModelUtils.VISION_MODEL_SEQ_LEN, + false, MainActivity.this); } } else { - mModule.generate(prompt, ModelUtils.TEXT_MODEL_SEQ_LEN, MainActivity.this); + mModule.generate( + prompt, ModelUtils.TEXT_MODEL_SEQ_LEN, false, MainActivity.this); } long generateDuration = System.currentTimeMillis() - generateStartTime; diff --git a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh index 5e3ac6fc01..87d0f47c95 100644 --- a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh +++ b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh @@ -16,6 +16,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_QNN=ON \ diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index ccb2a788d6..91a68d4b88 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -16,6 +16,7 @@ cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ diff --git a/examples/llm_manual/CMakeLists.txt b/examples/llm_manual/CMakeLists.txt index 185665180f..e5054a683a 100644 --- a/examples/llm_manual/CMakeLists.txt +++ b/examples/llm_manual/CMakeLists.txt @@ -13,6 +13,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend @@ -29,6 +30,7 @@ target_link_libraries( nanogpt_runner PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib # Provides baseline cross-platform # kernels xnnpack_backend diff --git a/examples/llm_manual/main.cpp b/examples/llm_manual/main.cpp index c0fc482542..3c4ecd71af 100644 --- a/examples/llm_manual/main.cpp +++ b/examples/llm_manual/main.cpp @@ -10,9 +10,9 @@ #include "basic_sampler.h" #include "basic_tokenizer.h" -#include "managed_tensor.h" #include +#include #include #include #include @@ -42,14 +42,13 @@ std::string generate( for (auto i = 0u; i < max_output_length; i++) { // Convert the input_tokens from a vector of int64_t to EValue. // EValue is a unified data type in the ExecuTorch runtime. - ManagedTensor tensor_tokens( + auto inputs = from_blob( input_tokens.data(), {1, static_cast(input_tokens.size())}, ScalarType::Long); - std::vector inputs = {tensor_tokens.get_tensor()}; // Run the model. It will return a tensor of logits (log-probabilities). 
- Result> logits_evalue = llm_model.forward(inputs); + auto logits_evalue = llm_model.forward(inputs); // Convert the output logits from EValue to std::vector, which is what // the sampler expects. diff --git a/examples/llm_manual/managed_tensor.h b/examples/llm_manual/managed_tensor.h deleted file mode 100644 index 204b38aa4e..0000000000 --- a/examples/llm_manual/managed_tensor.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -/** - * Creates and owns the necessary metadata for a Tensor instance. Does not own - * the data pointer. - */ -class ManagedTensor { - public: - ManagedTensor( - void* data, - const std::vector& sizes, - exec_aten::ScalarType dtype) - : sizes_(sizes), - tensor_impl_( - /*type=*/dtype, - /*dim=*/sizes_.size(), - /*sizes=*/sizes_.data(), - /*data=*/data, - /*dim_order=*/nullptr, - /*strides=*/nullptr, - /*dynamism=*/ - executorch::runtime::TensorShapeDynamism::DYNAMIC_BOUND) {} - - /** - * Get the Tensor object managed by this class. - */ - exec_aten::Tensor get_tensor() { - return exec_aten::Tensor(&tensor_impl_); - } - - private: - std::vector sizes_; - exec_aten::TensorImpl tensor_impl_; -}; diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask.cpp b/examples/models/flamingo/cross_attention/cross_attention_mask.cpp index b2a2a6a806..06887ec473 100644 --- a/examples/models/flamingo/cross_attention/cross_attention_mask.cpp +++ b/examples/models/flamingo/cross_attention/cross_attention_mask.cpp @@ -6,12 +6,11 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include #include -#include -#include - namespace torch::executor { // Fowrward declaration needed for ARM compilers. 
@@ -97,7 +96,7 @@ std::vector> _get_image_attention_intervals( return vision_masks; } -std::vector cross_attention_mask( +std::vector cross_attention_mask( const std::vector& tokens, const std::vector& images, size_t tile_size, @@ -121,7 +120,7 @@ std::vector cross_attention_mask( // Create mask for each individual image based on its number of tokens, // which can vary based on number of tiles since they are not yet tile padded. // The masks are padded and concatenated together in the batch collator. - std::vector cross_attention_masks; + std::vector cross_attention_masks; size_t text_seq_len = tokens.size(); for (size_t image_idx = 0; image_idx < image_intervals.size(); ++image_idx) { size_t n_tiles = images[image_idx].size(0); @@ -140,7 +139,8 @@ std::vector cross_attention_mask( size_t stride = image_seq_len; std::vector mask_data(num_elements); - ManagedTensor mask(mask_data.data(), sizes, ScalarType::Int); + auto mask = executorch::extension::from_blob( + mask_data.data(), sizes, ScalarType::Int); cross_attention_masks.emplace_back(std::move(mask)); // Add the allocated data to the output vector. diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask.h b/examples/models/flamingo/cross_attention/cross_attention_mask.h index 6998d91ad4..ccbc9eb171 100644 --- a/examples/models/flamingo/cross_attention/cross_attention_mask.h +++ b/examples/models/flamingo/cross_attention/cross_attention_mask.h @@ -8,11 +8,11 @@ #pragma once -#include -#include - #include +#include +#include + namespace torch { namespace executor { @@ -59,7 +59,7 @@ namespace executor { * * @returns A vector of cross attention masks, as Tensors, one for each image. 
*/ -std::vector cross_attention_mask( +std::vector<::executorch::extension::TensorPtr> cross_attention_mask( const std::vector& tokens, const std::vector& images, size_t tile_size, diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp b/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp index 5b9e58c216..b232212fa3 100644 --- a/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp +++ b/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp @@ -7,10 +7,10 @@ */ #include + #include using namespace ::testing; -using torch::executor::ManagedTensor; using torch::executor::ScalarType; using torch::executor::Tensor; using torch::executor::TensorImpl; @@ -41,29 +41,27 @@ TEST(CrossAttentxnMaskTest, TestCrossAttentionMask) { std::vector images = {a, b, c}; std::vector> mask_data; - std::vector output_masks = - torch::executor::cross_attention_mask( - tokens, - images, - /*tile_size=*/1, - /*patch_size=*/1, - /*image_token_id=*/1, - /*out=*/mask_data); + auto output_masks = torch::executor::cross_attention_mask( + tokens, + images, + /*tile_size=*/1, + /*patch_size=*/1, + /*image_token_id=*/1, + /*out=*/mask_data); // Check contents of the mask. 
std::vector> expected_intervals = { {0, 7}, {1, 7}, {7, 12}}; for (size_t mask_idx = 0; mask_idx < output_masks.size(); ++mask_idx) { - ManagedTensor& output_mask = output_masks[mask_idx]; - Tensor output_tensor = output_mask.get_aliasing_tensor(); - for (size_t i = 0; i < output_tensor.size(0); ++i) { - for (size_t j = 0; j < output_tensor.strides()[0]; ++j) { - size_t unrolled_index = i * output_tensor.strides()[0] + j; + auto& output_tensor = output_masks[mask_idx]; + for (size_t i = 0; i < output_tensor->size(0); ++i) { + for (size_t j = 0; j < output_tensor->strides()[0]; ++j) { + size_t unrolled_index = i * output_tensor->strides()[0] + j; if (i >= expected_intervals[mask_idx][0] && i < expected_intervals[mask_idx][1]) { - EXPECT_EQ(output_tensor.const_data_ptr()[unrolled_index], 1); + EXPECT_EQ(output_tensor->const_data_ptr()[unrolled_index], 1); } else { - EXPECT_EQ(output_tensor.const_data_ptr()[unrolled_index], 0); + EXPECT_EQ(output_tensor->const_data_ptr()[unrolled_index], 0); } } } diff --git a/examples/models/flamingo/cross_attention/targets.bzl b/examples/models/flamingo/cross_attention/targets.bzl index 7bc13270aa..c3d9da0156 100644 --- a/examples/models/flamingo/cross_attention/targets.bzl +++ b/examples/models/flamingo/cross_attention/targets.bzl @@ -12,8 +12,8 @@ def define_common_targets(): srcs = ["cross_attention_mask.cpp"], exported_headers = ["cross_attention_mask.h"], exported_deps = [ + "//executorch/extension/tensor:tensor", "//executorch/runtime/core/exec_aten:lib", - "//executorch/extension/runner_util:managed_tensor", "//executorch/runtime/core/exec_aten/util:tensor_util", ], ) diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index ea95c7f965..09ada515a1 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -200,8 +200,9 @@ The Wikitext results generated above used: `{max_seq_len: 2048, limit: 1000}` -DCMAKE_INSTALL_PREFIX=cmake-out \ -DEXECUTORCH_ENABLE_LOGGING=1 \ 
-DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ @@ -251,8 +252,9 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX=cmake-out-android \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_LOGGING=1 \ -DPYTHON_EXECUTABLE=python \ -DEXECUTORCH_BUILD_XNNPACK=ON \ diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 1dac12cc85..c19ddd58a2 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -313,7 +313,6 @@ def build_args_parser() -> argparse.ArgumentParser: def canonical_path(path: Union[str, Path], *, dir: bool = False) -> str: - path = str(path) if verbose_export(): @@ -424,6 +423,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: verbose=args.verbose, max_seq_len=args.max_seq_length, metadata_str=args.metadata, + args=args, ) .set_output_dir(output_dir_path) .to_dtype(dtype_override) @@ -633,6 +633,7 @@ def _load_llama_model( verbose: bool = False, max_seq_len: int = 128, metadata_str: Optional[str] = None, + args, ) -> "LLMEdgeManager": """ A helper util that builds a Llama2 model. 
It returns a LLMEdgeManager that @@ -694,4 +695,5 @@ def _load_llama_model( model.params, metadata_str, ), + args=args, ) diff --git a/examples/models/llama2/runner/CMakeLists.txt b/examples/models/llama2/runner/CMakeLists.txt index abad63a3b5..79fcd267af 100644 --- a/examples/models/llama2/runner/CMakeLists.txt +++ b/examples/models/llama2/runner/CMakeLists.txt @@ -75,8 +75,8 @@ add_subdirectory( ) set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) -set(llama_runner_deps executorch extension_module extension_data_loader - re2::re2 +set(llama_runner_deps executorch extension_data_loader extension_module + extension_tensor re2::re2 ) target_link_libraries(llama_runner PUBLIC ${llama_runner_deps}) diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index 0a5d773092..1e17c75400 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -14,7 +14,6 @@ #include #include -#include #include #include @@ -144,7 +143,8 @@ Error Runner::generate( const std::string& prompt, int32_t seq_len, std::function token_callback, - std::function stats_callback) { + std::function stats_callback, + bool echo) { // Prepare the inputs. // Use ones-initialized inputs. ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); @@ -154,6 +154,11 @@ Error Runner::generate( stats_.model_load_end_ms = util::time_in_ms(); } + ET_LOG( + Info, + "RSS after loading model: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); + // Wrap the token_callback with print function std::function wrapped_callback = [token_callback](const std::string& piece) { @@ -204,9 +209,11 @@ Error Runner::generate( // after the prompt. After that we will enter generate loop. 
// print prompts - wrapped_callback(prompt); - - auto prefill_res = text_prefiller_->prefill(prompt_tokens, 0); + if (echo) { + wrapped_callback(prompt); + } + int64_t pos = 0; + auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos); stats_.first_token_ms = util::time_in_ms(); stats_.prompt_eval_end_ms = util::time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); @@ -214,6 +221,10 @@ Error Runner::generate( // print the first token from prefill. No prev_token so use cur_token for it. wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token))); + ET_LOG( + Info, + "RSS after prompt prefill: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); // start the main loop prompt_tokens.push_back(cur_token); @@ -222,6 +233,10 @@ Error Runner::generate( stats_.inference_end_ms = util::time_in_ms(); printf("\n"); + ET_LOG( + Info, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); if (num_prompt_tokens + num_generated_tokens == seq_len) { ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h index 4e3c1daef7..cec8c61157 100644 --- a/examples/models/llama2/runner/runner.h +++ b/examples/models/llama2/runner/runner.h @@ -40,7 +40,8 @@ class Runner { const std::string& prompt, int32_t seq_len = 128, std::function token_callback = {}, - std::function stats_callback = {}); + std::function stats_callback = {}, + bool echo = true); void stop(); private: diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama2/runner/targets.bzl index 475c5d92ab..9ee3f99567 100644 --- a/examples/models/llama2/runner/targets.bzl +++ b/examples/models/llama2/runner/targets.bzl @@ -34,8 +34,8 @@ def define_common_targets(): "//executorch/extension/llm/runner:text_prefiller" + aten_suffix, "//executorch/extension/llm/runner:text_token_generator" + aten_suffix, 
"//executorch/extension/evalue_util:print_evalue" + aten_suffix, - "//executorch/extension/runner_util:managed_tensor" + aten_suffix, "//executorch/extension/module:module" + aten_suffix, + "//executorch/extension/tensor:tensor" + aten_suffix, "//executorch/kernels/quantized:generated_lib" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index 444f6b3389..c36e39a04c 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -21,6 +21,9 @@ project(llava) # Duplicating options as root CMakeLists.txt option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF) +# This is a temporary hack to get around Torch dep so we can test this on android +option(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE "Hack option to feed dummy image to remove torch.load dep" OFF) + include(CMakeDependentOption) # # pthreadpool: build pthreadpool library. Disable on unsupported platforms @@ -70,7 +73,14 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags) find_package(gflags REQUIRED) -find_package(Torch CONFIG REQUIRED) +# Avoid torch dep from torch.load()-ing the image. +# This is a temporary hack. 
+if(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE) + add_definitions(-DLLAVA_NO_TORCH_DUMMY_IMAGE=1) + message("Buidling the runner without Torch, feeding a dummy image!") +else() + find_package(Torch CONFIG REQUIRED) +endif() add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) # @@ -95,7 +105,11 @@ endif() # llava_runner library add_subdirectory(runner) -set(link_libraries gflags torch) +set(LINK_LIBS gflags) +if(NOT LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE) + list(APPEND LINK_LIBS torch) +endif() +set(link_libraries ${LINK_LIBS}) set(_srcs main.cpp) if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) diff --git a/examples/models/llava/README.md b/examples/models/llava/README.md index 807e1b3cee..8cb605d75f 100644 --- a/examples/models/llava/README.md +++ b/examples/models/llava/README.md @@ -34,8 +34,9 @@ Run the following cmake commands from `executorch/`: cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 4f8a403bb3..bdeaef15fe 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -89,6 +89,7 @@ def forward(self, input_pos, embeddings): use_kv_cache=True, example_inputs=(torch.tensor([0], dtype=torch.int64), embeddings), dynamic_shapes=dynamic_shapes, + args=llava.text_model_args, ) dtype_override = DType.fp32 @@ -145,6 +146,7 @@ def forward(self, images): use_kv_cache=True, example_inputs=(resized,), dynamic_shapes=dynamic_shapes, + args=None, ) .capture_pre_autograd_graph() .pt2e_quantize([quantizer]) @@ -211,10 +213,15 @@ def export_all(llava_model: LlavaModel): partitioner={ "image_encoder": [XnnpackPartitioner()], "text_model": [ + # First 
partition the DQLinear nodes, then partition the rest of the nodes, + # to avoid multiple DQLinear nodes in the same partition, + # to avoid holding multiple unpacked and packed weight buffers in memory, + # to reduce peak memory footprint. XnnpackPartitioner( config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, per_op_mode=True, - ) + ), + XnnpackPartitioner(), ], }, compile_config=EdgeCompileConfig(_check_ir_validity=False), diff --git a/examples/models/llava/install_requirements.sh b/examples/models/llava/install_requirements.sh index 3bf803b356..931d63b391 100644 --- a/examples/models/llava/install_requirements.sh +++ b/examples/models/llava/install_requirements.sh @@ -7,6 +7,6 @@ set -x -pip install transformers accelerate +pip install transformers accelerate sentencepiece pip list diff --git a/examples/models/llava/main.cpp b/examples/models/llava/main.cpp index 171eb77077..53f6329b4d 100644 --- a/examples/models/llava/main.cpp +++ b/examples/models/llava/main.cpp @@ -8,7 +8,11 @@ #include #include +#ifndef LLAVA_NO_TORCH_DUMMY_IMAGE #include +#else +#include // std::fill +#endif #if defined(ET_USE_THREADPOOL) #include @@ -80,6 +84,15 @@ int32_t main(int32_t argc, char** argv) { // read image and resize the longest edge to 336 std::vector image_data; + +#ifdef LLAVA_NO_TORCH_DUMMY_IMAGE + // Work without torch using a random data + image_data.resize(3 * 240 * 336); + std::fill(image_data.begin(), image_data.end(), 0); // black + std::array image_shape = {3, 240, 336}; + std::vector images = { + {.data = image_data, .width = image_shape[2], .height = image_shape[1]}}; +#else // LLAVA_NO_TORCH_DUMMY_IMAGE // cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR); // int longest_edge = std::max(image.rows, image.cols); // float scale_factor = 336.0f / longest_edge; @@ -102,6 +115,8 @@ int32_t main(int32_t argc, char** argv) { {.data = image_data, .width = static_cast(image_tensor.size(2)), .height = static_cast(image_tensor.size(1))}}; +#endif // 
LLAVA_NO_TORCH_DUMMY_IMAGE + // generate runner.generate(std::move(images), prompt, seq_len); return 0; diff --git a/examples/models/llava/runner/CMakeLists.txt b/examples/models/llava/runner/CMakeLists.txt index 564d31f8e7..2d0c30a620 100644 --- a/examples/models/llava/runner/CMakeLists.txt +++ b/examples/models/llava/runner/CMakeLists.txt @@ -40,8 +40,8 @@ add_subdirectory( add_library(llava_runner STATIC ${_llava_runner__srcs}) -set(llava_runner_deps executorch extension_module extension_data_loader - extension_llm_runner +set(llava_runner_deps executorch extension_data_loader extension_llm_runner + extension_module extension_tensor ) target_link_libraries(llava_runner PUBLIC ${llava_runner_deps}) diff --git a/examples/models/llava/runner/llava_image_prefiller.h b/examples/models/llava/runner/llava_image_prefiller.h index 50c981026a..3597ff82ef 100644 --- a/examples/models/llava/runner/llava_image_prefiller.h +++ b/examples/models/llava/runner/llava_image_prefiller.h @@ -11,7 +11,7 @@ #pragma once #include -#include +#include namespace torch::executor { @@ -26,18 +26,18 @@ class LlavaImagePrefiller : public ImagePrefiller { */ inline Result prefill(Image& image, int64_t& start_pos) override { - ManagedTensor managed_images( + auto image_tensor = executorch::extension::from_blob( image.data.data(), {3, image.height, image.width}, ScalarType::Byte); // Run image encoder - std::vector image_encoder_outputs = ET_UNWRAP(module_->execute( - kImageEncoderMethod, managed_images.get_aliasing_tensor())); + auto image_encoder_outputs = + ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor)); // inputs:[start_pos, embeds] - ManagedTensor managed_start_pos(&start_pos, {1}, ScalarType::Long); - auto start_pos_tensor = managed_start_pos.get_aliasing_tensor(); + auto start_pos_tensor = + executorch::extension::from_blob(&start_pos, {1}, ScalarType::Long); // Run text model - std::vector outputs_res = ET_UNWRAP(module_->execute( + auto outputs_res = 
ET_UNWRAP(module_->execute( kTextModelMethod, {start_pos_tensor, image_encoder_outputs[0]})); ET_CHECK_MSG( outputs_res[0].isTensor(), diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index 0fc06da0c5..64763c7257 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -72,6 +72,54 @@ Error LlavaRunner::load() { return Error::Ok; } +Error LlavaRunner::prefill_images( + std::vector& images, + int64_t& start_pos) { + for (auto& image : images) { + // pos is updated inside image prefill. + ET_UNWRAP(image_prefiller_->prefill(image, start_pos)); + } + return Error::Ok; +} + +Result LlavaRunner::prefill_prompt( + const std::string& prompt, + int64_t& start_pos, + int8_t bos, + int8_t eos) { + std::vector prompt_tokens = + ET_UNWRAP(tokenizer_->encode(prompt, bos, eos)); + + return text_prefiller_->prefill(prompt_tokens, start_pos); +} + +Error LlavaRunner::generate_from_pos( + const std::string& prompt, + int32_t seq_len, + int64_t start_pos, + std::function token_callback, + std::function + stats_callback) { + // prefill user prompt. No BOS because preset prompt already has it. 
+ token_callback(prompt); + + uint64_t prefill_next_token = + ET_UNWRAP(prefill_prompt(prompt, start_pos, /*bos=*/0, /*eos*/ 0)); + stats_.num_prompt_tokens = start_pos; + + // Generate tokens + int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( + {prefill_next_token}, start_pos, seq_len, token_callback)); + + // Bookkeeping + stats_.num_generated_tokens = num_generated_tokens; + ::executorch::llm::print_report(stats_); + if (stats_callback) { + stats_callback(stats_); + } + return Error::Ok; +} + Error LlavaRunner::generate( std::vector images, const std::string& prompt, @@ -83,6 +131,11 @@ Error LlavaRunner::generate( ET_CHECK_OK_OR_RETURN_ERROR(load()); } + ET_LOG( + Info, + "RSS after loading model: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); + // Wrap the token_callback with print function std::function wrapped_callback = [token_callback](const std::string& piece) { @@ -96,43 +149,26 @@ Error LlavaRunner::generate( int64_t pos = 0; // prefill preset prompt - std::vector preset_prompt_tokens = - ET_UNWRAP(tokenizer_->encode(kPresetPrompt, /*bos=*/1, /*eos=*/0)); - size_t num_preset_tokens = preset_prompt_tokens.size(); - - ET_UNWRAP(text_prefiller_->prefill(preset_prompt_tokens, pos)); - pos += num_preset_tokens; + prefill_prompt(kPresetPrompt, pos, /*bos=*/1, /*eos*/ 0); // prefill images - for (auto& image : images) { - // pos is updated inside image prefill. - ET_UNWRAP(image_prefiller_->prefill(image, pos)); - } + prefill_images(images, pos); - // prefill user prompt. No BOS because preset prompt already has it. 
- wrapped_callback(prompt); - - std::vector user_prompt_tokens = - ET_UNWRAP(tokenizer_->encode(prompt, /*bos=*/0, /*eos=*/0)); - size_t num_user_tokens = user_prompt_tokens.size(); - - uint64_t prefill_next_token = - ET_UNWRAP(text_prefiller_->prefill(user_prompt_tokens, pos)); - pos += num_user_tokens; + ET_LOG( + Info, + "RSS after prompt and image prefill: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); // Generate tokens - int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( - {prefill_next_token}, pos, seq_len, wrapped_callback)); + Error err = + generate_from_pos(prompt, seq_len, pos, wrapped_callback, stats_callback); - // Bookkeeping - stats_.num_prompt_tokens = num_preset_tokens + num_user_tokens; - stats_.num_generated_tokens = num_generated_tokens; - ::executorch::llm::print_report(stats_); - if (stats_callback) { - stats_callback(stats_); - } + ET_LOG( + Info, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); - return Error::Ok; + return err; } } // namespace torch::executor diff --git a/examples/models/llava/runner/llava_runner.h b/examples/models/llava/runner/llava_runner.h index 9b14bc9283..923f8180a8 100644 --- a/examples/models/llava/runner/llava_runner.h +++ b/examples/models/llava/runner/llava_runner.h @@ -38,6 +38,48 @@ class LlavaRunner : public MultimodalRunner { std::function stats_callback = {}); + /** + * Prefill an LLaVA Module with the given images input. + * @param images The image input to LLaVA. + * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. + * @return The error status of prefilling images. + */ + Error prefill_images(std::vector& images, int64_t& start_pos); + + /** + * Prefill an LLaVA Module with the given text input. + * @param prompt The text prompt to LLaVA. 
+ * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. + * @param bos The number of BOS (begin of sequence) token. + * @param eos The number of EOS (end of sequence) token. + * @return The generated token of the LLaVA Module after prefill prompt. + */ + Result prefill_prompt( + const std::string& prompt, + int64_t& start_pos, + int8_t bos = 0, + int8_t eos = 0); + + /** + * Generate tokens from the given prompt, starting from the given position. + * @param prompt The text prompt to LLaVA. + * @param seq_len The total sequence length, including the prompt tokens and + * new tokens. + * @param start_pos The starting position in KV cache of the input in the LLM. + * @param token_callback What to do after a token is generated. + * @param stats_callback What to do with Stats. + * @return The error code. + */ + Error generate_from_pos( + const std::string& prompt, + int32_t seq_len = 1024, + int64_t start_pos = 0, + std::function token_callback = {}, + std::function + stats_callback = {}); + private: inline static const std::string kPresetPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. 
USER: "; diff --git a/examples/models/llava/runner/llava_text_decoder_runner.h b/examples/models/llava/runner/llava_text_decoder_runner.h index e70ba59d51..a58bcc47e0 100644 --- a/examples/models/llava/runner/llava_text_decoder_runner.h +++ b/examples/models/llava/runner/llava_text_decoder_runner.h @@ -20,17 +20,14 @@ class LlavaTextDecoderRunner : public TextDecoderRunner { : TextDecoderRunner(module, true, vocab_size, temperature){}; inline Result step( - ManagedTensor& managed_tokens, - ManagedTensor& managed_start_pos) override { - auto tokens = managed_tokens.get_aliasing_tensor(); - auto start_pos = managed_start_pos.get_aliasing_tensor(); - + executorch::extension::TensorPtr& tokens, + executorch::extension::TensorPtr& start_pos) override { // run token embedding - std::vector token_embedding_outputs = + auto token_embedding_outputs = ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, tokens)); // run text model - std::vector outputs_res = ET_UNWRAP(module_->execute( + auto outputs_res = ET_UNWRAP(module_->execute( kTextModelMethod, {start_pos, token_embedding_outputs[0]})); ET_CHECK_MSG( diff --git a/examples/models/llava/runner/targets.bzl b/examples/models/llava/runner/targets.bzl index 72942acf16..c7523d6cc4 100644 --- a/examples/models/llava/runner/targets.bzl +++ b/examples/models/llava/runner/targets.bzl @@ -16,8 +16,8 @@ def define_common_targets(): "//executorch/extension/llm/runner:runner_lib", "//executorch/extension/llm/tokenizer:bpe_tokenizer", "//executorch/extension/evalue_util:print_evalue", - "//executorch/extension/runner_util:managed_tensor", "//executorch/extension/module:module", + "//executorch/extension/tensor:tensor", "//executorch/kernels/quantized:generated_lib", "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/core/exec_aten/util:tensor_util", diff --git a/examples/models/phi-3-mini/CMakeLists.txt b/examples/models/phi-3-mini/CMakeLists.txt index 39358e088e..e1ffd0da05 100644 --- 
a/examples/models/phi-3-mini/CMakeLists.txt +++ b/examples/models/phi-3-mini/CMakeLists.txt @@ -23,6 +23,7 @@ set(CMAKE_BUILD_TYPE Release) # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) option(EXECUTORCH_BUILD_XNNPACK "" ON) @@ -47,6 +48,6 @@ target_include_directories( PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags/src ) target_link_libraries( - phi_3_mini_runner PRIVATE executorch extension_module_static + phi_3_mini_runner PRIVATE executorch extension_module_static extension_tensor optimized_native_cpu_ops_lib xnnpack_backend gflags ) diff --git a/examples/models/phi-3-mini/README.md b/examples/models/phi-3-mini/README.md index 6619a111a2..1926971621 100644 --- a/examples/models/phi-3-mini/README.md +++ b/examples/models/phi-3-mini/README.md @@ -26,8 +26,9 @@ python -m examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-m -DCMAKE_INSTALL_PREFIX=cmake-out \ -DEXECUTORCH_ENABLE_LOGGING=1 \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ diff --git a/examples/models/phi-3-mini/runner.cpp b/examples/models/phi-3-mini/runner.cpp index a6cee57ea8..9da323278f 100644 --- a/examples/models/phi-3-mini/runner.cpp +++ b/examples/models/phi-3-mini/runner.cpp @@ -12,7 +12,7 @@ #include #include -#include +#include #include namespace torch::executor { @@ -81,23 +81,17 @@ uint64_t Runner::logits_to_token(const exec_aten::Tensor& logits_tensor) { } uint64_t Runner::prefill(std::vector& tokens) { - ManagedTensor input_tokens( + auto result = module_->forward(from_blob( tokens.data(), {1, 
static_cast(tokens.size())}, - ScalarType::Long); - std::vector inputs = {input_tokens.get_aliasing_tensor()}; - - auto result = module_->forward(inputs); + ScalarType::Long)); ET_CHECK_MSG(result.error() == Error::Ok, "Failed to prefill tokens"); return logits_to_token(result.get()[0].toTensor()); } uint64_t Runner::run_model_step(uint64_t token) { - ManagedTensor input_token(&token, {1, 1}, ScalarType::Long); - std::vector inputs = {input_token.get_aliasing_tensor()}; - - auto result = module_->forward(inputs); + auto result = module_->forward(from_blob(&token, {1, 1}, ScalarType::Long)); ET_CHECK_MSG( result.error() == Error::Ok, "Failed to run forward() for token %" PRIu64, diff --git a/examples/models/test/test_export.py b/examples/models/test/test_export.py index f258cc2139..b3030c24fe 100644 --- a/examples/models/test/test_export.py +++ b/examples/models/test/test_export.py @@ -29,7 +29,7 @@ def collect_executorch_and_eager_outputs( Returns a tuple containing the outputs of the eager mode model and the executorch mode model. 
""" eager_model = eager_model.eval() - model = torch._export.capture_pre_autograd_graph(eager_model, example_inputs) + model = torch.export.export_for_training(eager_model, example_inputs).module() edge_model = export_to_edge(model, example_inputs) executorch_prog = edge_model.to_executorch() diff --git a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt index 006e0f7517..9799508633 100644 --- a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt @@ -28,6 +28,7 @@ target_link_libraries( full_portable_ops_lib extension_data_loader extension_module + extension_tensor gflags re2::re2 ) diff --git a/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp index 7340672c9e..599accfd1e 100644 --- a/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp index d452336175..0ccaefa79e 100644 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp @@ -13,9 +13,9 @@ #include #include #include -#include #include #include +#include #include #include @@ -26,6 +26,7 @@ namespace torch { namespace executor { namespace { +using namespace executorch::extension; static constexpr auto kTopp = 0.9f; void printReport(const Runner::Stats& stats); std::string statsToJsonString(const Runner::Stats& stats); @@ -136,32 +137,30 @@ int32_t Runner::logitsToToken(const exec_aten::Tensor& logits_tensor) { // step. Returning the logits tensor. 
Result Runner::run_model_step( int64_t input_token, - Tensor& token, - Tensor& start_pos, - Tensor& atten_mask, - std::vector& kv_tensors, - std::vector& kv_outputs) { - token.mutable_data_ptr()[0] = input_token; + TensorPtr& token, + TensorPtr& start_pos, + TensorPtr& atten_mask, + std::vector& kv_tensors, + std::vector& kv_outputs) { + token->mutable_data_ptr()[0] = input_token; // inputs:[tokens, start_pos, atten_mask, k_cache, v_cache] - std::vector inputs = {token, start_pos, atten_mask}; - inputs.insert(inputs.end(), kv_tensors.begin(), kv_tensors.end()); - Result> outputs_res = module_->forward(inputs); + auto outputs_res = module_->forward({*token, *start_pos, *atten_mask}); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); // TODO: need to handle batch size != 1 - size_t v_offset = kv_outputs[0].nbytes(); - size_t el_size = kv_outputs[0].element_size(); + size_t v_offset = kv_outputs[0]->nbytes(); + size_t el_size = kv_outputs[0]->element_size(); size_t k_input_step = (max_seq_len_ - 1) * el_size; int k_tensors_end = kv_tensors.size() / 2; // update k caches for (int j = 0; j < k_tensors_end; ++j) { uint8_t* input_addr = - static_cast(kv_tensors[j].mutable_data_ptr()); + static_cast(kv_tensors[j]->mutable_data_ptr()); uint8_t* output_addr = - static_cast(kv_outputs[j].mutable_data_ptr()); + static_cast(kv_outputs[j]->mutable_data_ptr()); // fill the output k values back - for (int src = 0, dst = k_input_step; src < kv_outputs[j].nbytes(); + for (int src = 0, dst = k_input_step; src < kv_outputs[j]->nbytes(); src += el_size, dst += k_input_step) { input_addr[dst] = output_addr[src]; } @@ -169,7 +168,7 @@ Result Runner::run_model_step( // inputs ET_CHECK_MSG( internal::set_tensor_data( - kv_tensors[j], new_inp_addr, kv_tensors[j].nbytes()) == Error::Ok, + *kv_tensors[j], new_inp_addr, kv_tensors[j]->nbytes()) == Error::Ok, "Failed to set input tensor when updating k_cache"); } // update v caches @@ -179,25 +178,25 @@ Result Runner::run_model_step( 
ET_CHECK_MSG( internal::set_tensor_data( - kv_tensors[j], new_inp_addr, kv_tensors[j].nbytes()) == Error::Ok, + *kv_tensors[j], new_inp_addr, kv_tensors[j]->nbytes()) == Error::Ok, "Failed to set input tensor when updating v_cache"); // outputs char* new_out_addr = io_mem_mgr_.update_v_caches_write(v_idx, v_offset); ET_CHECK_MSG( internal::set_tensor_data( - kv_outputs[j], new_out_addr, kv_outputs[j].nbytes()) == Error::Ok, + *kv_outputs[j], new_out_addr, kv_outputs[j]->nbytes()) == Error::Ok, "Failed to set output tensor when updating v_cache"); ET_CHECK_MSG( - module_->set_output_data_ptr(kv_outputs[j], j + 1) == Error::Ok, + module_->set_output_data_ptr(*kv_outputs[j], j + 1) == Error::Ok, "Failed to set llama output data pointer"); } // Bump start_pos by 1 - start_pos.mutable_data_ptr()[0]++; + start_pos->mutable_data_ptr()[0]++; // update atten_mask - atten_mask.mutable_data_ptr() - [atten_mask.numel() - 1 - start_pos.const_data_ptr()[0]] = 0; + atten_mask->mutable_data_ptr() + [atten_mask->numel() - 1 - start_pos->const_data_ptr()[0]] = 0; return outputs_res.get()[0].toTensor(); } // TODO: add overloaded method for on-device tokenize @@ -253,19 +252,14 @@ Error Runner::generate( std::vector hidden_states_data_shape = {1, 1, dim_}; // initialize tensor wrappers - ManagedTensor managed_token( + auto token = from_blob( io_mem_mgr_.get_input_token_ptr(), token_shape, ScalarType::Int); - ManagedTensor managed_pos_id( + auto start_pos = from_blob( io_mem_mgr_.get_pos_idx_ptr(), start_pos_shape, ScalarType::Int); - ManagedTensor managed_atten_mask( + auto atten_mask = from_blob( io_mem_mgr_.get_atten_mask_ptr(), atten_mask_shape, ScalarType::Float); - Tensor token = managed_token.get_aliasing_tensor(); - Tensor atten_mask = managed_atten_mask.get_aliasing_tensor(); - Tensor start_pos = managed_pos_id.get_aliasing_tensor(); - - std::vector managed_kv_inputs, managed_kv_outputs; - std::vector kv_tensors, kv_outputs; + std::vector kv_tensors, kv_outputs; Result 
method_meta = get_method_meta(); size_t num_inputs = method_meta->num_inputs(); @@ -282,22 +276,20 @@ Error Runner::generate( auto tensor_shape = tensor_meta->sizes(); std::vector sizes( tensor_shape.data(), tensor_shape.data() + tensor_shape.size()); - managed_kv_inputs.emplace_back(ManagedTensor( + kv_tensors.emplace_back(from_blob( io_mem_mgr_.get_k_caches_read_ptr(i), sizes, tensor_meta->scalar_type())); - kv_tensors.emplace_back(managed_kv_inputs.back().get_aliasing_tensor()); // outpus Result out_tensor_meta = method_meta->output_tensor_meta(i + 1); tensor_shape = out_tensor_meta->sizes(); sizes = std::vector{ tensor_shape.data(), tensor_shape.data() + tensor_shape.size()}; - managed_kv_outputs.emplace_back(ManagedTensor( + kv_outputs.emplace_back(from_blob( io_mem_mgr_.get_k_caches_write_ptr(i), sizes, - kv_tensors.back().scalar_type())); - kv_outputs.emplace_back(managed_kv_outputs.back().get_aliasing_tensor()); + kv_tensors.back()->scalar_type())); ET_CHECK_MSG( module_->set_output_data_ptr(kv_outputs.back(), i + 1) == Error::Ok, "Failed to set output tensor for kv cache"); @@ -314,11 +306,10 @@ Error Runner::generate( std::vector sizes( tensor_shape.data(), tensor_shape.data() + tensor_shape.size()); - managed_kv_inputs.emplace_back(ManagedTensor( + kv_tensors.emplace_back(from_blob( io_mem_mgr_.get_v_caches_read_ptr(i), sizes, tensor_meta->scalar_type())); - kv_tensors.push_back(managed_kv_inputs.back().get_aliasing_tensor()); // outputs Result out_tensor_meta = @@ -327,22 +318,20 @@ Error Runner::generate( sizes = std::vector{ tensor_shape.data(), tensor_shape.data() + tensor_shape.size()}; - managed_kv_outputs.push_back(ManagedTensor( + kv_outputs.push_back(from_blob( io_mem_mgr_.get_v_caches_write_ptr(i), sizes, - kv_tensors.back().scalar_type())); - kv_outputs.push_back(managed_kv_outputs.back().get_aliasing_tensor()); + kv_tensors.back()->scalar_type())); ET_CHECK_MSG( module_->set_output_data_ptr(kv_outputs.back(), output_index) == Error::Ok, 
"Failed to set output tensor for llama block"); } - ManagedTensor affine_managed_logits( + auto affine_logits = from_blob( reinterpret_cast(io_mem_mgr_.get_logit_ptr()), logits_data_shape, ScalarType::Float); - Tensor affine_logits = affine_managed_logits.get_aliasing_tensor(); ET_CHECK_MSG( module_->set_output_data_ptr(affine_logits, 0) == Error::Ok, "Failed to set output tensor for affine module - logits"); @@ -351,7 +340,7 @@ Error Runner::generate( std::string final_output; while (pos < seq_len - 1) { // Run the model - Result logits_res = run_model_step( + auto logits_res = run_model_step( cur_token, token, start_pos, atten_mask, kv_tensors, kv_outputs); if (pos == num_prompt_tokens) { stats_.first_token_ms = util::time_in_ms(); diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.h b/examples/qualcomm/oss_scripts/llama2/runner/runner.h index cdbb2cdd2e..1c35c821ce 100644 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama2/runner/runner.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include class RpcMemAllocator { public: @@ -248,13 +248,13 @@ class Runner { T getMetadataHelper(std::string method_name, T default_val); template int32_t logitsToToken(const exec_aten::Tensor& logits_tensor); - Result run_model_step( + Result run_model_step( int64_t input_token, - Tensor& token, - Tensor& start_pos, - Tensor& atten_mask, - std::vector& kv_tensors, - std::vector& kv_outputs); + ::executorch::extension::TensorPtr& token, + ::executorch::extension::TensorPtr& start_pos, + ::executorch::extension::TensorPtr& atten_mask, + std::vector<::executorch::extension::TensorPtr>& kv_tensors, + std::vector<::executorch::extension::TensorPtr>& kv_outputs); // metadata int32_t vocab_size_; int64_t bos_id_; diff --git a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt index c1fd5dc653..1a9406ca95 100644 --- 
a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt @@ -35,6 +35,7 @@ target_link_libraries( executorch_no_prim_ops extension_data_loader extension_module + extension_tensor gflags re2::re2 ) @@ -89,6 +90,7 @@ target_link_libraries( executorch_no_prim_ops extension_data_loader extension_module + extension_tensor gflags re2::re2 ) diff --git a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp index 49782cf878..d69aa0aa7a 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp index aae18434c6..9d06e8118d 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp index ec13cec37c..d6d9911293 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h index b9849a2132..bd24ea6beb 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h @@ -21,7 +21,6 @@ #include #include #include -#include namespace torch { namespace executor { diff 
--git a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt index e6af95595b..c59cea32b9 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt @@ -24,6 +24,7 @@ target_link_libraries( executorch_no_prim_ops extension_data_loader extension_module + extension_tensor gflags re2::re2 ) diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp index 3d3d99d707..b6c211d8ac 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include @@ -22,6 +22,8 @@ #include #include +using namespace ::executorch::extension; + namespace torch { namespace executor { @@ -350,31 +352,27 @@ Error Runner::generate(std::string prompt) { MethodMeta encoder_method_meta = method_metas[0].get(); // Initialize text_encoder input tensors: cond/uncond tokenized_input[1,77] - ManagedTensor managed_cond_tokens( + auto cond_tokens_tensor = from_blob( cond_tokens.data(), {1, 77}, encoder_method_meta.input_tensor_meta(0)->scalar_type()); - ManagedTensor managed_uncond_tokens( + auto uncond_tokens_tensor = from_blob( uncond_tokens.data(), {1, 77}, encoder_method_meta.input_tensor_meta(0)->scalar_type()); - Tensor cond_tokens_tensor = managed_cond_tokens.get_aliasing_tensor(); - Tensor uncond_tokens_tensor = managed_uncond_tokens.get_aliasing_tensor(); // Initialize text_encoder output tensors: cond/uncond embedding[1, 77, 1024] constexpr int emb_size = 1 * 77 * 1024; std::vector cond_emb_vec(emb_size); std::vector uncond_emb_vec(emb_size); std::vector fp_emb_vec(emb_size); - ManagedTensor managed_cond_emb( + auto cond_emb_tensor = from_blob( cond_emb_vec.data(), {1, 77, 1024}, 
encoder_method_meta.output_tensor_meta(0)->scalar_type()); - ManagedTensor managed_uncond_emb( + auto uncond_emb_tensor = from_blob( uncond_emb_vec.data(), {1, 77, 1024}, encoder_method_meta.output_tensor_meta(0)->scalar_type()); - Tensor cond_emb_tensor = managed_cond_emb.get_aliasing_tensor(); - Tensor uncond_emb_tensor = managed_uncond_emb.get_aliasing_tensor(); modules_[0]->set_output_data_ptr(cond_emb_tensor, 0); long encoder_start = util::time_in_ms(); auto cond_res = modules_[0]->forward(cond_tokens_tensor); @@ -403,22 +401,17 @@ Error Runner::generate(std::string prompt) { // 3. cond/uncond embedding[1,77,1024] std::vector latent_model_input(latent.size()); std::vector fp_latent_model_input(latent.size()); - ManagedTensor managed_latent( + auto latent_tensor = from_blob( latent_model_input.data(), {1, 64, 64, 4}, unet_method_meta.input_tensor_meta(0)->scalar_type()); - Tensor latent_tensor = managed_latent.get_aliasing_tensor(); - std::vector managed_time_emb_tensors; - std::vector time_emb_tensors; - managed_time_emb_tensors.reserve(num_time_steps_); + std::vector time_emb_tensors; time_emb_tensors.reserve(num_time_steps_); - for (int step_index = 0; step_index < num_time_steps_; step_index++) { - managed_time_emb_tensors.emplace_back(ManagedTensor( + for (auto step_index = 0; step_index < num_time_steps_; step_index++) { + time_emb_tensors.emplace_back(from_blob( time_emb_list_[step_index].data(), {1, 1280}, unet_method_meta.input_tensor_meta(1)->scalar_type())); - time_emb_tensors.emplace_back( - managed_time_emb_tensors.back().get_aliasing_tensor()); } // requantize text encoders output dequant_tensor( @@ -447,17 +440,14 @@ Error Runner::generate(std::string prompt) { std::vector noise_pred_uncond(latent.size()); std::vector fp_noise_pred_text(noise_pred_text.size()); std::vector fp_noise_pred_uncond(noise_pred_uncond.size()); - ManagedTensor managed_noise_pred_text( + auto noise_pred_text_tensor = from_blob( noise_pred_text.data(), {1, 64, 64, 4}, 
unet_method_meta.output_tensor_meta(0)->scalar_type()); - Tensor noise_pred_text_tensor = managed_noise_pred_text.get_aliasing_tensor(); - ManagedTensor managed_noise_pred_uncond( + auto noise_pred_uncond_tensor = from_blob( noise_pred_uncond.data(), {1, 64, 64, 4}, unet_method_meta.output_tensor_meta(0)->scalar_type()); - Tensor noise_pred_uncond_tensor = - managed_noise_pred_uncond.get_aliasing_tensor(); // Execute unet for (int step_index = 0; step_index < num_time_steps_; step_index++) { @@ -514,20 +504,18 @@ Error Runner::generate(std::string prompt) { MethodMeta vae_method_meta = method_metas[2].get(); // Initialize vae input tensor : latent[1,64,64,4] std::vector vae_input(latent.size()); - ManagedTensor managed_vae_input( + auto vae_input_tensor = from_blob( vae_input.data(), {1, 64, 64, 4}, vae_method_meta.input_tensor_meta(0)->scalar_type()); - Tensor vae_input_tensor = managed_vae_input.get_aliasing_tensor(); // Intialize vae output tensor: output[1,512,512,3] constexpr int image_size = 1 * 512 * 512 * 3; std::vector q_out(image_size); std::vector out(image_size); - ManagedTensor managed_output( + auto output_tensor = from_blob( q_out.data(), {1, 512, 512, 3}, vae_method_meta.output_tensor_meta(0)->scalar_type()); - Tensor output_tensor = managed_output.get_aliasing_tensor(); quant_tensor(latent, vae_input, vae_input_scale_, vae_input_offset_); diff --git a/examples/xnnpack/README.md b/examples/xnnpack/README.md index 61c14b5c7e..dcd5b9c5d7 100644 --- a/examples/xnnpack/README.md +++ b/examples/xnnpack/README.md @@ -38,9 +38,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . 
@@ -92,9 +93,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py index 24ee6bd21a..2256d5fcc9 100644 --- a/exir/_serialize/_program.py +++ b/exir/_serialize/_program.py @@ -347,7 +347,6 @@ def serialize_pte_binary( *, mutable_data: Optional[List[Buffer]] = None, extract_delegate_segments: bool = False, - extract_constant_segment: bool = True, segment_alignment: int = 128, constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, @@ -363,8 +362,6 @@ def serialize_pte_binary( and the starting segment offset. - Update the Program.segments field with the offsets and lengths of each segment. - extract_constant_segment: Whether to move the constant data from the Program - into a separate segment. segment_alignment: Alignment in bytes. The starting offset of each segment will be aligned to this value in the output data. constant_tensor_alignment: The minimum alignment of tensor @@ -387,19 +384,23 @@ def serialize_pte_binary( # Store extracted segment data; this may be constant data or delegate data. segments: List[Cord] = [] - if extract_constant_segment: - constant_segment_data, constant_segment_offsets = _extract_constant_segment( - program.constant_buffer, tensor_alignment=constant_tensor_alignment + constant_segment_data, constant_segment_offsets = _extract_constant_segment( + program.constant_buffer, tensor_alignment=constant_tensor_alignment + ) + + # If there are no constants, len(constant_segment_data) = 0. However, there may + # be non-constants, in which case len(constant_segment_offsets) = 1, containing + # the placeholder value 0. 
Ensure the placeholder value is put into + # program.constant_segment.offsets. + if len(constant_segment_offsets) > 0: + # Update program.constant_segment with constant subsegment offset information. + program.constant_segment = SubsegmentOffsets( + segment_index=len(segments), offsets=constant_segment_offsets ) - if len(constant_segment_data) > 0: - # Update program.constant_segment with constant subsegment offset information. - program.constant_segment = SubsegmentOffsets( - segment_index=len(segments), offsets=constant_segment_offsets - ) - # Clear the constant buffer, as constant data will be stored in segments. - program.constant_buffer = [] - # Add to the aggregate segments cord. - segments.append(constant_segment_data) + # Clear the constant buffer, as constant data will be stored in segments. + program.constant_buffer = [] + # Add to the aggregate segments cord. + segments.append(constant_segment_data) if mutable_data is not None: mutable_segment_data, mutable_segment_offsets = _extract_constant_segment( diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py index c4f4df0d0b..afd8e3d282 100644 --- a/exir/_serialize/test/test_program.py +++ b/exir/_serialize/test/test_program.py @@ -583,6 +583,33 @@ def test_round_trip_with_segments(self) -> None: program2 = deserialize_pte_binary(pte_data) self.assert_programs_equal(program, program2) + def test_no_constants(self) -> None: + program = get_test_program() + # Insert placeholder for non-const tensors. + add_constant_data(program, [b""]) + + pte_data = bytes( + serialize_pte_binary( + program, + extract_delegate_segments=True, + segment_alignment=SEGMENT_ALIGNMENT, + constant_tensor_alignment=CONSTANT_TENSOR_ALIGNMENT, + ) + ) + # The input Program should not be modified. + self.assertEqual(program.segments, []) + + # Peek inside the actual flatbuffer data to see the segments. 
+ flatbuffer_program = _json_to_program(_program_flatbuffer_to_json(pte_data)) + + # Constant buffer should be empty. + self.assertEqual(len(flatbuffer_program.constant_buffer), 0) + + # Constant segment should contain the placeholder. + self.assertEqual(flatbuffer_program.constant_segment.segment_index, 0) + self.assertEqual(len(flatbuffer_program.constant_segment.offsets), 1) + self.assertEqual(flatbuffer_program.constant_segment.offsets[0], 0) + def test_unused_inline_delegate_blobs_with_segments(self) -> None: # Create a program with some delegate data blobs. program = get_test_program() diff --git a/exir/capture/_config.py b/exir/capture/_config.py index 7b91464bdc..2d0a6c4ca8 100644 --- a/exir/capture/_config.py +++ b/exir/capture/_config.py @@ -65,12 +65,6 @@ class ExecutorchBackendConfig: # This makes it possible to free those blobs at runtime. extract_delegate_segments: bool = True - # Whether to extract constants from the Program into separate segments, - # rather than encoding those constants in the flatbuffer data. - # This reduces the memory overhead of creating the .pte file for models with - # large constant data. - extract_constant_segment: bool = True - # When extracting segments, the starting offset of each segment will be # aligned to this value (in bytes). Must be a power of two. 
segment_alignment: int = 128 diff --git a/exir/program/_program.py b/exir/program/_program.py index 849eae4f6f..1339760f21 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -439,7 +439,6 @@ def to_executorch( new_prog, emit_stacktrace=config.emit_stacktrace, extract_delegate_segments=config.extract_delegate_segments, - extract_constant_segment=config.extract_constant_segment, segment_alignment=config.segment_alignment, constant_tensor_alignment=config.constant_tensor_alignment, delegate_alignment=config.delegate_alignment, @@ -468,7 +467,6 @@ def __init__( exir_exported_program: ExirExportedProgram, emit_stacktrace: bool, extract_delegate_segments: bool, - extract_constant_segment: bool, segment_alignment: int, constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, @@ -483,7 +481,6 @@ def __init__( self._emitter_output: Optional[EmitterOutput] = None self._emit_stacktrace: bool = emit_stacktrace self._extract_delegate_segments: bool = extract_delegate_segments - self._extract_constant_segment: bool = extract_constant_segment self._segment_alignment: int = segment_alignment self._constant_tensor_alignment: Optional[int] = constant_tensor_alignment self._delegate_alignment: Optional[int] = delegate_alignment @@ -493,7 +490,6 @@ def _get_pte_data(self) -> Cord: self._pte_data = _serialize_pte_binary( program=self.program, extract_delegate_segments=self._extract_delegate_segments, - extract_constant_segment=self._extract_constant_segment, segment_alignment=self._segment_alignment, constant_tensor_alignment=self._constant_tensor_alignment, delegate_alignment=self._delegate_alignment, @@ -1351,7 +1347,6 @@ def __init__( program=self._emitter_output.program, mutable_data=self._emitter_output.mutable_data, extract_delegate_segments=backend_config.extract_delegate_segments, - extract_constant_segment=backend_config.extract_constant_segment, segment_alignment=backend_config.segment_alignment, 
constant_tensor_alignment=backend_config.constant_tensor_alignment, delegate_alignment=backend_config.delegate_alignment, diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 6827ae7904..74f9896000 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -39,6 +39,7 @@ list( extension_data_loader extension_module extension_runner_util + extension_tensor extension_threadpool fbjni ) diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK index 7afd9f8a94..7cdf8ef7ec 100644 --- a/extension/android/jni/BUCK +++ b/extension/android/jni/BUCK @@ -40,7 +40,7 @@ fb_android_cxx_library( "//third-party/glog:glog", "//xplat/executorch/extension/module:module_static", "//xplat/executorch/extension/runner_util:inputs_static", - "//xplat/executorch/extension/runner_util:managed_tensor_static", + "//xplat/executorch/extension/tensor:tensor_static", ], ) @@ -64,7 +64,7 @@ fb_android_cxx_library( "//xplat/executorch/backends/xnnpack:xnnpack_backend_static", "//xplat/executorch/extension/module:module_static", "//xplat/executorch/extension/runner_util:inputs_static", - "//xplat/executorch/extension/runner_util:managed_tensor_static", + "//xplat/executorch/extension/tensor:tensor_static", ], ) diff --git a/extension/android/jni/jni_layer.cpp b/extension/android/jni/jni_layer.cpp index 79c6ebc516..ef74d6480b 100644 --- a/extension/android/jni/jni_layer.cpp +++ b/extension/android/jni/jni_layer.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include @@ -55,6 +55,7 @@ void et_pal_emit_log_message( } #endif +using namespace executorch::extension; using namespace torch::executor; namespace executorch::extension { @@ -167,7 +168,7 @@ class JEValue : public facebook::jni::JavaClass { evalue.tag); } - static ManagedTensor JEValueToTensorImpl( + static TensorPtr JEValueToTensorImpl( facebook::jni::alias_ref JEValue) { static const auto typeCodeField = 
JEValue::javaClassStatic()->getField("mTypeCode"); @@ -221,7 +222,7 @@ class JEValue : public facebook::jni::JavaClass { numel, dataCapacity); } - return ManagedTensor( + return from_blob( jni->GetDirectBufferAddress(jbuffer.get()), shape_vec, scalar_type); } facebook::jni::throwNewJavaException( @@ -293,9 +294,8 @@ class ExecuTorchJni : public facebook::jni::HybridClass { facebook::jni::alias_ref< facebook::jni::JArrayClass::javaobject> jinputs) { - std::vector evalues = {}; - - std::vector managed_tensors = {}; + std::vector evalues; + std::vector tensors; static const auto typeCodeField = JEValue::javaClassStatic()->getField("mTypeCode"); @@ -304,18 +304,17 @@ class ExecuTorchJni : public facebook::jni::HybridClass { auto jevalue = jinputs->getElement(i); const auto typeCode = jevalue->getFieldValue(typeCodeField); if (typeCode == JEValue::kTypeCodeTensor) { - managed_tensors.emplace_back(JEValue::JEValueToTensorImpl(jevalue)); - evalues.emplace_back( - EValue(managed_tensors.back().get_aliasing_tensor())); + tensors.emplace_back(JEValue::JEValueToTensorImpl(jevalue)); + evalues.emplace_back(tensors.back()); } else if (typeCode == JEValue::kTypeCodeInt) { int64_t value = jevalue->getFieldValue(typeCodeField); - evalues.emplace_back(EValue(value)); + evalues.emplace_back(value); } else if (typeCode == JEValue::kTypeCodeDouble) { double value = jevalue->getFieldValue(typeCodeField); - evalues.emplace_back(EValue(value)); + evalues.emplace_back(value); } else if (typeCode == JEValue::kTypeCodeBool) { bool value = jevalue->getFieldValue(typeCodeField); - evalues.emplace_back(EValue(value)); + evalues.emplace_back(value); } } diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index dda9ece589..0d43317c3c 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -150,6 +150,7 @@ class ExecuTorchLlamaJni jint channels, facebook::jni::alias_ref prompt, jint seq_len, + 
jboolean echo, facebook::jni::alias_ref callback) { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { auto image_size = image->size(); @@ -175,11 +176,92 @@ class ExecuTorchLlamaJni prompt->toStdString(), seq_len, [callback](std::string result) { callback->onResult(result); }, - [callback](const Stats& result) { callback->onStats(result); }); + [callback](const Stats& result) { callback->onStats(result); }, + echo); } return 0; } + // Returns a tuple of (error, start_pos) + // Contract is valid within an AAR (JNI + corresponding Java code) + // If the first element is not Error::Ok, the other element is undefined. + facebook::jni::local_ref prefill_prompt( + facebook::jni::alias_ref prompt, + jlong start_pos, + jint bos, + jint eos) { + facebook::jni::local_ref tuple_result = + facebook::jni::make_long_array(2); + if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { + tuple_result->pin()[0] = static_cast(Error::NotSupported); + return tuple_result; + } + + auto&& result = multi_modal_runner_->prefill_prompt( + prompt->toStdString(), start_pos, bos, eos); + tuple_result->pin()[0] = static_cast(Error::Ok); + if (result.ok()) { + tuple_result->pin()[1] = static_cast(start_pos); + } + return tuple_result; + } + + // Returns a tuple of (error, start_pos) + // Contract is valid within an AAR (JNI + corresponding Java code) + // If the first element is not Error::Ok, the other element is undefined. 
+ + facebook::jni::local_ref prefill_images( + facebook::jni::alias_ref image, + jint width, + jint height, + jint channels, + jlong start_pos) { + facebook::jni::local_ref tuple_result = + facebook::jni::make_long_array(2); + + if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { + tuple_result->pin()[0] = static_cast(Error::NotSupported); + return tuple_result; + } + + auto image_size = image->size(); + std::vector images; + if (image_size != 0) { + std::vector image_data_jint(image_size); + std::vector image_data(image_size); + image->getRegion(0, image_size, image_data_jint.data()); + for (int i = 0; i < image_size; i++) { + image_data[i] = image_data_jint[i]; + } + Image image_runner{image_data, width, height, channels}; + images.push_back(image_runner); + } + // TODO(hsz): make start_pos a reference and update it here + jint result = static_cast( + multi_modal_runner_->prefill_images(images, start_pos)); + tuple_result->pin()[0] = result; + tuple_result->pin()[1] = static_cast(start_pos); + return tuple_result; + } + + jint generate_from_pos( + facebook::jni::alias_ref prompt, + jint seq_len, + jlong start_pos, + facebook::jni::alias_ref callback) { + if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { + return static_cast(Error::NotSupported); + } + return static_cast(multi_modal_runner_->generate_from_pos( + prompt->toStdString(), + seq_len, + start_pos, + [callback](const std::string& result) { callback->onResult(result); }, + [callback](const ::executorch::extension::llm::Stats& stats) { + callback->onStats(stats); + })); + } + void stop() { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { multi_modal_runner_->stop(); diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java index bdc8506aa9..c4de23df0e 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java +++ 
b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java @@ -33,6 +33,7 @@ public class LlamaModule { private final HybridData mHybridData; private static final int DEFAULT_SEQ_LEN = 128; + private static final boolean DEFAULT_ECHO = true; @DoNotStrip private static native HybridData initHybrid( @@ -59,7 +60,7 @@ public void resetNative() { * @param llamaCallback callback object to receive results. */ public int generate(String prompt, LlamaCallback llamaCallback) { - return generate(prompt, DEFAULT_SEQ_LEN, llamaCallback); + return generate(prompt, DEFAULT_SEQ_LEN, DEFAULT_ECHO, llamaCallback); } /** @@ -70,7 +71,30 @@ public int generate(String prompt, LlamaCallback llamaCallback) { * @param llamaCallback callback object to receive results. */ public int generate(String prompt, int seqLen, LlamaCallback llamaCallback) { - return generate(null, 0, 0, 0, prompt, seqLen, llamaCallback); + return generate(null, 0, 0, 0, prompt, seqLen, DEFAULT_ECHO, llamaCallback); + } + + /** + * Start generating tokens from the module. + * + * @param prompt Input prompt + * @param echo indicate whether to echo the input prompt or not (text completion vs chat) + * @param llamaCallback callback object to receive results. + */ + public int generate(String prompt, boolean echo, LlamaCallback llamaCallback) { + return generate(null, 0, 0, 0, prompt, DEFAULT_SEQ_LEN, echo, llamaCallback); + } + + /** + * Start generating tokens from the module. + * + * @param prompt Input prompt + * @param seqLen sequence length + * @param echo indicate whether to echo the input prompt or not (text completion vs chat) + * @param llamaCallback callback object to receive results. 
+ */ + public int generate(String prompt, int seqLen, boolean echo, LlamaCallback llamaCallback) { + return generate(null, 0, 0, 0, prompt, seqLen, echo, llamaCallback); } /** @@ -82,6 +106,7 @@ public int generate(String prompt, int seqLen, LlamaCallback llamaCallback) { * @param channels Input image number of channels * @param prompt Input prompt * @param seqLen sequence length + * @param echo indicate whether to echo the input prompt or not (text completion vs chat) * @param llamaCallback callback object to receive results. */ @DoNotStrip @@ -92,8 +117,66 @@ public native int generate( int channels, String prompt, int seqLen, + boolean echo, LlamaCallback llamaCallback); + /** + * Prefill an LLaVA Module with the given images input. + * + * @param image Input image as a byte array + * @param width Input image width + * @param height Input image height + * @param channels Input image number of channels + * @param startPos The starting position in KV cache of the input in the LLM. + * @return The updated starting position in KV cache of the input in the LLM. + * @throws RuntimeException if the prefill failed + */ + public long prefillImages(int[] image, int width, int height, int channels, long startPos) { + long[] nativeResult = prefillImagesNative(image, width, height, channels, startPos); + if (nativeResult[0] != 0) { + throw new RuntimeException("Prefill failed with error code: " + nativeResult[0]); + } + return nativeResult[1]; + } + + // returns a tuple of (status, updated startPos) + private native long[] prefillImagesNative( + int[] image, int width, int height, int channels, long startPos); + + /** + * Prefill an LLaVA Module with the given text input. + * + * @param prompt The text prompt to LLaVA. + * @param startPos The starting position in KV cache of the input in the LLM. It's passed as + * reference and will be updated inside this function. + * @param bos The number of BOS (begin of sequence) token. 
+ * @param eos The number of EOS (end of sequence) token. + * @return The updated starting position in KV cache of the input in the LLM. + * @throws RuntimeException if the prefill failed + */ + public long prefillPrompt(String prompt, long startPos, int bos, int eos) { + long[] nativeResult = prefillPromptNative(prompt, startPos, bos, eos); + if (nativeResult[0] != 0) { + throw new RuntimeException("Prefill failed with error code: " + nativeResult[0]); + } + return nativeResult[1]; + } + + // returns a tuple of (status, updated startPos) + private native long[] prefillPromptNative(String prompt, long startPos, int bos, int eos); + + /** + * Generate tokens from the given prompt, starting from the given position. + * + * @param prompt The text prompt to LLaVA. + * @param seqLen The total sequence length, including the prompt tokens and new tokens. + * @param startPos The starting position in KV cache of the input in the LLM. + * @param llamaCallback callback object to receive results. + * @return The error code. + */ + public native int generateFromPos( + String prompt, int seqLen, long startPos, LlamaCallback callback); + /** Stop current generate() before it finishes. */ @DoNotStrip public native void stop(); diff --git a/extension/apple/Benchmark/App/App.entitlements b/extension/apple/Benchmark/App/App.entitlements new file mode 100644 index 0000000000..e461e7f22f --- /dev/null +++ b/extension/apple/Benchmark/App/App.entitlements @@ -0,0 +1,12 @@ + + + + + com.apple.security.app-sandbox + + com.apple.security.files.user-selected.read-only + + com.apple.developer.kernel.increased-memory-limit + + + diff --git a/extension/apple/Benchmark/App/App.swift b/extension/apple/Benchmark/App/App.swift new file mode 100644 index 0000000000..30fbd221dc --- /dev/null +++ b/extension/apple/Benchmark/App/App.swift @@ -0,0 +1,16 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import SwiftUI + +@main +struct BenchmarkApp: App { + var body: some Scene { + WindowGroup {} + } +} diff --git a/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj new file mode 100644 index 0000000000..4dcffaffbf --- /dev/null +++ b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -0,0 +1,535 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + 03B2D3682C8A515A0046936E /* App.swift in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3672C8A515A0046936E /* App.swift */; }; + 03B2D37A2C8A515C0046936E /* Tests.mm in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3792C8A515C0046936E /* Tests.mm */; }; + 03C7FA382C8AA3EC00E6E9AE /* Models in Resources */ = {isa = PBXBuildFile; fileRef = 03C7FA322C8AA24200E6E9AE /* Models */; }; + 03ED6CFF2C8AAFB300F2D6EE /* backend_coreml.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */; }; + 03ED6D012C8AAFB300F2D6EE /* backend_mps.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */; }; + 03ED6D032C8AAFB300F2D6EE /* backend_xnnpack.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */; }; + 03ED6D052C8AAFB300F2D6EE /* executorch.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */; }; + 03ED6D072C8AAFB300F2D6EE /* kernels_custom.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */; }; + 03ED6D092C8AAFB300F2D6EE /* kernels_optimized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 
03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */; }; + 03ED6D0B2C8AAFB300F2D6EE /* kernels_portable.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */; }; + 03ED6D0D2C8AAFB300F2D6EE /* kernels_quantized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */; }; + 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */; }; + 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */; }; + 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */; }; + 03ED6D152C8AAFFF00F2D6EE /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D142C8AAFFF00F2D6EE /* Metal.framework */; }; + 03ED6D172C8AB00500F2D6EE /* CoreML.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D162C8AB00500F2D6EE /* CoreML.framework */; }; + 03ED6D192C8AB00A00F2D6EE /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D182C8AB00A00F2D6EE /* Accelerate.framework */; }; +/* End PBXBuildFile section */ + +/* Begin PBXContainerItemProxy section */ + 03B2D3762C8A515C0046936E /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 03B2D35C2C8A515A0046936E /* Project object */; + proxyType = 1; + remoteGlobalIDString = 03B2D3632C8A515A0046936E; + remoteInfo = Benchmark; + }; +/* End PBXContainerItemProxy section */ + +/* Begin PBXFileReference section */ + 037C96A02C8A570B00B3DF38 /* Tests.xctestplan */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = Tests.xctestplan; sourceTree = ""; }; + 03B019502C8A80D30044D558 
/* Tests.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = Tests.xcconfig; sourceTree = ""; }; + 03B2D3642C8A515A0046936E /* Benchmark.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Benchmark.app; sourceTree = BUILT_PRODUCTS_DIR; }; + 03B2D3672C8A515A0046936E /* App.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = App.swift; sourceTree = ""; }; + 03B2D36D2C8A515B0046936E /* App.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = App.entitlements; sourceTree = ""; }; + 03B2D3752C8A515C0046936E /* Tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = Tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; + 03B2D3792C8A515C0046936E /* Tests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = Tests.mm; sourceTree = ""; }; + 03C7FA322C8AA24200E6E9AE /* Models */ = {isa = PBXFileReference; lastKnownFileType = folder; path = Models; sourceTree = SOURCE_ROOT; }; + 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_coreml.xcframework; path = Frameworks/backend_coreml.xcframework; sourceTree = ""; }; + 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_mps.xcframework; path = Frameworks/backend_mps.xcframework; sourceTree = ""; }; + 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_xnnpack.xcframework; path = Frameworks/backend_xnnpack.xcframework; sourceTree = ""; }; + 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = executorch.xcframework; path = 
Frameworks/executorch.xcframework; sourceTree = ""; }; + 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_custom.xcframework; path = Frameworks/kernels_custom.xcframework; sourceTree = ""; }; + 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_optimized.xcframework; path = Frameworks/kernels_optimized.xcframework; sourceTree = ""; }; + 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_portable.xcframework; path = Frameworks/kernels_portable.xcframework; sourceTree = ""; }; + 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_quantized.xcframework; path = Frameworks/kernels_quantized.xcframework; sourceTree = ""; }; + 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libsqlite3.0.tbd; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/usr/lib/libsqlite3.0.tbd; sourceTree = DEVELOPER_DIR; }; + 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShadersGraph.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShadersGraph.framework; sourceTree = DEVELOPER_DIR; }; + 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShaders.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShaders.framework; sourceTree = DEVELOPER_DIR; }; + 03ED6D142C8AAFFF00F2D6EE /* 
Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/Metal.framework; sourceTree = DEVELOPER_DIR; }; + 03ED6D162C8AB00500F2D6EE /* CoreML.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreML.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/CoreML.framework; sourceTree = DEVELOPER_DIR; }; + 03ED6D182C8AB00A00F2D6EE /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/Accelerate.framework; sourceTree = DEVELOPER_DIR; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 03B2D3612C8A515A0046936E /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 03B2D3722C8A515C0046936E /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 03ED6D192C8AB00A00F2D6EE /* Accelerate.framework in Frameworks */, + 03ED6D172C8AB00500F2D6EE /* CoreML.framework in Frameworks */, + 03ED6D152C8AAFFF00F2D6EE /* Metal.framework in Frameworks */, + 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */, + 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */, + 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */, + 03ED6CFF2C8AAFB300F2D6EE /* backend_coreml.xcframework in Frameworks */, + 03ED6D032C8AAFB300F2D6EE /* backend_xnnpack.xcframework in Frameworks */, + 03ED6D092C8AAFB300F2D6EE /* kernels_optimized.xcframework in Frameworks */, + 03ED6D012C8AAFB300F2D6EE /* backend_mps.xcframework in Frameworks */, + 03ED6D0D2C8AAFB300F2D6EE /* 
kernels_quantized.xcframework in Frameworks */, + 03ED6D0B2C8AAFB300F2D6EE /* kernels_portable.xcframework in Frameworks */, + 03ED6D052C8AAFB300F2D6EE /* executorch.xcframework in Frameworks */, + 03ED6D072C8AAFB300F2D6EE /* kernels_custom.xcframework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 03B2D35B2C8A515A0046936E = { + isa = PBXGroup; + children = ( + 03B2D3662C8A515A0046936E /* App */, + 03ED6CEB2C8AAF5300F2D6EE /* Frameworks */, + 03C7FA322C8AA24200E6E9AE /* Models */, + 03B2D3782C8A515C0046936E /* Tests */, + 03B2D3652C8A515A0046936E /* Products */, + ); + sourceTree = ""; + }; + 03B2D3652C8A515A0046936E /* Products */ = { + isa = PBXGroup; + children = ( + 03B2D3642C8A515A0046936E /* Benchmark.app */, + 03B2D3752C8A515C0046936E /* Tests.xctest */, + ); + name = Products; + sourceTree = ""; + }; + 03B2D3662C8A515A0046936E /* App */ = { + isa = PBXGroup; + children = ( + 03B2D3672C8A515A0046936E /* App.swift */, + 03B2D36D2C8A515B0046936E /* App.entitlements */, + ); + path = App; + sourceTree = SOURCE_ROOT; + }; + 03B2D3782C8A515C0046936E /* Tests */ = { + isa = PBXGroup; + children = ( + 03B2D3792C8A515C0046936E /* Tests.mm */, + 03B019502C8A80D30044D558 /* Tests.xcconfig */, + 037C96A02C8A570B00B3DF38 /* Tests.xctestplan */, + ); + path = Tests; + sourceTree = SOURCE_ROOT; + }; + 03ED6CEB2C8AAF5300F2D6EE /* Frameworks */ = { + isa = PBXGroup; + children = ( + 03ED6D182C8AB00A00F2D6EE /* Accelerate.framework */, + 03ED6D162C8AB00500F2D6EE /* CoreML.framework */, + 03ED6D142C8AAFFF00F2D6EE /* Metal.framework */, + 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */, + 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */, + 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */, + 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */, + 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */, + 03ED6D022C8AAFB300F2D6EE /* 
backend_xnnpack.xcframework */, + 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */, + 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */, + 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */, + 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */, + 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */, + ); + name = Frameworks; + sourceTree = SOURCE_ROOT; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + 03B2D3632C8A515A0046936E /* App */ = { + isa = PBXNativeTarget; + buildConfigurationList = 03B2D3892C8A515C0046936E /* Build configuration list for PBXNativeTarget "App" */; + buildPhases = ( + 03B2D3602C8A515A0046936E /* Sources */, + 03B2D3612C8A515A0046936E /* Frameworks */, + 03B2D3622C8A515A0046936E /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = App; + productName = Benchmark; + productReference = 03B2D3642C8A515A0046936E /* Benchmark.app */; + productType = "com.apple.product-type.application"; + }; + 03B2D3742C8A515C0046936E /* Tests */ = { + isa = PBXNativeTarget; + buildConfigurationList = 03B2D38C2C8A515C0046936E /* Build configuration list for PBXNativeTarget "Tests" */; + buildPhases = ( + 03B2D3712C8A515C0046936E /* Sources */, + 03B2D3722C8A515C0046936E /* Frameworks */, + 03B2D3732C8A515C0046936E /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + 03B2D3772C8A515C0046936E /* PBXTargetDependency */, + ); + name = Tests; + productName = BenchmarkTests; + productReference = 03B2D3752C8A515C0046936E /* Tests.xctest */; + productType = "com.apple.product-type.bundle.unit-test"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 03B2D35C2C8A515A0046936E /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1540; + LastUpgradeCheck = 1540; + TargetAttributes = { + 03B2D3632C8A515A0046936E = { + CreatedOnToolsVersion = 15.4; + }; + 03B2D3742C8A515C0046936E = 
{ + CreatedOnToolsVersion = 15.4; + TestTargetID = 03B2D3632C8A515A0046936E; + }; + }; + }; + buildConfigurationList = 03B2D35F2C8A515A0046936E /* Build configuration list for PBXProject "Benchmark" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = 03B2D35B2C8A515A0046936E; + productRefGroup = 03B2D3652C8A515A0046936E /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 03B2D3632C8A515A0046936E /* App */, + 03B2D3742C8A515C0046936E /* Tests */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + 03B2D3622C8A515A0046936E /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 03B2D3732C8A515C0046936E /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 03C7FA382C8AA3EC00E6E9AE /* Models in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 03B2D3602C8A515A0046936E /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 03B2D3682C8A515A0046936E /* App.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 03B2D3712C8A515C0046936E /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 03B2D37A2C8A515C0046936E /* Tests.mm in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin PBXTargetDependency section */ + 03B2D3772C8A515C0046936E /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 03B2D3632C8A515A0046936E /* App */; + targetProxy = 03B2D3762C8A515C0046936E /* PBXContainerItemProxy */; + }; +/* End PBXTargetDependency section */ + +/* Begin XCBuildConfiguration section */ + 03B2D3872C8A515C0046936E 
/* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG 
$(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + 03B2D3882C8A515C0046936E /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SWIFT_COMPILATION_MODE = wholemodule; + }; + name = Release; + }; + 03B2D38A2C8A515C0046936E /* 
Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + CODE_SIGN_ENTITLEMENTS = App/App.entitlements; + "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + "INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UILaunchScreen_Generation[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UILaunchScreen_Generation[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UIStatusBarStyle[sdk=iphoneos*]" = UIStatusBarStyleDefault; + "INFOPLIST_KEY_UIStatusBarStyle[sdk=iphonesimulator*]" = UIStatusBarStyleDefault; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + LD_RUNPATH_SEARCH_PATHS = "@executable_path/Frameworks"; + "LD_RUNPATH_SEARCH_PATHS[sdk=macosx*]" = "@executable_path/../Frameworks"; + MACOSX_DEPLOYMENT_TARGET = 11.0; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.Benchmark; + PRODUCT_NAME = Benchmark; + SDKROOT = auto; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + 03B2D38B2C8A515C0046936E /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + CODE_SIGN_ENTITLEMENTS = App/App.entitlements; + "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + "INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphonesimulator*]" = YES; + 
"INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UILaunchScreen_Generation[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UILaunchScreen_Generation[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UIStatusBarStyle[sdk=iphoneos*]" = UIStatusBarStyleDefault; + "INFOPLIST_KEY_UIStatusBarStyle[sdk=iphonesimulator*]" = UIStatusBarStyleDefault; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + LD_RUNPATH_SEARCH_PATHS = "@executable_path/Frameworks"; + "LD_RUNPATH_SEARCH_PATHS[sdk=macosx*]" = "@executable_path/../Frameworks"; + MACOSX_DEPLOYMENT_TARGET = 11.0; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.Benchmark; + PRODUCT_NAME = Benchmark; + SDKROOT = auto; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; + 03B2D38D2C8A515C0046936E /* Debug */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 03B019502C8A80D30044D558 /* Tests.xcconfig */; + buildSettings = { + ALWAYS_EMBED_SWIFT_STANDARD_LIBRARIES = YES; + BUNDLE_LOADER = "$(TEST_HOST)"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + GENERATE_INFOPLIST_FILE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + MACOSX_DEPLOYMENT_TARGET = 10.15; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.BenchmarkTests; + PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = auto; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SWIFT_EMIT_LOC_STRINGS = NO; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/Benchmark.app/$(BUNDLE_EXECUTABLE_FOLDER_PATH)/Benchmark"; + }; + name = Debug; + }; + 03B2D38E2C8A515C0046936E /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 03B019502C8A80D30044D558 /* Tests.xcconfig 
*/; + buildSettings = { + ALWAYS_EMBED_SWIFT_STANDARD_LIBRARIES = YES; + BUNDLE_LOADER = "$(TEST_HOST)"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + GENERATE_INFOPLIST_FILE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + MACOSX_DEPLOYMENT_TARGET = 10.15; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.BenchmarkTests; + PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = auto; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SWIFT_EMIT_LOC_STRINGS = NO; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/Benchmark.app/$(BUNDLE_EXECUTABLE_FOLDER_PATH)/Benchmark"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 03B2D35F2C8A515A0046936E /* Build configuration list for PBXProject "Benchmark" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 03B2D3872C8A515C0046936E /* Debug */, + 03B2D3882C8A515C0046936E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 03B2D3892C8A515C0046936E /* Build configuration list for PBXNativeTarget "App" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 03B2D38A2C8A515C0046936E /* Debug */, + 03B2D38B2C8A515C0046936E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 03B2D38C2C8A515C0046936E /* Build configuration list for PBXNativeTarget "Tests" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 03B2D38D2C8A515C0046936E /* Debug */, + 03B2D38E2C8A515C0046936E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 03B2D35C2C8A515A0046936E /* Project object */; +} diff --git a/extension/apple/Benchmark/Benchmark.xcodeproj/xcshareddata/xcschemes/Benchmark.xcscheme 
b/extension/apple/Benchmark/Benchmark.xcodeproj/xcshareddata/xcschemes/Benchmark.xcscheme new file mode 100644 index 0000000000..ebfe1e5fd3 --- /dev/null +++ b/extension/apple/Benchmark/Benchmark.xcodeproj/xcshareddata/xcschemes/Benchmark.xcscheme @@ -0,0 +1,107 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/extension/apple/Benchmark/Tests/Tests.mm b/extension/apple/Benchmark/Tests/Tests.mm new file mode 100644 index 0000000000..5cf958765d --- /dev/null +++ b/extension/apple/Benchmark/Tests/Tests.mm @@ -0,0 +1,105 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import + +#import + +#import +#import + +using namespace ::executorch::extension; +using namespace ::executorch::runtime; + +@interface Tests : XCTestCase +@end + +@implementation Tests + ++ (void)initialize { + if (self == [Tests class]) { + NSString *modelsDir = [[NSBundle bundleForClass:[self class]].resourcePath + stringByAppendingPathComponent:@"Models"]; + NSArray *models = + [NSFileManager.defaultManager contentsOfDirectoryAtPath:modelsDir + error:nil]; + for (NSString *model in models) { + NSString *modelName = model.stringByDeletingPathExtension; + NSString *modelPath = [modelsDir stringByAppendingPathComponent:model]; + XCTAssertGreaterThan(modelPath.length, 0); + + SEL testLoadSelector = NSSelectorFromString( + [NSString stringWithFormat:@"test_load_%@", modelName]); + IMP testLoadImplementation = imp_implementationWithBlock(^(id _self) { + auto __block module = std::make_unique(modelPath.UTF8String); + [_self + measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] + options:XCTMeasureOptions.defaultOptions + block:^{ + XCTAssertEqual(module->load_method("forward"), + Error::Ok); + }]; + }); + class_addMethod( + [self class], testLoadSelector, 
testLoadImplementation, "v@:"); + + SEL testForwardSelector = NSSelectorFromString( + [NSString stringWithFormat:@"test_forward_%@", modelName]); + IMP testForwardImplementation = imp_implementationWithBlock(^(id _self) { + auto __block module = std::make_unique(modelPath.UTF8String); + XCTAssertEqual(module->load_method("forward"), Error::Ok); + + const auto method_meta = module->method_meta("forward"); + XCTAssertEqual(method_meta.error(), Error::Ok); + + const auto num_inputs = method_meta->num_inputs(); + XCTAssertGreaterThan(num_inputs, 0); + + std::vector> buffers; + buffers.reserve(num_inputs); + std::vector tensors; + tensors.reserve(num_inputs); + std::vector __block inputs; + inputs.reserve(num_inputs); + + for (auto index = 0; index < num_inputs; ++index) { + auto input_tag = method_meta->input_tag(index); + XCTAssertEqual(input_tag.error(), Error::Ok); + + switch (*input_tag) { + case Tag::Tensor: { + const auto tensor_meta = method_meta->input_tensor_meta(index); + XCTAssertEqual(tensor_meta.error(), Error::Ok); + + const auto sizes = tensor_meta->sizes(); + buffers.emplace_back(tensor_meta->nbytes(), + 0b01010101); // Set all bytes to be non-zero. 
+ tensors.emplace_back(from_blob(buffers.rbegin()->data(), + {sizes.begin(), sizes.end()}, + tensor_meta->scalar_type())); + inputs.emplace_back(tensors.back()); + } break; + default: + XCTFail("Unsupported tag %i at input %d", *input_tag, index); + } + } + [_self + measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] + options:XCTMeasureOptions.defaultOptions + block:^{ + XCTAssertEqual(module->forward(inputs).error(), + Error::Ok); + }]; + }); + class_addMethod( + [self class], testForwardSelector, testForwardImplementation, "v@:"); + } + } +} + +@end diff --git a/extension/apple/Benchmark/Tests/Tests.xcconfig b/extension/apple/Benchmark/Tests/Tests.xcconfig new file mode 100644 index 0000000000..e8168046c3 --- /dev/null +++ b/extension/apple/Benchmark/Tests/Tests.xcconfig @@ -0,0 +1,26 @@ +OTHER_LDFLAGS[sdk=iphonesimulator*] = $(inherited) \ +-force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-simulator-release.a + +OTHER_LDFLAGS[sdk=iphoneos*] = $(inherited) \ +-force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-ios-release.a + +OTHER_LDFLAGS[sdk=macos*] = $(inherited) \ +-force_load 
$(BUILT_PRODUCTS_DIR)/libexecutorch-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-macos-release.a diff --git a/extension/apple/Benchmark/Tests/Tests.xctestplan b/extension/apple/Benchmark/Tests/Tests.xctestplan new file mode 100644 index 0000000000..025f50f194 --- /dev/null +++ b/extension/apple/Benchmark/Tests/Tests.xctestplan @@ -0,0 +1,28 @@ +{ + "configurations" : [ + { + "id" : "0430A5ED-FD8D-444E-9933-740E01CCD53C", + "name" : "Test Scheme Action", + "options" : { + + } + } + ], + "defaultOptions" : { + "targetForVariableExpansion" : { + "containerPath" : "container:Benchmark.xcodeproj", + "identifier" : "03B2D3632C8A515A0046936E", + "name" : "App" + } + }, + "testTargets" : [ + { + "target" : { + "containerPath" : "container:Benchmark.xcodeproj", + "identifier" : "03B2D3742C8A515C0046936E", + "name" : "Tests" + } + } + ], + "version" : 1 +} diff --git a/extension/aten_util/test/targets.bzl b/extension/aten_util/test/targets.bzl index b724bbce2b..db2247fd60 100644 --- a/extension/aten_util/test/targets.bzl +++ b/extension/aten_util/test/targets.bzl @@ -18,7 +18,6 @@ def define_common_targets(): "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/kernel:operator_registry", "//executorch/extension/aten_util:aten_bridge", - "//executorch/extension/runner_util:managed_tensor", "//executorch/runtime/core/exec_aten/testing_util:tensor_util", ], external_deps = [ diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 4f5bab7bc0..2c2e52c744 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -65,6 +65,7 @@ def 
__init__( dtype, use_kv_cache, example_inputs, + args: Optional[Any] = None, enable_dynamic_shape: bool = False, verbose: bool = False, metadata: Optional[dict] = None, @@ -87,6 +88,7 @@ def __init__( self.output_dir = "." self.dynamic_shapes = dynamic_shapes self._saved_pte_filename = None + self.args = args def set_output_dir(self, output_dir: str) -> "LLMEdgeManager": """ @@ -162,9 +164,20 @@ def capture_pre_autograd_graph(self) -> "LLMEdgeManager": # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up) with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): # pyre-fixme[8] - self.pre_autograd_graph_module = capture_pre_autograd_graph( - self.model, self.example_inputs, dynamic_shapes=dynamic_shape - ) + if hasattr(self.args, "qnn") and self.args.qnn: + # TODO: this is temporary and export_for_training doesn't work with qnn either. We need a + # functional graph. See issue https://github.com/pytorch/executorch/pull/4627 for more details + self.pre_autograd_graph_module = torch.export.export( + self.model, + self.example_inputs, + dynamic_shapes=dynamic_shape, + strict=True, + ).module() + else: + self.pre_autograd_graph_module = capture_pre_autograd_graph( + self.model, self.example_inputs, dynamic_shapes=dynamic_shape + ) + return self def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManager": @@ -210,10 +223,8 @@ def export_to_edge(self) -> "LLMEdgeManager": # 2. 
torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up) with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): if self.pre_autograd_graph_module is None: - # pyre-fixme[8] - self.pre_autograd_graph_module = capture_pre_autograd_graph( - self.model, self.example_inputs, dynamic_shapes=dynamic_shape - ) + # Run capture_pre_autograd_graph if it didn't run + self.capture_pre_autograd_graph() self.edge_manager = export_to_edge( self.pre_autograd_graph_module, # pyre-fixme[6] self.example_inputs, diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index 0d9f7c6cfd..e75d5bef3f 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -140,7 +140,7 @@ def get_qnn_partitioner( return QnnPartitioner( # pyre-fixme[16] generate_qnn_executorch_compiler_spec( # pyre-fixme[16] - soc_model=QcomChipset.SM8450, # default to SM8450 # pyre-fixme[16] + soc_model=QcomChipset.SM8650, # default to SM8650 # pyre-fixme[16] # pyre-fixme[16] backend_options=generate_htp_compiler_spec( use_fp16=use_fp16, diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index 27bc84fe11..a9245768b9 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -43,7 +43,9 @@ target_include_directories( add_library(extension_llm_runner STATIC ${_extension_llm_runner__srcs}) -set(runner_deps executorch extension_module extension_data_loader) +set(runner_deps executorch extension_data_loader extension_module + extension_tensor +) target_link_libraries(extension_llm_runner PUBLIC ${runner_deps}) diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index dbffac46fc..70ecafee81 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -31,7 +31,6 @@ #include #include #include -#include namespace executorch { namespace 
extension { @@ -62,6 +61,50 @@ class MultimodalRunner { std::function token_callback = {}, std::function stats_callback = {}) = 0; + /** + * Prefill an LLaVA Module with the given images input. + * @param images The image input to LLaVA. + * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. + * @return The error status of prefilling images. + */ + virtual runtime::Error prefill_images( + std::vector& images, + int64_t& start_pos) = 0; + + /** + * Prefill an LLaVA Module with the given text input. + * @param prompt The text prompt to LLaVA. + * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. + * @param bos The number of BOS (begin of sequence) token. + * @param eos The number of EOS (end of sequence) token. + * @return The generated token of the LLaVA Module after prefill prompt. + */ + virtual runtime::Result prefill_prompt( + const std::string& prompt, + int64_t& start_pos, + int8_t bos = 0, + int8_t eos = 0) = 0; + + /** + * Generate tokens from the given prompt, starting from the given position. + * @param prompt The text prompt to LLaVA. + * @param seq_len The total sequence length, including the prompt tokens and + * new tokens. + * @param start_pos The starting position in KV cache of the input in the LLM. + * @param token_callback What to do after a token is generated. + * @param stats_callback What to do with Stats. + * @return The error code. 
+ */ + virtual runtime::Error generate_from_pos( + const std::string& prompt, + int32_t seq_len = 1024, + int64_t start_pos = 0, + std::function token_callback = {}, + std::function + stats_callback = {}) = 0; + inline void stop() { text_token_generator_->stop(); } diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl index 4d715980af..f20240956c 100644 --- a/extension/llm/runner/targets.bzl +++ b/extension/llm/runner/targets.bzl @@ -26,7 +26,7 @@ def define_common_targets(): ":stats", "//executorch/extension/llm/sampler:sampler" + aten_suffix, "//executorch/extension/module:module" + aten_suffix, - "//executorch/extension/runner_util:managed_tensor" + aten_suffix, + "//executorch/extension/tensor:tensor" + aten_suffix, ], ) @@ -41,7 +41,7 @@ def define_common_targets(): ":text_decoder_runner" + aten_suffix, "//executorch/extension/llm/tokenizer:tokenizer_header", "//executorch/extension/module:module" + aten_suffix, - "//executorch/extension/runner_util:managed_tensor" + aten_suffix, + "//executorch/extension/tensor:tensor" + aten_suffix, ], ) @@ -55,7 +55,7 @@ def define_common_targets(): ":text_decoder_runner" + aten_suffix, "//executorch/extension/llm/tokenizer:tokenizer_header", "//executorch/extension/module:module" + aten_suffix, - "//executorch/extension/runner_util:managed_tensor" + aten_suffix, + "//executorch/extension/tensor:tensor" + aten_suffix, ], ) diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp index 5b77c69825..928a21244a 100644 --- a/extension/llm/runner/text_decoder_runner.cpp +++ b/extension/llm/runner/text_decoder_runner.cpp @@ -38,14 +38,11 @@ TextDecoderRunner::TextDecoderRunner( // input. It should be safe to call multiple times with the same inputs. The // outer loop (call site) is responsible for managing state. 
::executorch::runtime::Result TextDecoderRunner::step( - ManagedTensor& managed_tokens, - ManagedTensor& managed_start_pos) { - auto tokens = managed_tokens.get_aliasing_tensor(); + TensorPtr& tokens, + TensorPtr& start_pos) { // ET_LOG(Info, "Input token %" PRIu64, input_token); if (use_kv_cache_) { - auto start_pos = managed_start_pos.get_aliasing_tensor(); - ::executorch::runtime::Result> - outputs_res = module_->forward({tokens, start_pos}); + auto outputs_res = module_->forward({*tokens, *start_pos}); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_CHECK_MSG( outputs_res.get().size() == 1, @@ -57,10 +54,9 @@ ::executorch::runtime::Result TextDecoderRunner::step( // Return the logits tensor return outputs_res.get()[0].toTensor(); } else { // no kv cache - (void)managed_start_pos; // unused + (void)start_pos; // unused - ::executorch::runtime::Result> - outputs_res = module_->forward(tokens); + auto outputs_res = module_->forward(tokens); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_CHECK_MSG( outputs_res.get().size() == 1, diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index 70ee1d0136..16adeeed0a 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -12,7 +12,7 @@ #include #include -#include +#include // patternlint-disable-next-line executorch-cpp-nostdinc #include @@ -38,8 +38,8 @@ class TextDecoderRunner { * @return The output of the LLM Module. This will be a tensor of logits. */ virtual ::executorch::runtime::Result step( - ManagedTensor& input, - ManagedTensor& start_pos); + TensorPtr& input, + TensorPtr& start_pos); /** * Load the Module for text decode purpose. 
diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp index 53a737e6af..705583d638 100644 --- a/extension/llm/runner/text_prefiller.cpp +++ b/extension/llm/runner/text_prefiller.cpp @@ -25,7 +25,7 @@ TextPrefiller::TextPrefiller( ::executorch::runtime::Result TextPrefiller::prefill( std::vector& prompt_tokens, - int64_t start_pos) { + int64_t& start_pos) { ET_CHECK_MSG(!prompt_tokens.empty(), "Prompt cannot be null"); if (!text_decoder_runner_->is_method_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load()); @@ -38,54 +38,51 @@ ::executorch::runtime::Result TextPrefiller::prefill( uint64_t cur_token; if (enable_parallel_prefill_ || !use_kv_cache_) { // initialize tensor wrappers - ManagedTensor managed_tokens( + auto tokens = from_blob( prompt_tokens.data(), {1, num_prompt_tokens}, exec_aten::ScalarType::Long); - ManagedTensor managed_start_pos( - &start_pos, {1}, exec_aten::ScalarType::Long); + auto start_pos_tensor = + from_blob(&start_pos, {1}, exec_aten::ScalarType::Long); - ::executorch::runtime::Result outputs_res = - text_decoder_runner_->step(managed_tokens, managed_start_pos); + auto outputs_res = text_decoder_runner_->step(tokens, start_pos_tensor); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_LOG( Info, "Prefill token result numel(): %zu", outputs_res.get().numel()); + start_pos += num_prompt_tokens; cur_token = text_decoder_runner_->logits_to_token(outputs_res.get()); } else { // sequential prefill int64_t pos = 0; // position in the sequence - // token & pos - int64_t pos_data = 0; // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) cur_token = prompt_tokens[0]; // initialize tensor wrappers - ManagedTensor managed_tokens( - &cur_token, {1, 1}, exec_aten::ScalarType::Long); + auto tokens = from_blob(&cur_token, {1, 1}, exec_aten::ScalarType::Long); - ManagedTensor managed_start_pos( - &pos_data, {1}, exec_aten::ScalarType::Long); + auto start_pos_tensor = + from_blob(&start_pos, 
{1}, exec_aten::ScalarType::Long); // run the first token and get back logits tensor. Assuming the first token // is bos so don't callback. - exec_aten::Tensor logits_tensor = ET_UNWRAP( - text_decoder_runner_->step(managed_tokens, managed_start_pos)); + auto logits_tensor = + ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos_tensor)); - pos = 1; // start from index 1 + pos += 1; // start the loop from index 1 + start_pos += 1; while (pos < num_prompt_tokens) { // Run the model - pos_data = start_pos + pos; - // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) cur_token = prompt_tokens[pos]; - logits_tensor = ET_UNWRAP( - text_decoder_runner_->step(managed_tokens, managed_start_pos)); + logits_tensor = + ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos_tensor)); pos++; + start_pos++; } cur_token = text_decoder_runner_->logits_to_token(logits_tensor); diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h index a8ba77b860..0ea126f32d 100644 --- a/extension/llm/runner/text_prefiller.h +++ b/extension/llm/runner/text_prefiller.h @@ -36,7 +36,7 @@ class TextPrefiller { */ ::executorch::runtime::Result prefill( std::vector& prompt_tokens, - int64_t start_pos = 0); + int64_t& start_pos); private: TextDecoderRunner* text_decoder_runner_; diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h index 46d682a4e4..01887e7560 100644 --- a/extension/llm/runner/text_token_generator.h +++ b/extension/llm/runner/text_token_generator.h @@ -12,6 +12,7 @@ #include #include #include +#include namespace executorch { namespace extension { @@ -69,15 +70,18 @@ class TextTokenGenerator { } // initialize tensor wrappers - ManagedTensor tokens_managed( - token_data.data(), token_shape, exec_aten::ScalarType::Long); + auto tokens_managed = from_blob( + token_data.data(), + token_shape, + exec_aten::ScalarType::Long, + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); - ManagedTensor 
start_pos_managed(&pos, {1}, exec_aten::ScalarType::Long); + auto start_pos_managed = from_blob(&pos, {1}, exec_aten::ScalarType::Long); // Generate our tokens while (pos < seq_len - 1) { // Run the model - ::executorch::runtime::Result logits_res = + auto logits_res = text_decoder_runner_->step(tokens_managed, start_pos_managed); ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); @@ -98,7 +102,8 @@ class TextTokenGenerator { } else { // push it to the back token_data.push_back(cur_token); - tokens_managed.resize({1, static_cast(token_data.size())}); + ET_CHECK_OK_OR_RETURN_ERROR(resize_tensor_ptr( + tokens_managed, {1, static_cast(token_data.size())})); } // print the token as string, decode it with the Tokenizer object diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h index baf6af328b..2f1d084811 100644 --- a/extension/llm/runner/util.h +++ b/extension/llm/runner/util.h @@ -10,6 +10,9 @@ #include #include #include +#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__) +#include +#endif namespace executorch { namespace extension { @@ -44,6 +47,27 @@ long inline time_in_ms() { return time.tv_sec * 1000 + time.tv_nsec / 1000000; } +// ---------------------------------------------------------------------------- +// utilities: memory usage + +// Returns the current RSS in bytes. Returns 0 if not supported. +// RSS: Resident Set Size, the amount of memory currently in the RAM for this +// process. These values are approximate, and are only used for logging +// purposes. +size_t inline get_rss_bytes() { +#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__) + struct rusage r_usage; + if (getrusage(RUSAGE_SELF, &r_usage) == 0) { + return r_usage.ru_maxrss * 1024; + } +#endif // __linux__ || __ANDROID__ || __unix__ + // Unsupported platform like Windows, or getrusage() failed. + // __APPLE__ and __MACH__ are not supported because r_usage.ru_maxrss does not + // consistently return kbytes on macOS. 
On older versions of macOS, it + // returns bytes, but on newer versions it returns kbytes. Need to figure out + // when this changed. + return 0; +} } // namespace llm } // namespace extension } // namespace executorch @@ -53,6 +77,7 @@ namespace executor { namespace util { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. +using ::executorch::extension::llm::get_rss_bytes; using ::executorch::extension::llm::safe_printf; using ::executorch::extension::llm::time_in_ms; } // namespace util diff --git a/extension/llm/tokenizer/tiktoken.cpp b/extension/llm/tokenizer/tiktoken.cpp index f8ccf74fd6..f99ac2e955 100644 --- a/extension/llm/tokenizer/tiktoken.cpp +++ b/extension/llm/tokenizer/tiktoken.cpp @@ -266,7 +266,11 @@ Tiktoken::_split_with_allowed_special_token( return std::make_pair(std::nullopt, input); } +#if __cplusplus >= 202002L auto start = input.begin(); +#else + const char* start = input.data(); +#endif std::string special; while (true) { if (!re2::RE2::FindAndConsume(&input, *_special_token_regex, &special)) { @@ -276,9 +280,15 @@ Tiktoken::_split_with_allowed_special_token( if (allowed_special.count(special) == 1) { // Found an allowed special token, split the text with it. 
+#if __cplusplus >= 202002L return std::make_pair( special, re2::StringPiece(start, input.begin() - start - special.size())); +#else + return std::make_pair( + special, + re2::StringPiece(start, (input.data() - start) - special.size())); +#endif } // else try to find the next special token } diff --git a/extension/module/test/module_test.cpp b/extension/module/test/module_test.cpp index 4ef454e1c7..75cead25a7 100644 --- a/extension/module/test/module_test.cpp +++ b/extension/module/test/module_test.cpp @@ -22,7 +22,7 @@ namespace torch::executor { class ModuleTest : public ::testing::Test { protected: static void SetUpTestSuite() { - model_path_ = std::getenv("RESOURCES_PATH") + std::string("/model.pte"); + model_path_ = std::getenv("RESOURCES_PATH") + std::string("/add.pte"); } static std::string model_path_; @@ -95,7 +95,7 @@ TEST_F(ModuleTest, TestMethodMeta) { const auto meta = module.method_meta("forward"); EXPECT_TRUE(meta.ok()); EXPECT_STREQ(meta->name(), "forward"); - EXPECT_EQ(meta->num_inputs(), 1); + EXPECT_EQ(meta->num_inputs(), 2); EXPECT_EQ(*(meta->input_tag(0)), Tag::Tensor); EXPECT_EQ(meta->num_outputs(), 1); EXPECT_EQ(*(meta->output_tag(0)), Tag::Tensor); @@ -103,9 +103,8 @@ TEST_F(ModuleTest, TestMethodMeta) { const auto input_meta = meta->input_tensor_meta(0); EXPECT_TRUE(input_meta.ok()); EXPECT_EQ(input_meta->scalar_type(), ScalarType::Float); - EXPECT_EQ(input_meta->sizes().size(), 2); + EXPECT_EQ(input_meta->sizes().size(), 1); EXPECT_EQ(input_meta->sizes()[0], 1); - EXPECT_EQ(input_meta->sizes()[1], 2); const auto output_meta = meta->output_tensor_meta(0); EXPECT_TRUE(output_meta.ok()); @@ -124,19 +123,22 @@ TEST_F(ModuleTest, TestNonExistentMethodMeta) { TEST_F(ModuleTest, TestExecute) { Module module(model_path_); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", 
Tensor(&tensor)); + const auto result = + module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + EXPECT_TRUE(result.ok()); + EXPECT_TRUE(result.ok()); EXPECT_TRUE(module.is_loaded()); EXPECT_TRUE(module.is_method_loaded("forward")); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestExecutePreload) { @@ -145,17 +147,18 @@ TEST_F(ModuleTest, TestExecutePreload) { const auto error = module.load(); EXPECT_EQ(error, Error::Ok); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", Tensor(&tensor)); + const auto result = + module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestExecutePreload_method) { @@ -164,17 +167,18 @@ TEST_F(ModuleTest, TestExecutePreload_method) { const auto error = module.load_method("forward"); EXPECT_EQ(error, Error::Ok); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", Tensor(&tensor)); + const auto result = + module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestExecutePreloadProgramAndMethod) { @@ -186,17 +190,18 @@ TEST_F(ModuleTest, TestExecutePreloadProgramAndMethod) { const auto load_method_error = module.load_method("forward"); EXPECT_EQ(load_method_error, Error::Ok); - std::array input{1, 2}; - std::array 
sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", Tensor(&tensor)); + const auto result = + module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestExecuteOnNonExistent) { @@ -218,41 +223,42 @@ TEST_F(ModuleTest, TestExecuteOnCurrupted) { TEST_F(ModuleTest, TestGet) { Module module(model_path_); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.get("forward", Tensor(&tensor)); + const auto result = module.get("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestForward) { auto module = std::make_unique(model_path_); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module->forward(Tensor(&tensor)); + + const auto result = module->forward({Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); std::array input2{2, 3}; TensorImpl tensor2( ScalarType::Float, sizes.size(), sizes.data(), input2.data()); - const auto result2 = module->forward(Tensor(&tensor2)); + const auto result2 = module->forward({Tensor(&tensor2), Tensor(&tensor2)}); EXPECT_TRUE(result2.ok()); const auto data2 = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data2[0], 
2.5, 1e-5); + EXPECT_NEAR(data2[0], 4, 1e-5); } TEST_F(ModuleTest, TestForwardWithInvalidInputs) { @@ -303,23 +309,26 @@ TEST_F(ModuleTest, TestProgramSharingAndDataLoaderManagement) { EXPECT_EQ(load_error, Error::Ok); EXPECT_TRUE(module1->is_loaded()); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result1 = module1->execute("forward", Tensor(&tensor)); + auto result1 = + module1->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result1.ok()); auto module2 = std::make_unique(module1->program()); - auto result2 = module2->execute("forward", Tensor(&tensor)); + auto result2 = + module2->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result2.ok()); module1 = std::make_unique("/path/to/nonexistent/file.pte"); EXPECT_FALSE(module1->is_loaded()); - auto result3 = module2->execute("forward", Tensor(&tensor)); + auto result3 = + module2->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result3.ok()); } @@ -351,17 +360,17 @@ TEST_F(ModuleTest, TestProgramPersistenceAndReuseAfterModuleDestruction) { EXPECT_EQ(module.program(), shared_program); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result = module.execute("forward", Tensor(&tensor)); + auto result = module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { @@ -379,24 +388,24 @@ TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { EXPECT_TRUE(program != nullptr); auto thread = [](std::shared_ptr program, - const std::array& input) { + const std::array& input) { 
Module module(program); - std::array sizes{1, 2}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), (void*)input.data()); - const auto result = module.forward(Tensor(&tensor)); + const auto result = module.forward({Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], (input[0] + input[1]) / 2.0, 1e-5); + EXPECT_NEAR(data[0], (input[0] * 2), 1e-5); }; - std::thread t1(thread, program, std::array{1, 2}); - std::thread t2(thread, program, std::array{2, 3}); - std::thread t3(thread, program, std::array{3, 4}); - std::thread t4(thread, program, std::array{4, 5}); - std::thread t5(thread, program, std::array{5, 6}); + std::thread t1(thread, program, std::array{1}); + std::thread t2(thread, program, std::array{2}); + std::thread t3(thread, program, std::array{3}); + std::thread t4(thread, program, std::array{4}); + std::thread t5(thread, program, std::array{5}); t1.join(); t2.join(); diff --git a/extension/module/test/resources/README.md b/extension/module/test/resources/README.md new file mode 100644 index 0000000000..5067c870a3 --- /dev/null +++ b/extension/module/test/resources/README.md @@ -0,0 +1,4 @@ +## Resources + +### model.pte +- generated via `buck2 run fbcode//executorch/examples/portable/scripts:export -- --model_name="add"` after D62209852. 
diff --git a/extension/module/test/resources/add.pte b/extension/module/test/resources/add.pte new file mode 100644 index 0000000000000000000000000000000000000000..43252ca7d3d05e8fe847e122c9c7de976e0e0096 GIT binary patch literal 728 zcmZ`$O-_Sg5Pi1LVna-08q!40d z9v`$+<8G4NtCNznyo*d;{901sofq4 z&95FC(_^3>z=$tz@ie@O=wFQ6?sRR{e3+V%PkTlvcFUN0_=L1XT6i=0w*D?~&6+W_ zm?cw28ad;8ZTa+8zxfcf= X8imU+Ugk*@$4qIuZ+H9Wa$n&GV)QgK literal 0 HcmV?d00001 diff --git a/extension/module/test/resources/model.pte b/extension/module/test/resources/model.pte deleted file mode 100644 index 91b52416847fff9a794db423583a0c8c5a303d66..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1600 zcmbtUF>6y%6h2LpG`67x38h1a$0LYCkyaeiEbSs>a5ICDhBhX*B=CHR$qS80K{`1U zXNL|&931)!#9tu#4;(T$Iyg9V(8lk(C%1VGUBnAt?z!iF=R4<~d-K*rl*siS$iyu-6$x zFl7m-M2@Fz2}U6M0xKZM07ofLj3j#QKR!jPJqb zp3{bj>>9(kS26A**tmi#F5`Gk<{-`|+^qF;hi6cJ8ncUKU0gGsaD)4-k27I)K=^>g zK2QX*z%T5LfHQ!1Tmg0fo?+wC_98|5SnV|5io44@V?6_8?BGnalxusq56&>A^EsxI z2bbDU;A7kd;KsS~65M|ejZZO+Wvt;|^6=v{PA3|yZML2Zjnz9vY)0efwVz2_!%%$- z>=QehftP({e(LcZaGWVG;dp#o?mm=(^DD>p?W5Xh51X#mGJMpeZ?@KqEaUz&_O7v= z;(tu!He&fI_$A@U!DGiYUDifD_Q1XcEI8B0nbC1=*2C}8R5@p3;Z9S7{c-idwDEj} z^p^qlW(%lpu6r;2=tbA~B`+KV!Mf)+I$>mnQOwb52hE-d_xfGafV)~dsv-7rHr{_a zXa`X{?B9z9-N5+Qw0x>t*R0xA{{M~tm->%l=bq+mwj0q*+11*OMWY350{ef^=^fU4 GP5BEPFB_=< diff --git a/extension/runner_util/managed_tensor.h b/extension/runner_util/managed_tensor.h deleted file mode 100644 index 5c74f7550a..0000000000 --- a/extension/runner_util/managed_tensor.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -// @nolint PATTERNLINT Ok to use stdlib for this optional library -#include - -#include -#include -#include -#include - -#ifdef USE_ATEN_LIB -#include -#endif - -namespace executorch { -namespace extension { - -/** - * A tensor wrapper takes ownership of all the memory of the necessary metadata - * for exec_aten::Tensor. Note that it doesn't own the data memory. - */ -class ManagedTensor { - public: - /// The type used for elements of `sizes()`. - using SizesType = exec_aten::SizesType; - /// The type used for elements of `dim_order()`. - using DimOrderType = exec_aten::DimOrderType; - /// The type used for elements of `strides()`. - using StridesType = exec_aten::StridesType; - - ManagedTensor() = delete; - - explicit ManagedTensor( - void* data, - const std::vector& sizes, - exec_aten::ScalarType dtype) - : sizes_(sizes) { -#ifdef USE_ATEN_LIB - tensor_ = torch::from_blob(data, sizes, dtype); -#else - // Calculate strides. - strides_ = std::vector(sizes_.size()); - if (sizes_.size() > 0) { - strides_.back() = 1; - for (size_t i = strides_.size() - 1; i > 0; --i) { - strides_[i - 1] = strides_[i] * sizes_[i]; - } - } - - // Allocate TensorImpl. - tensor_impl_ = std::make_unique( - dtype, - sizes_.size(), - sizes_.data(), - data, - /*dim_order=*/nullptr, - strides_.data(), - executorch::runtime::TensorShapeDynamism::DYNAMIC_BOUND); -#endif - } - - void resize(const std::vector& new_sizes) { - auto err = executorch::runtime::resize_tensor( - this->get_aliasing_tensor(), - exec_aten::ArrayRef(new_sizes.data(), new_sizes.size())); - ET_CHECK(err == executorch::runtime::Error::Ok); - } - - /** - * Get the underlying Tensor object. This is assuming the copying is cheap. 
- */ - exec_aten::Tensor get_aliasing_tensor() { -#ifdef USE_ATEN_LIB - return tensor_; -#else - return exec_aten::Tensor(tensor_impl_.get()); -#endif - } - - private: - std::unique_ptr tensor_impl_; - std::vector sizes_; - std::vector strides_; -#ifdef USE_ATEN_LIB - exec_aten::Tensor tensor_; -#endif -}; - -} // namespace extension -} // namespace executorch - -namespace torch { -namespace executor { -// TODO(T197294990): Remove these deprecated aliases once all users have moved -// to the new `::executorch` namespaces. -using ::executorch::extension::ManagedTensor; -} // namespace executor -} // namespace torch diff --git a/extension/runner_util/targets.bzl b/extension/runner_util/targets.bzl index 43c0ed08f3..bc0fee197d 100644 --- a/extension/runner_util/targets.bzl +++ b/extension/runner_util/targets.bzl @@ -26,18 +26,3 @@ def define_common_targets(): "//executorch/runtime/executor:program" + aten_suffix, ], ) - - runtime.cxx_library( - name = "managed_tensor" + aten_suffix, - exported_headers = [ - "managed_tensor.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/runtime/core/exec_aten:lib" + aten_suffix, - "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, - ], - ) diff --git a/extension/runner_util/test/CMakeLists.txt b/extension/runner_util/test/CMakeLists.txt index 6b295611fd..aefb3b0417 100644 --- a/extension/runner_util/test/CMakeLists.txt +++ b/extension/runner_util/test/CMakeLists.txt @@ -23,7 +23,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_test_srcs inputs_test.cpp managed_tensor_test.cpp) +set(_test_srcs inputs_test.cpp) et_cxx_test( extension_runner_util_test diff --git a/extension/runner_util/test/managed_tensor_test.cpp b/extension/runner_util/test/managed_tensor_test.cpp deleted file mode 100644 index 8ac1285f2b..0000000000 --- a/extension/runner_util/test/managed_tensor_test.cpp +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include - -using namespace ::testing; -using exec_aten::DimOrderType; -using exec_aten::ScalarType; -using exec_aten::SizesType; -using exec_aten::StridesType; -using executorch::extension::ManagedTensor; -using executorch::runtime::ArrayRef; - -class ManagedTensorTest : public ::testing::Test { - protected: - void SetUp() override { - executorch::runtime::runtime_init(); - - data_ = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - sizes_ = {2, 3, 4}; - expected_strides_ = {12, 4, 1}; - managed_tensor_ = - std::make_unique(data_.data(), sizes_, ScalarType::Long); - } - - protected: - std::vector data_; - std::vector sizes_; - std::vector expected_strides_; - std::unique_ptr managed_tensor_; -}; - -TEST_F(ManagedTensorTest, Smoke) { - const auto tensor = managed_tensor_->get_aliasing_tensor(); - - EXPECT_EQ(tensor.sizes(), ArrayRef(sizes_.data(), sizes_.size())); - EXPECT_EQ(tensor.scalar_type(), ScalarType::Long); - EXPECT_EQ(tensor.const_data_ptr(), data_.data()); - for (size_t i = 0; i < expected_strides_.size(); ++i) { - EXPECT_EQ(tensor.strides()[i], expected_strides_[i]); - } -} - -TEST_F(ManagedTensorTest, ResizeWithUpdatedRank) { - // gtest death test doesn't work on iOS: - // https://github.com/google/googletest/issues/2834 -#if !GTEST_OS_IOS - EXPECT_EXIT( - 
managed_tensor_->resize(std::vector{2, 3, 4, 5}), - ::testing::KilledBySignal(SIGABRT), - ""); -#endif -} - -TEST_F(ManagedTensorTest, ResizeShrink) { - managed_tensor_->resize(std::vector{2, 2, 2}); - const auto tensor = managed_tensor_->get_aliasing_tensor(); - - std::vector expected_sizes = {2, 2, 2}; - EXPECT_EQ( - tensor.sizes(), - ArrayRef(expected_sizes.data(), expected_sizes.size())); - EXPECT_EQ(tensor.scalar_type(), ScalarType::Long); - EXPECT_EQ(tensor.const_data_ptr(), data_.data()); -} - -TEST_F(ManagedTensorTest, Resize) { - managed_tensor_->resize(std::vector{4, 3, 2}); - const auto tensor = managed_tensor_->get_aliasing_tensor(); - - std::vector expected_sizes = {4, 3, 2}; - EXPECT_EQ( - tensor.sizes(), - ArrayRef(expected_sizes.data(), expected_sizes.size())); - EXPECT_EQ(tensor.scalar_type(), ScalarType::Long); - EXPECT_EQ(tensor.const_data_ptr(), data_.data()); -} diff --git a/extension/runner_util/test/targets.bzl b/extension/runner_util/test/targets.bzl index 7c042ca9d9..f55a1ea995 100644 --- a/extension/runner_util/test/targets.bzl +++ b/extension/runner_util/test/targets.bzl @@ -30,15 +30,3 @@ def define_common_targets(is_fbcode = False): "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])", }, ) - - runtime.cxx_test( - name = "managed_tensor_test", - srcs = [ - "managed_tensor_test.cpp", - ], - deps = [ - "//executorch/extension/runner_util:managed_tensor", - "//executorch/runtime/core/exec_aten:lib", - "//executorch/runtime/core/exec_aten/util:tensor_util", - ], - ) diff --git a/kernels/README.md b/kernels/README.md index 4e9656e6e9..026778cc28 100644 --- a/kernels/README.md +++ b/kernels/README.md @@ -355,7 +355,7 @@ cmake . 
\ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_VULKAN=OFF \ -DEXECUTORCH_BUILD_XNNPACK=ON \ diff --git a/kernels/optimized/cpu/binary_ops.h b/kernels/optimized/cpu/binary_ops.h index 01f3eed401..6d941509f7 100644 --- a/kernels/optimized/cpu/binary_ops.h +++ b/kernels/optimized/cpu/binary_ops.h @@ -75,7 +75,8 @@ ElementwiseOptimizedPath inline select_optimized_path( ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); - if (a_type != b_type || a_type != out_type || a_type == ScalarType::Half) { + if (a_type != b_type || a_type != out_type || a_type == ScalarType::Half || + a_type == ScalarType::BFloat16) { return ElementwiseOptimizedPath::kNone; } if (a.sizes().equals(b.sizes()) || diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index 3b93870a61..31b0f7754f 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -80,7 +80,8 @@ Tensor& opt_mul_out( ScalarType out_type = out.scalar_type(); if (b.numel() == 1) { - if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) { + if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half && + a_type != ScalarType::BFloat16) { auto error = resize_tensor(out, a.sizes()); ET_KERNEL_CHECK_MSG( ctx, @@ -170,12 +171,12 @@ Tensor& opt_mul_out( InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { using CTYPE_IN = typename torch::executor:: promote_types::type; ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REALHB_TYPES(out_type, ctx, 
"mul.out", CTYPE_OUT, [&]() { + ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { apply_binary_elementwise_fn( [](const CTYPE_A val_a, const CTYPE_B val_b) { CTYPE_IN a_casted = static_cast(val_a); @@ -210,7 +211,7 @@ Tensor& opt_mul_scalar_out( ET_CHECK(common_type == out_type); - if (common_type == ScalarType::Half) { + if (common_type == ScalarType::Half || common_type == ScalarType::BFloat16) { common_type = ScalarType::Float; } @@ -219,7 +220,7 @@ Tensor& opt_mul_scalar_out( ET_CHECK_MSG(error == Error::Ok, "Failed to resize output tensor."); if (a_type == common_type && a_type == out_type && - a_type != ScalarType::Half) { + a_type != ScalarType::Half && a_type != ScalarType::BFloat16) { ET_SWITCH_REALB_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE, [&]() { ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "mul.Scalar_out", CTYPE_B, [&]() { CTYPE_B b_val; @@ -235,11 +236,11 @@ Tensor& opt_mul_scalar_out( }); }); } else { - ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE_A, [&]() { ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "mul.Scalar_out", CTYPE_B, [&]() { ET_SWITCH_REALB_TYPES( common_type, ctx, "mul.Scalar_out", CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES( + ET_SWITCH_REALHBBF16_TYPES( out_type, ctx, "mul.Scalar_out", CTYPE_OUT, [&]() { CTYPE_B b_val; ET_EXTRACT_SCALAR(b, b_val); diff --git a/kernels/portable/cpu/op_masked_fill.cpp b/kernels/portable/cpu/op_masked_fill.cpp index 7a72994b07..e6c0bb4442 100644 --- a/kernels/portable/cpu/op_masked_fill.cpp +++ b/kernels/portable/cpu/op_masked_fill.cpp @@ -39,6 +39,9 @@ Tensor& masked_fill_scalar_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, mask, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES_AND( Bool, in_type, ctx, "masked_fill.Scalar_out", CTYPE, [&]() { ET_SWITCH_REAL_TYPES_AND( diff --git a/kernels/portable/cpu/op_max.cpp b/kernels/portable/cpu/op_max.cpp index 
8f363ced4e..b36cde42e4 100644 --- a/kernels/portable/cpu/op_max.cpp +++ b/kernels/portable/cpu/op_max.cpp @@ -49,6 +49,24 @@ std::tuple max_out( InvalidArgument, (std::tuple({max, max_indices}))); + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, max), + InvalidArgument, + (std::tuple({max, max_indices}))); + + ET_KERNEL_CHECK( + ctx, + tensor_is_default_dim_order(max_indices), + InvalidArgument, + (std::tuple({max, max_indices}))); + + ET_KERNEL_CHECK( + ctx, + tensor_is_default_dim_order(in), + InvalidArgument, + (std::tuple({max, max_indices}))); + dim = dim < 0 ? dim + in.dim() : dim; ET_SWITCH_REAL_TYPES_AND( diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp index 1353479b29..e52a6fd072 100644 --- a/kernels/portable/cpu/op_maximum.cpp +++ b/kernels/portable/cpu/op_maximum.cpp @@ -75,6 +75,9 @@ Tensor& maximum_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp index 79e66c62b5..e930eb6c83 100644 --- a/kernels/portable/cpu/op_mean.cpp +++ b/kernels/portable/cpu/op_mean.cpp @@ -33,6 +33,11 @@ Tensor& mean_dim_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + ET_KERNEL_CHECK( ctx, resize_reduction_out(in, dim_list, keepdim, out) == Error::Ok, diff --git a/kernels/portable/cpu/op_min.cpp b/kernels/portable/cpu/op_min.cpp index 8e3b5a00b3..e4f5e5714f 100644 --- a/kernels/portable/cpu/op_min.cpp +++ b/kernels/portable/cpu/op_min.cpp @@ -49,6 +49,24 @@ std::tuple min_out( InvalidArgument, (std::tuple({min, min_indices}))); + ET_KERNEL_CHECK( + ctx, + 
tensors_have_same_dim_order(in, min), + InvalidArgument, + (std::tuple({min, min_indices}))); + + ET_KERNEL_CHECK( + ctx, + tensor_is_default_dim_order(min_indices), + InvalidArgument, + (std::tuple({min, min_indices}))); + + ET_KERNEL_CHECK( + ctx, + tensor_is_default_dim_order(in), + InvalidArgument, + (std::tuple({min, min_indices}))); + dim = dim < 0 ? dim + in.dim() : dim; ET_SWITCH_REAL_TYPES_AND( diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp index f18d1a6d36..84024beffa 100644 --- a/kernels/portable/cpu/op_minimum.cpp +++ b/kernels/portable/cpu/op_minimum.cpp @@ -75,6 +75,9 @@ Tensor& minimum_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); diff --git a/kernels/portable/cpu/op_mm.cpp b/kernels/portable/cpu/op_mm.cpp index 6903bf3cad..4a6a8f3cfd 100644 --- a/kernels/portable/cpu/op_mm.cpp +++ b/kernels/portable/cpu/op_mm.cpp @@ -29,6 +29,11 @@ mm_out(RuntimeContext& ctx, const Tensor& in, const Tensor& mat2, Tensor& out) { InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, mat2, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + ET_SWITCH_REAL_TYPES_AND(Half, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() { size_t m = in.size(0); size_t n = in.size(1); diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index c933d10d27..8fc4f9d459 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -70,7 +70,14 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { InvalidArgument, out); - ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_realhbbf16_type(out), + 
InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); @@ -79,12 +86,12 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { using CTYPE_IN = typename torch::executor:: promote_types::type; ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { + ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { MulInner< can_cast::value, CTYPE_A, @@ -113,6 +120,9 @@ Tensor& mul_scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); ScalarType a_type = a.scalar_type(); @@ -123,15 +133,15 @@ Tensor& mul_scalar_out( ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); - if (common_type == ScalarType::Half) { + if (common_type == ScalarType::Half || common_type == ScalarType::BFloat16) { common_type = ScalarType::Float; } - ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE_A, [&]() { ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "mul.Scalar_out", CTYPE_B, [&]() { ET_SWITCH_REALB_TYPES( common_type, ctx, "mul.Scalar_out", CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES( + ET_SWITCH_REALHBBF16_TYPES( out_type, ctx, "mul.Scalar_out", CTYPE_OUT, [&]() { CTYPE_B b_val; utils::extract_scalar(b, &b_val); diff --git 
a/kernels/portable/cpu/op_native_batch_norm.cpp b/kernels/portable/cpu/op_native_batch_norm.cpp index 2e613c0a63..fceb8b24d9 100644 --- a/kernels/portable/cpu/op_native_batch_norm.cpp +++ b/kernels/portable/cpu/op_native_batch_norm.cpp @@ -73,6 +73,28 @@ std::tuple _native_batch_norm_legit_no_training_out( InvalidArgument, ret_val); + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, out, mean_out, invstd_out), + InvalidArgument, + ret_val); + + if (weight.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, weight.value()), + InvalidArgument, + ret_val); + } + + if (bias.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, bias.value()), + InvalidArgument, + ret_val); + } + size_t C_dim = in.dim() >= 1 ? 1 : 0; size_t C = in.size(C_dim); size_t outer = getLeadingDims(in, C_dim); diff --git a/kernels/portable/cpu/op_native_group_norm.cpp b/kernels/portable/cpu/op_native_group_norm.cpp index f9213fdeb1..b61f5be676 100644 --- a/kernels/portable/cpu/op_native_group_norm.cpp +++ b/kernels/portable/cpu/op_native_group_norm.cpp @@ -158,6 +158,31 @@ std::tuple native_group_norm_out( InvalidArgument, ret_val); + ET_KERNEL_CHECK( + ctx, tensor_is_default_dim_order(input), InvalidArgument, ret_val); + + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, out, mean_out, rstd_out), + InvalidArgument, + ret_val); + + if (weight.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, weight.value()), + InvalidArgument, + ret_val); + } + + if (bias.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, bias.value()), + InvalidArgument, + ret_val); + } + constexpr auto name = "native_group_norm.out"; ET_SWITCH_FLOAT_TYPES(input.scalar_type(), ctx, name, CTYPE, [&]() { diff --git a/kernels/portable/cpu/op_native_layer_norm.cpp b/kernels/portable/cpu/op_native_layer_norm.cpp index f10acda10e..711c747ca2 100644 --- a/kernels/portable/cpu/op_native_layer_norm.cpp +++ 
b/kernels/portable/cpu/op_native_layer_norm.cpp @@ -117,6 +117,33 @@ std::tuple native_layer_norm_out( InvalidArgument, ret_val); + // Only support default dim order for now. + // TODO: Support other dim orders. + ET_KERNEL_CHECK( + ctx, tensor_is_default_dim_order(input), InvalidArgument, ret_val); + + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, out, mean_out, rstd_out), + InvalidArgument, + ret_val); + + if (weight.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, weight.value()), + InvalidArgument, + ret_val); + } + + if (bias.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, bias.value()), + InvalidArgument, + ret_val); + } + Tensor::SizesType mean_rstd_sizes[kTensorDimensionLimit]; size_t mean_rstd_ndim = 0; get_layer_norm_out_target_size( diff --git a/kernels/portable/cpu/op_ne.cpp b/kernels/portable/cpu/op_ne.cpp index 5601fdafbd..2c25dc7029 100644 --- a/kernels/portable/cpu/op_ne.cpp +++ b/kernels/portable/cpu/op_ne.cpp @@ -30,6 +30,9 @@ Tensor& ne_tensor_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); @@ -75,6 +78,9 @@ Tensor& ne_scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); ScalarType out_type = out.scalar_type(); diff --git a/kernels/portable/cpu/op_neg.cpp b/kernels/portable/cpu/op_neg.cpp index 026d1009c4..b88cdb03a2 100644 --- a/kernels/portable/cpu/op_neg.cpp +++ b/kernels/portable/cpu/op_neg.cpp @@ -30,6 +30,9 @@ Tensor& neg_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { ET_KERNEL_CHECK( ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, 
tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "neg.out", CTYPE, [&] { apply_unary_map_fn( [](const CTYPE val_in) { return static_cast(-val_in); }, diff --git a/kernels/portable/cpu/op_pdist_forward.cpp b/kernels/portable/cpu/op_pdist_forward.cpp index 88b5e88194..9b06b880b6 100644 --- a/kernels/portable/cpu/op_pdist_forward.cpp +++ b/kernels/portable/cpu/op_pdist_forward.cpp @@ -24,6 +24,11 @@ Tensor& _pdist_forward_out( ET_KERNEL_CHECK(ctx, check_pdist_args(in, p, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + Tensor::SizesType target_sizes[kTensorDimensionLimit]; size_t target_ndim = 0; get_pdist_out_target_size(in, target_sizes, &target_ndim); diff --git a/kernels/portable/cpu/op_permute_copy.cpp b/kernels/portable/cpu/op_permute_copy.cpp index e7df5c9657..1362b57c00 100644 --- a/kernels/portable/cpu/op_permute_copy.cpp +++ b/kernels/portable/cpu/op_permute_copy.cpp @@ -46,6 +46,9 @@ Tensor& permute_copy_out( ET_KERNEL_CHECK( ctx, check_permute_copy_args(in, dims, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; size_t expected_out_dim = 0; get_permute_copy_out_target_size( diff --git a/kernels/portable/cpu/op_pixel_shuffle.cpp b/kernels/portable/cpu/op_pixel_shuffle.cpp index 104348f3fe..e1e459b1b2 100644 --- a/kernels/portable/cpu/op_pixel_shuffle.cpp +++ b/kernels/portable/cpu/op_pixel_shuffle.cpp @@ -72,6 +72,10 @@ Tensor& pixel_shuffle_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); Tensor::SizesType expected_out_size[kTensorDimensionLimit]; size_t expected_out_dim = 0; 
get_pixel_shuffle_out_target_size( diff --git a/kernels/portable/cpu/op_to_copy.cpp b/kernels/portable/cpu/op_to_copy.cpp index 7ecd4f3b5e..c0c04e65e9 100644 --- a/kernels/portable/cpu/op_to_copy.cpp +++ b/kernels/portable/cpu/op_to_copy.cpp @@ -46,10 +46,11 @@ Tensor& to_copy_out( InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(self.scalar_type(), ctx, "to_copy", CTYPE_IN, [&] { - ET_SWITCH_REALHB_TYPES(out.scalar_type(), ctx, "to_copy", CTYPE_OUT, [&] { - _to_impl(self, out); - }); + ET_SWITCH_REALHBBF16_TYPES(self.scalar_type(), ctx, "to_copy", CTYPE_IN, [&] { + ET_SWITCH_REALHBBF16_TYPES( + out.scalar_type(), ctx, "to_copy", CTYPE_OUT, [&] { + _to_impl(self, out); + }); }); return out; diff --git a/kernels/portable/cpu/scalar_utils.h b/kernels/portable/cpu/scalar_utils.h index 3daf3e7252..3d6dfb75e4 100644 --- a/kernels/portable/cpu/scalar_utils.h +++ b/kernels/portable/cpu/scalar_utils.h @@ -94,12 +94,6 @@ struct promote_type_with_scalar_type { static_assert( !is_bits_type::value, "promote_type_with_scalar_type not valid for bits dtypes"); - static_assert( - !std::is_same< - T1, - typename ScalarTypeToCppType::type>:: - value, - "promote_type_with_scalar_type not valid for BFloat16"); using promote_type_with_scalar_type_not_respecting_half_to_float = typename std::conditional< is_complex_type::value || @@ -119,10 +113,14 @@ struct promote_type_with_scalar_type { public: using type = typename std::conditional< half_to_float && - std::is_same< - promote_type_with_scalar_type_not_respecting_half_to_float, - typename ScalarTypeToCppType::type>:: - value, + (std::is_same< + promote_type_with_scalar_type_not_respecting_half_to_float, + typename ScalarTypeToCppType< + exec_aten::ScalarType::Half>::type>::value || + std::is_same< + promote_type_with_scalar_type_not_respecting_half_to_float, + typename ScalarTypeToCppType< + exec_aten::ScalarType::BFloat16>::type>::value), typename ScalarTypeToCppType::type, 
promote_type_with_scalar_type_not_respecting_half_to_float>::type; }; diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index 32b69352ef..41a8656f96 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -72,7 +72,7 @@ class OpMulOutTest : public OperatorTest { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_mul_enumerate_out_types(); - ET_FORALL_REAL_TYPES_AND(Half, ENUMERATE_TEST_ENTRY) + ET_FORALL_REALHBF16_TYPES(ENUMERATE_TEST_ENTRY) #undef ENUMERATE_TEST_ENTRY } @@ -89,29 +89,99 @@ class OpMulOutTest : public OperatorTest { // Multiply two tensors op_mul_out( - tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}), tf.ones(sizes), out); - EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8})); + tf.make(sizes, /*data=*/{1.25, 2.5, 4.75, 8.875}), tf.ones(sizes), out); + EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{1.25, 2.5, 4.75, 8.875})); op_mul_out( tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}), tf.zeros(sizes), out); EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{0.0, 0.0, 0.0, 0.0})); op_mul_out( - tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}), - tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}), + tf.make(sizes, /*data=*/{1.25, 2.5, 4.75, 8.875}), + tf.make(sizes, /*data=*/{1.25, 2.5, 4.75, 8.875}), out); EXPECT_TENSOR_CLOSE( - out, tf.make(sizes, /*data=*/{1.21, 4.84, 19.36, 77.44})); + out, tf.make(sizes, /*data=*/{1.5625, 6.25, 22.5625, 78.765625})); } void test_mul_enumerate_a_types() { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_mul_enumerate_b_types(); - ET_FORALL_REAL_TYPES_AND(Half, ENUMERATE_TEST_ENTRY) + ET_FORALL_REALHBF16_TYPES(ENUMERATE_TEST_ENTRY) #undef ENUMERATE_TEST_ENTRY } + + template + void test_optimized_path_ignores_leading_1_dimensions() { + TensorFactory tf; + + const std::vector sizes1 = {1, 1, 2, 2}; + const std::vector sizes2 = {1, 2, 2}; + + // Destination for the mul. 
+ Tensor out = tf.zeros(sizes1); + + // Multiply two tensors + op_mul_out( + tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8}), tf.ones(sizes2), out); + EXPECT_TENSOR_CLOSE(out, tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8})); + } + + template + void test_broadcast_a2b() { + TensorFactory tf_a; + + std::vector> b_sizeses = { + {2}, + {1, 2}, + }; + for (const auto& b_sizes : b_sizeses) { + // a and b of different shapes + Tensor a = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); + Tensor b = tf_a.make(b_sizes, /*data=*/{2, 2}); + + // Destination for output of mul. + Tensor out = tf_a.zeros({2, 2}); + + // Check that it matches the expected output. + EXPECT_TENSOR_CLOSE( + op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); + } + } + + template + void test_broadcast_b2a() { + TensorFactory tf_a; + // a and b of different shapes + Tensor a = tf_a.make({2}, /*data=*/{2, 2}); + Tensor b = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); + + // Destination for output of mul. + Tensor out = tf_a.zeros({2, 2}); + + // Check that it matches the expected output. + EXPECT_TENSOR_CLOSE( + op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); + } + + template + void test_scalar_input_broadcast() { + TensorFactory tf_a; + + // a is a 1d tensor and b is a scalar + Tensor a = tf_a.make({2}, /*data=*/{2, 2}); + Tensor b = tf_a.make({}, /*data=*/{2}); + + // Destination for output of mul. + Tensor out = tf_a.make({2}, /*data=*/{2, 2}); + Tensor expected = tf_a.make({2}, /*data=*/{4, 4}); + + // Check that it matches the expected output. 
+ EXPECT_TENSOR_CLOSE(op_mul_out(a, b, out), expected); + EXPECT_TENSOR_CLOSE(op_mul_out(b, a, out), expected); + } }; class OpMulScalarOutTest : public OperatorTest { @@ -141,6 +211,14 @@ TEST_F(OpMulOutTest, DoubleTensors) { test_floating_point_mul_out(); } +TEST_F(OpMulOutTest, HalfTensors) { + test_floating_point_mul_out(); +} + +TEST_F(OpMulOutTest, BFloat16Tensors) { + test_floating_point_mul_out(); +} + TEST_F(OpMulOutTest, BoolTensors) { TensorFactory tf; @@ -166,18 +244,12 @@ TEST_F(OpMulOutTest, BoolTensors) { } TEST_F(OpMulOutTest, OptimizedPathIgnoresLeading1Dimensions) { - TensorFactory tf; +#define ENUMERATE_TEST_ENTRY(ctype, dtype) \ + test_optimized_path_ignores_leading_1_dimensions(); - const std::vector sizes1 = {1, 1, 2, 2}; - const std::vector sizes2 = {1, 2, 2}; + ET_FORALL_FLOATHBF16_TYPES(ENUMERATE_TEST_ENTRY); - // Destination for the mul. - Tensor out = tf.zeros(sizes1); - - // Multiply two tensors - op_mul_out( - tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8}), tf.ones(sizes2), out); - EXPECT_TENSOR_CLOSE(out, tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8})); +#undef ENUMERATE_TEST_ENTRY } // Mismatched shape tests. @@ -202,40 +274,16 @@ TEST_F(OpMulOutTest, MismatchedNonBroadcastableInputShapesDies) { // Broadcast tensor b's size to tensor a's size TEST_F(OpMulOutTest, BroadcastA2BTest) { - TensorFactory tf_a; - - std::vector> b_sizeses = { - {2}, - {1, 2}, - }; - for (const auto& b_sizes : b_sizeses) { - // a and b of different shapes - Tensor a = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); - Tensor b = tf_a.make(b_sizes, /*data=*/{2, 2}); - - // Destination for output of mul. - Tensor out = tf_a.zeros({2, 2}); - - // Check that it matches the expected output. 
- EXPECT_TENSOR_CLOSE( - op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); - } + test_broadcast_a2b(); + test_broadcast_a2b(); + test_broadcast_a2b(); } // Broadcast tensor a's size to tensor b's size TEST_F(OpMulOutTest, BroadcastB2ATest) { - TensorFactory tf_a; - - // a and b of different shapes - Tensor a = tf_a.make({2}, /*data=*/{2, 2}); - Tensor b = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); - - // Destination for output of mul. - Tensor out = tf_a.zeros({2, 2}); - - // Check that it matches the expected output. - EXPECT_TENSOR_CLOSE( - op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); + test_broadcast_b2a(); + test_broadcast_b2a(); + test_broadcast_b2a(); } // Broadcast tensor a and b's size to a new size c. @@ -256,19 +304,9 @@ TEST_F(OpMulOutTest, BroadcastAB2CTest) { } TEST_F(OpMulOutTest, ScalarInputBroadcastTest) { - TensorFactory tf_a; - - // a is a 1d tensor and b is a scalar - Tensor a = tf_a.make({2}, /*data=*/{2, 2}); - Tensor b = tf_a.make({}, /*data=*/{2}); - - // Destination for output of mul. - Tensor out = tf_a.make({2}, /*data=*/{2, 2}); - Tensor expected = tf_a.make({2}, /*data=*/{4, 4}); - - // Check that it matches the expected output. 
- EXPECT_TENSOR_CLOSE(op_mul_out(a, b, out), expected); - EXPECT_TENSOR_CLOSE(op_mul_out(b, a, out), expected); + test_scalar_input_broadcast(); + test_scalar_input_broadcast(); + test_scalar_input_broadcast(); } TEST_F(OpMulOutTest, MismatchedOutputShapesDies) { diff --git a/kernels/test/op_to_copy_test.cpp b/kernels/test/op_to_copy_test.cpp index 1cc892dedb..0a6529e736 100644 --- a/kernels/test/op_to_copy_test.cpp +++ b/kernels/test/op_to_copy_test.cpp @@ -36,7 +36,9 @@ typedef std::map< std::type_index, std::variant< std::vector, - std::vector>> + std::vector, + std::vector, + std::vector>> FloatingTypeToDataMap; typedef std::map< @@ -309,9 +311,9 @@ TEST_F(OpToTest, AllDtypesSupported) { ScalarType::OUTPUT_DTYPE>(test_cases); #define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ - ET_FORALL_REAL_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); + ET_FORALL_REALHBF16_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); - ET_FORALL_REAL_TYPES(TEST_ENTRY); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); #undef TEST_ENTRY #undef TEST_KERNEL @@ -323,14 +325,14 @@ TEST_F(OpToTest, BoolTests) { #define TEST_TO_BOOL(INPUT_CTYPE, INPUT_DTYPE) \ test_runner_to_bool( \ test_case_to_bool, result_to_bool); - ET_FORALL_REAL_TYPES(TEST_TO_BOOL); + ET_FORALL_REALHBF16_TYPES(TEST_TO_BOOL); std::vector test_case_from_bool = {true, true, false}; std::vector result_from_bool = {1.0, 1.0, 0}; #define TEST_FROM_BOOL(OUTPUT_CTYPE, OUTPUT_DTYPE) \ test_runner_from_bool( \ test_case_from_bool, result_from_bool); - ET_FORALL_REAL_TYPES(TEST_FROM_BOOL); + ET_FORALL_REALHBF16_TYPES(TEST_FROM_BOOL); } TEST_F(OpToTest, NanInfSupported) { @@ -349,9 +351,9 @@ TEST_F(OpToTest, NanInfSupported) { ScalarType::OUTPUT_DTYPE>(test_cases); #define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ - ET_FORALL_FLOAT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); + ET_FORALL_FLOATHBF16_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); - ET_FORALL_FLOAT_TYPES(TEST_ENTRY); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); #undef 
TEST_ENTRY #undef TEST_KERNEL @@ -381,6 +383,13 @@ TEST_F(OpToTest, HardcodeFloatConvertInt) { -0.30919688936285893988}; // clang-format on + std::vector half_data; + std::vector bf16_data; + for (auto d : double_data) { + half_data.emplace_back(d); + bf16_data.emplace_back(d); + } + std::vector int64_data = { -1, -4, 2, -2, 3, 3, -3, -4, 3, 3, 0, 2, 0, -1, 0}; std::vector int32_data = { @@ -394,6 +403,8 @@ TEST_F(OpToTest, HardcodeFloatConvertInt) { FloatingTypeToDataMap floating_point_data; floating_point_data[typeid(float)] = float_data; floating_point_data[typeid(double)] = double_data; + floating_point_data[typeid(exec_aten::Half)] = half_data; + floating_point_data[typeid(exec_aten::BFloat16)] = bf16_data; // Gathering all int data together for better traversial IntTypeToDataMap int_data; @@ -412,7 +423,7 @@ TEST_F(OpToTest, HardcodeFloatConvertInt) { #define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ ET_FORALL_INT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); - ET_FORALL_FLOAT_TYPES(TEST_ENTRY); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); } TEST_F(OpToTest, MismatchedSizesDie) { diff --git a/runtime/core/exec_aten/exec_aten.h b/runtime/core/exec_aten/exec_aten.h index 919b5420b3..808d31502a 100644 --- a/runtime/core/exec_aten/exec_aten.h +++ b/runtime/core/exec_aten/exec_aten.h @@ -17,6 +17,7 @@ #include // @manual #include // @manual #include // @manual +#include // @manual #include // @manual #include // @manual #include // @manual @@ -31,6 +32,7 @@ #else // use executor #include // @manual #include // @manual +#include // @manual #include // @manual #include // @manual #include // @manual diff --git a/runtime/core/exec_aten/testing_util/tensor_util.cpp b/runtime/core/exec_aten/testing_util/tensor_util.cpp index 03dffd208f..0301cc9a51 100644 --- a/runtime/core/exec_aten/testing_util/tensor_util.cpp +++ b/runtime/core/exec_aten/testing_util/tensor_util.cpp @@ -16,6 +16,8 @@ #include #include +using exec_aten::BFloat16; +using exec_aten::Half; using 
exec_aten::ScalarType; using exec_aten::Tensor; @@ -32,9 +34,7 @@ namespace { * T must be a floating point type. Non-floating point data should be compared * directly. */ -template < - typename T, - typename = std::enable_if_t::value>> +template bool data_is_close( const T* a, const T* b, @@ -119,6 +119,20 @@ bool tensors_are_close( a.numel(), rtol, atol); + } else if (a.scalar_type() == ScalarType::Half) { + return data_is_close( + a.const_data_ptr(), + b.const_data_ptr(), + a.numel(), + rtol, + atol); + } else if (a.scalar_type() == ScalarType::BFloat16) { + return data_is_close( + a.const_data_ptr(), + b.const_data_ptr(), + a.numel(), + rtol, + atol); } else { // Non-floating-point types can be compared bitwise. return memcmp(a.const_data_ptr(), b.const_data_ptr(), a.nbytes()) == 0; @@ -269,7 +283,7 @@ std::ostream& operator<<(std::ostream& os, const Tensor& t) { break; switch (t.scalar_type()) { - ET_FORALL_REAL_TYPES_AND2(Half, Bool, PRINT_CASE) + ET_FORALL_REAL_TYPES_AND3(Half, Bool, BFloat16, PRINT_CASE) default: ET_CHECK_MSG( false, diff --git a/runtime/core/exec_aten/util/genScalarTypeTable.py b/runtime/core/exec_aten/util/genScalarTypeTable.py index 07100472ae..c2bc84c270 100644 --- a/runtime/core/exec_aten/util/genScalarTypeTable.py +++ b/runtime/core/exec_aten/util/genScalarTypeTable.py @@ -4,20 +4,35 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-indexToType = ["U1", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "B1"] +indexToType = [ + "U1", + "I1", + "I2", + "I4", + "I8", + "F2", + "F4", + "F8", + "C2", + "C4", + "C8", + "B1", + "BF", +] promoteTypesLookup = [ - ["U1", "I2", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "U1"], - ["I2", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I1"], - ["I2", "I2", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I2"], - ["I4", "I4", "I4", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I4"], - ["I8", "I8", "I8", "I8", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I8"], - ["F2", "F2", "F2", "F2", "F2", "F2", "F4", "F8", "C2", "C4", "C8", "F2"], - ["F4", "F4", "F4", "F4", "F4", "F4", "F4", "F8", "C4", "C4", "C8", "F4"], - ["F8", "F8", "F8", "F8", "F8", "F8", "F8", "F8", "C8", "C8", "C8", "F8"], - ["C2", "C2", "C2", "C2", "C2", "C2", "C4", "C8", "C2", "C4", "C8", "C2"], - ["C4", "C4", "C4", "C4", "C4", "C4", "C4", "C8", "C4", "C4", "C8", "C4"], - ["C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8"], - ["U1", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "B1"], + ["U1", "I2", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "U1", "BF"], + ["I2", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I1", "BF"], + ["I2", "I2", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I2", "BF"], + ["I4", "I4", "I4", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I4", "BF"], + ["I8", "I8", "I8", "I8", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I8", "BF"], + ["F2", "F2", "F2", "F2", "F2", "F2", "F4", "F8", "C2", "C4", "C8", "F2", "F4"], + ["F4", "F4", "F4", "F4", "F4", "F4", "F4", "F8", "C4", "C4", "C8", "F4", "F4"], + ["F8", "F8", "F8", "F8", "F8", "F8", "F8", "F8", "C8", "C8", "C8", "F8", "F8"], + ["C2", "C2", "C2", "C2", "C2", "C2", "C4", "C8", "C2", "C4", "C8", "C2", "C4"], + ["C4", "C4", "C4", "C4", "C4", "C4", "C4", "C8", "C4", "C4", "C8", "C4", "C4"], + ["C8", "C8", "C8", "C8", "C8", 
"C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8"], + ["U1", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "B1", "BF"], + ["BF", "BF", "BF", "BF", "BF", "F4", "F4", "F8", "C4", "C4", "C8", "BF", "BF"], ] for rowIndex, row in enumerate(promoteTypesLookup): for colIndex, col in enumerate(row): diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h index c92f910431..479767b4ab 100644 --- a/runtime/core/exec_aten/util/scalar_type_util.h +++ b/runtime/core/exec_aten/util/scalar_type_util.h @@ -21,6 +21,7 @@ #pragma once +#include #include #include #include @@ -164,8 +165,21 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) ::exec_aten::ScalarType::SCALARTYPE>::type, \ SCALARTYPE) +#define ET_FORALL_FLOAT_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \ + _(float, Float) \ + _(double, Double) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::exec_aten::ScalarType::SCALARTYPE1>::type, \ + SCALARTYPE1) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::exec_aten::ScalarType::SCALARTYPE2>::type, \ + SCALARTYPE2) + #define ET_FORALL_FLOATH_TYPES(_) ET_FORALL_FLOAT_TYPES_AND(Half, _) +#define ET_FORALL_FLOATHBF16_TYPES(_) \ + ET_FORALL_FLOAT_TYPES_AND2(Half, BFloat16, _) + // Here `ANOTHER_INPUT` should be another variable to be forwarded to a given // function. Not to be confused with another scalar type as in // `ET_FORALL_FLOAT_TYPES_AND`. 
@@ -177,6 +191,12 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) +#define ET_FORALL_FLOATHBF16_TYPES_WITH2(ANOTHER_INPUT1, ANOTHER_INPUT2, _) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::exec_aten::Half, Half) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::exec_aten::BFloat16, BFloat16) + // In this context, "REAL" means integer/float C types, which is why BFloat16 // and Half are not included. #define ET_FORALL_REAL_TYPES(_) \ @@ -209,6 +229,17 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) +#define ET_FORALL_REALHBF16_TYPES_WITH2(ANOTHER_INPUT1, ANOTHER_INPUT2, _) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, uint8_t, Byte) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, int8_t, Char) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, int16_t, Short) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, int32_t, Int) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, int64_t, Long) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, exec_aten::Half, Half) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, exec_aten::BFloat16, BFloat16) + // For macros that take `SCALARTYPEn` parameters, those parameters should be // an unquoted/unqualified enumerator name like `Int` or `Float`. 
#define ET_FORALL_REAL_TYPES_AND(SCALARTYPE, _) \ @@ -223,8 +254,29 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) ::exec_aten::ScalarType::SCALARTYPE>::type, \ SCALARTYPE) +#define ET_FORALL_REAL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int32_t, Int) \ + _(int64_t, Long) \ + _(float, Float) \ + _(double, Double) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::exec_aten::ScalarType::SCALARTYPE1>::type, \ + SCALARTYPE1) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::exec_aten::ScalarType::SCALARTYPE2>::type, \ + SCALARTYPE2) + #define ET_FORALL_REALH_TYPES(_) ET_FORALL_REAL_TYPES_AND(Half, _) +#define ET_FORALL_REALHBF16_TYPES(_) \ + ET_FORALL_REAL_TYPES_AND2(Half, BFloat16, _) + +#define ET_FORALL_REALHBBF16_TYPES(_) \ + ET_FORALL_REAL_TYPES_AND3(Bool, Half, BFloat16, _) + #define ET_FORALL_REAL_TYPES_AND_WITH(SCALARTYPE, ANOTHER_INPUT, _) \ _(ANOTHER_INPUT, uint8_t, Byte) \ _(ANOTHER_INPUT, int8_t, Char) \ @@ -381,6 +433,10 @@ inline bool isRealHBType(exec_aten::ScalarType t) { return (isRealHType(t) || t == exec_aten::ScalarType::Bool); } +inline bool isRealHBBF16Type(exec_aten::ScalarType t) { + return (isRealHBType(t) || t == exec_aten::ScalarType::BFloat16); +} + inline constexpr bool isComplexType(exec_aten::ScalarType t) { return ( t == exec_aten::ScalarType::ComplexHalf || @@ -589,6 +645,7 @@ using C4 = using C8 = typename ScalarTypeToCppType::type; using B1 = typename ScalarTypeToCppType::type; +using BF = typename ScalarTypeToCppType::type; #define TABLE_ENTRY(key1, key2, value) \ template <> \ @@ -613,6 +670,7 @@ TABLE_ENTRY(U1, C2, C2); TABLE_ENTRY(U1, C4, C4); TABLE_ENTRY(U1, C8, C8); TABLE_ENTRY(U1, B1, U1); +TABLE_ENTRY(U1, BF, BF); TABLE_ENTRY(I1, U1, I2); TABLE_ENTRY(I1, I1, I1); TABLE_ENTRY(I1, I2, I2); @@ -625,6 +683,7 @@ TABLE_ENTRY(I1, C2, C2); TABLE_ENTRY(I1, C4, C4); TABLE_ENTRY(I1, C8, C8); TABLE_ENTRY(I1, B1, I1); +TABLE_ENTRY(I1, BF, BF); 
TABLE_ENTRY(I2, U1, I2); TABLE_ENTRY(I2, I1, I2); TABLE_ENTRY(I2, I2, I2); @@ -637,6 +696,7 @@ TABLE_ENTRY(I2, C2, C2); TABLE_ENTRY(I2, C4, C4); TABLE_ENTRY(I2, C8, C8); TABLE_ENTRY(I2, B1, I2); +TABLE_ENTRY(I2, BF, BF); TABLE_ENTRY(I4, U1, I4); TABLE_ENTRY(I4, I1, I4); TABLE_ENTRY(I4, I2, I4); @@ -649,6 +709,7 @@ TABLE_ENTRY(I4, C2, C2); TABLE_ENTRY(I4, C4, C4); TABLE_ENTRY(I4, C8, C8); TABLE_ENTRY(I4, B1, I4); +TABLE_ENTRY(I4, BF, BF); TABLE_ENTRY(I8, U1, I8); TABLE_ENTRY(I8, I1, I8); TABLE_ENTRY(I8, I2, I8); @@ -661,6 +722,7 @@ TABLE_ENTRY(I8, C2, C2); TABLE_ENTRY(I8, C4, C4); TABLE_ENTRY(I8, C8, C8); TABLE_ENTRY(I8, B1, I8); +TABLE_ENTRY(I8, BF, BF); TABLE_ENTRY(F2, U1, F2); TABLE_ENTRY(F2, I1, F2); TABLE_ENTRY(F2, I2, F2); @@ -673,6 +735,7 @@ TABLE_ENTRY(F2, C2, C2); TABLE_ENTRY(F2, C4, C4); TABLE_ENTRY(F2, C8, C8); TABLE_ENTRY(F2, B1, F2); +TABLE_ENTRY(F2, BF, F4); TABLE_ENTRY(F4, U1, F4); TABLE_ENTRY(F4, I1, F4); TABLE_ENTRY(F4, I2, F4); @@ -685,6 +748,7 @@ TABLE_ENTRY(F4, C2, C4); TABLE_ENTRY(F4, C4, C4); TABLE_ENTRY(F4, C8, C8); TABLE_ENTRY(F4, B1, F4); +TABLE_ENTRY(F4, BF, F4); TABLE_ENTRY(F8, U1, F8); TABLE_ENTRY(F8, I1, F8); TABLE_ENTRY(F8, I2, F8); @@ -697,6 +761,7 @@ TABLE_ENTRY(F8, C2, C8); TABLE_ENTRY(F8, C4, C8); TABLE_ENTRY(F8, C8, C8); TABLE_ENTRY(F8, B1, F8); +TABLE_ENTRY(F8, BF, F8); TABLE_ENTRY(C2, U1, C2); TABLE_ENTRY(C2, I1, C2); TABLE_ENTRY(C2, I2, C2); @@ -709,6 +774,7 @@ TABLE_ENTRY(C2, C2, C2); TABLE_ENTRY(C2, C4, C4); TABLE_ENTRY(C2, C8, C8); TABLE_ENTRY(C2, B1, C2); +TABLE_ENTRY(C2, BF, C4); TABLE_ENTRY(C4, U1, C4); TABLE_ENTRY(C4, I1, C4); TABLE_ENTRY(C4, I2, C4); @@ -721,6 +787,7 @@ TABLE_ENTRY(C4, C2, C4); TABLE_ENTRY(C4, C4, C4); TABLE_ENTRY(C4, C8, C8); TABLE_ENTRY(C4, B1, C4); +TABLE_ENTRY(C4, BF, C4); TABLE_ENTRY(C8, U1, C8); TABLE_ENTRY(C8, I1, C8); TABLE_ENTRY(C8, I2, C8); @@ -733,6 +800,7 @@ TABLE_ENTRY(C8, C2, C8); TABLE_ENTRY(C8, C4, C8); TABLE_ENTRY(C8, C8, C8); TABLE_ENTRY(C8, B1, C8); +TABLE_ENTRY(C8, BF, C8); 
TABLE_ENTRY(B1, U1, U1); TABLE_ENTRY(B1, I1, I1); TABLE_ENTRY(B1, I2, I2); @@ -745,6 +813,20 @@ TABLE_ENTRY(B1, C2, C2); TABLE_ENTRY(B1, C4, C4); TABLE_ENTRY(B1, C8, C8); TABLE_ENTRY(B1, B1, B1); +TABLE_ENTRY(B1, BF, BF); +TABLE_ENTRY(BF, U1, BF); +TABLE_ENTRY(BF, I1, BF); +TABLE_ENTRY(BF, I2, BF); +TABLE_ENTRY(BF, I4, BF); +TABLE_ENTRY(BF, I8, BF); +TABLE_ENTRY(BF, F2, F4); +TABLE_ENTRY(BF, F4, F4); +TABLE_ENTRY(BF, F8, F8); +TABLE_ENTRY(BF, C2, C4); +TABLE_ENTRY(BF, C4, C4); +TABLE_ENTRY(BF, C8, C8); +TABLE_ENTRY(BF, B1, BF); +TABLE_ENTRY(BF, BF, BF); } // namespace internal @@ -760,26 +842,20 @@ struct promote_types { (!is_bits_type::value && !is_bits_type::value), "promote_types not valid for bits dtypes"); - static_assert( - !std::is_same< - T1, - typename ScalarTypeToCppType::type>:: - value && - !std::is_same< - T2, - typename ScalarTypeToCppType< - exec_aten::ScalarType::BFloat16>::type>::value, - "promote_types not valid for BFloat16"); using promoted_type_not_respecting_half_to_float = typename internal::promote_types_lookup::type; public: using type = typename std::conditional< half_to_float && - std::is_same< - promoted_type_not_respecting_half_to_float, - typename ScalarTypeToCppType::type>:: - value, + (std::is_same< + promoted_type_not_respecting_half_to_float, + typename ScalarTypeToCppType< + exec_aten::ScalarType::Half>::type>::value || + std::is_same< + promoted_type_not_respecting_half_to_float, + typename ScalarTypeToCppType< + exec_aten::ScalarType::BFloat16>::type>::value), typename ScalarTypeToCppType::type, promoted_type_not_respecting_half_to_float>::type; }; @@ -787,7 +863,8 @@ struct promote_types { /** * Implements type promotion rules that are consistent with ATen behaviour, * which in turn is consistent with NumPy's promote_types. 
- * If half_to_float is set to true, then half will be promoted to float instead + * If half_to_float is set to true, then half and bfloat16 will be promoted to + * float instead */ inline exec_aten::ScalarType promoteTypes( exec_aten::ScalarType a, @@ -806,6 +883,7 @@ inline exec_aten::ScalarType promoteTypes( constexpr auto c4 = exec_aten::ScalarType::ComplexFloat; constexpr auto c8 = exec_aten::ScalarType::ComplexDouble; constexpr auto b1 = exec_aten::ScalarType::Bool; + constexpr auto bf = exec_aten::ScalarType::BFloat16; // For QInt types, only allow exact match if (executorch::runtime::isQIntType(a) && a == b) { @@ -825,34 +903,41 @@ inline exec_aten::ScalarType promoteTypes( ET_CHECK_MSG(false, "promoteTypes not valid for bits dtypes"); } - ET_CHECK_MSG( - a != exec_aten::ScalarType::BFloat16 && - b != exec_aten::ScalarType::BFloat16, - "promoteTypes not valid for BFloat16"); // 12 types are handled by this function, see the constexpr definitions above - const int NUM_PROMOTE_TYPES = 12; - + const int NUM_PROMOTE_TYPES = 13; + + static constexpr std::array + dtype2index = {{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + -1, -1, -1, 12, -1, -1, -1, -1, -1, -1, -1, -1, + }}; + auto ix_a = dtype2index[(int)a]; + ET_CHECK(ix_a != -1); + auto ix_b = dtype2index[(int)b]; + ET_CHECK(ix_b != -1); static constexpr exec_aten::ScalarType _promoteTypesLookup[NUM_PROMOTE_TYPES][NUM_PROMOTE_TYPES] = { - /* u1 i1 i2 i4 i8 f2 f4 f8 c2 c4 c8 b1 */ - /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1}, - /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, i1}, - /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, i2}, - /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, c2, c4, c8, i4}, - /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, c2, c4, c8, i8}, - /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, c2, c4, c8, f2}, - /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, c4, c4, c8, f4}, - /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, c8, c8, c8, f8}, - /* c2 */ {c2, c2, c2, c2, c2, c2, c4, c8, c2, c4, 
c8, c2}, - /* c4 */ {c4, c4, c4, c4, c4, c4, c4, c8, c4, c4, c8, c4}, - /* c8 */ {c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8}, - /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1}, + /* u1 i1 i2 i4 i8 f2 f4 f8 c2 c4 c8 b1 bf*/ + /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1, bf}, + /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, i1, bf}, + /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, i2, bf}, + /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, c2, c4, c8, i4, bf}, + /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, c2, c4, c8, i8, bf}, + /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, c2, c4, c8, f2, f4}, + /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, c4, c4, c8, f4, f4}, + /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, c8, c8, c8, f8, f8}, + /* c2 */ {c2, c2, c2, c2, c2, c2, c4, c8, c2, c4, c8, c2, c4}, + /* c4 */ {c4, c4, c4, c4, c4, c4, c4, c8, c4, c4, c8, c4, c4}, + /* c8 */ {c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8}, + /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1, bf}, + /* bf */ {bf, bf, bf, bf, bf, f4, f4, f8, c4, c4, c8, bf, bf}, }; - exec_aten::ScalarType promoted_type = - _promoteTypesLookup[static_cast(a)][static_cast(b)]; + exec_aten::ScalarType promoted_type = _promoteTypesLookup[ix_a][ix_b]; - if (half_to_float && promoted_type == exec_aten::ScalarType::Half) { + if (half_to_float && + (promoted_type == exec_aten::ScalarType::Half || + promoted_type == exec_aten::ScalarType::BFloat16)) { promoted_type = exec_aten::ScalarType::Float; } @@ -974,6 +1059,13 @@ inline exec_aten::ScalarType promoteTypes( ET_INTERNAL_SWITCH_CASE( \ exec_aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) +#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ + ADDITIONAL1, ADDITIONAL2, ADDITIONAL3, CTYPE_ALIAS, ...) 
\ + ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND2( \ + ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + exec_aten::ScalarType::ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__) + #define ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ exec_aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ @@ -1001,6 +1093,13 @@ inline exec_aten::ScalarType promoteTypes( ET_INTERNAL_SWITCH_CASE( \ exec_aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__) +#define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND2( \ + ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND( \ + ADDITIONAL1, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + exec_aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) + #define ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ exec_aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__) \ @@ -1112,6 +1211,22 @@ inline exec_aten::ScalarType promoteTypes( ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND2( \ ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__)) +#define ET_SWITCH_REAL_TYPES_AND3( \ + ADDITIONAL1, \ + ADDITIONAL2, \ + ADDITIONAL3, \ + TYPE, \ + CONTEXT, \ + NAME, \ + CTYPE_ALIAS, \ + ...) \ + ET_INTERNAL_SWITCH( \ + TYPE, \ + CONTEXT, \ + NAME, \ + ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ + ADDITIONAL1, ADDITIONAL2, ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__)) + #define ET_SWITCH_REALH_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ ET_SWITCH_REAL_TYPES_AND(Half, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) @@ -1122,6 +1237,10 @@ inline exec_aten::ScalarType promoteTypes( ET_SWITCH_REAL_TYPES_AND2( \ Half, Bool, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) +#define ET_SWITCH_REALHBBF16_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ + ET_SWITCH_REAL_TYPES_AND3( \ + Half, Bool, BFloat16, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) + #define ET_SWITCH_INT_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) 
\ ET_INTERNAL_SWITCH( \ TYPE, \ @@ -1154,9 +1273,22 @@ inline exec_aten::ScalarType promoteTypes( ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND( \ ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__)) +#define ET_SWITCH_FLOAT_TYPES_AND2( \ + ADDITIONAL1, ADDITIONAL2, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH( \ + TYPE, \ + CONTEXT, \ + NAME, \ + ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND2( \ + ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__)) + #define ET_SWITCH_FLOATH_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ ET_SWITCH_FLOAT_TYPES_AND(Half, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) +#define ET_SWITCH_FLOATHBF16_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ + ET_SWITCH_FLOAT_TYPES_AND2( \ + Half, BFloat16, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) + #define ET_SWITCH_QINT_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH( \ TYPE, \ diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h index cadb5ecd9a..630f0cdb4a 100644 --- a/runtime/core/exec_aten/util/tensor_util.h +++ b/runtime/core/exec_aten/util/tensor_util.h @@ -516,6 +516,15 @@ inline bool tensor_is_realhb_type(exec_aten::Tensor t) { return true; } +inline bool tensor_is_realhbbf16_type(exec_aten::Tensor t) { + ET_LOG_MSG_AND_RETURN_IF_FALSE( + executorch::runtime::isRealHBBF16Type(t.scalar_type()), + "Expected to find a real type, but tensor has type %s", + torch::executor::toString(t.scalar_type())); + + return true; +} + inline bool tensor_is_complex_type(exec_aten::Tensor t) { ET_LOG_MSG_AND_RETURN_IF_FALSE( torch::executor::isComplexType(t.scalar_type()), diff --git a/runtime/core/exec_aten/util/test/scalar_type_util_test.cpp b/runtime/core/exec_aten/util/test/scalar_type_util_test.cpp index b91c7009f4..9df01b7be9 100644 --- a/runtime/core/exec_aten/util/test/scalar_type_util_test.cpp +++ b/runtime/core/exec_aten/util/test/scalar_type_util_test.cpp @@ -139,37 +139,38 @@ TEST(ScalarTypeUtilTest, promoteTypesTest) { // Check some 
common cases - ET_CHECK( - promoteTypes(ScalarType::Float, ScalarType::Double) == - ScalarType::Double); - ET_CHECK( - promoteTypes(ScalarType::Float, ScalarType::Short) == ScalarType::Float); - - ET_CHECK( - promoteTypes(ScalarType::Float, ScalarType::Int) == ScalarType::Float); - ET_CHECK( - promoteTypes(ScalarType::Long, ScalarType::Float) == ScalarType::Float); - - ET_CHECK( - promoteTypes(ScalarType::Bool, ScalarType::Bool) == ScalarType::Bool); - - ET_CHECK(promoteTypes(ScalarType::Byte, ScalarType::Int) == ScalarType::Int); - ET_CHECK( - promoteTypes(ScalarType::Char, ScalarType::Bool) == ScalarType::Char); - ET_CHECK(promoteTypes(ScalarType::Bool, ScalarType::Int) == ScalarType::Int); + EXPECT_EQ( + promoteTypes(ScalarType::Float, ScalarType::Double), ScalarType::Double); + EXPECT_EQ( + promoteTypes(ScalarType::Float, ScalarType::Short), ScalarType::Float); + + EXPECT_EQ( + promoteTypes(ScalarType::Float, ScalarType::Int), ScalarType::Float); + EXPECT_EQ( + promoteTypes(ScalarType::Long, ScalarType::Float), ScalarType::Float); + + EXPECT_EQ(promoteTypes(ScalarType::Bool, ScalarType::Bool), ScalarType::Bool); + + EXPECT_EQ(promoteTypes(ScalarType::Byte, ScalarType::Int), ScalarType::Int); + EXPECT_EQ(promoteTypes(ScalarType::Char, ScalarType::Bool), ScalarType::Char); + EXPECT_EQ(promoteTypes(ScalarType::Bool, ScalarType::Int), ScalarType::Int); + + EXPECT_EQ( + promoteTypes(ScalarType::BFloat16, ScalarType::Half), ScalarType::Float); + EXPECT_EQ( + promoteTypes(ScalarType::BFloat16, ScalarType::Bool), + ScalarType::BFloat16); } template struct promote_types_is_valid : std::integral_constant< bool, - !std::is_same::value && - !std::is_same::value && - (std::is_same::value || - (!executorch::runtime::is_qint_type::value && - !executorch::runtime::is_qint_type::value && - !executorch::runtime::is_bits_type::value && - !executorch::runtime::is_bits_type::value))> {}; + (std::is_same::value || + (!executorch::runtime::is_qint_type::value && + 
!executorch::runtime::is_qint_type::value && + !executorch::runtime::is_bits_type::value && + !executorch::runtime::is_bits_type::value))> {}; template struct CompileTimePromoteTypesTestCase { @@ -195,7 +196,8 @@ struct CompileTimePromoteTypesTestCase { auto expected = executorch::runtime::promoteTypes( scalarType1, scalarType2, half_to_float); EXPECT_EQ(actual, expected) - << "promoting " << (int)scalarType1 << " to " << (int)scalarType2; + << "promoting " << (int)scalarType1 << " to " << (int)scalarType2 + << " (half to float: " << half_to_float << ')'; } template < diff --git a/runtime/core/portable_type/bfloat16.h b/runtime/core/portable_type/bfloat16.h index a1ceb0c56a..e665e6152e 100644 --- a/runtime/core/portable_type/bfloat16.h +++ b/runtime/core/portable_type/bfloat16.h @@ -8,11 +8,41 @@ #pragma once +#include #include +#include +#include +#include namespace torch { namespace executor { +namespace internal { +inline float f32_from_bits(uint16_t src) { + float res = 0; + uint32_t tmp = src; + tmp <<= 16; + std::memcpy(&res, &tmp, sizeof(tmp)); + return res; +} + +inline uint16_t bits_from_f32(float src) { + uint32_t res = 0; + std::memcpy(&res, &src, sizeof(res)); + return res >> 16; +} + +inline uint16_t round_to_nearest_even(float src) { + if (std::isnan(src)) { + return UINT16_C(0x7FC0); + } + uint32_t U32 = 0; + std::memcpy(&U32, &src, sizeof(U32)); + uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF); + return static_cast((U32 + rounding_bias) >> 16); +} +} // namespace internal + /** * The "brain floating-point" type, compatible with c10/util/BFloat16.h from * pytorch core. 
@@ -22,7 +52,288 @@ namespace executor { */ struct alignas(2) BFloat16 { uint16_t x; + + BFloat16() = default; + struct from_bits_t {}; + static constexpr from_bits_t from_bits() { + return from_bits_t(); + } + + constexpr BFloat16(unsigned short bits, from_bits_t) : x(bits) {} + /* implicit */ BFloat16(float value) + : x(internal::round_to_nearest_even(value)) {} + operator float() const { + return internal::f32_from_bits(x); + } }; +inline std::ostream& operator<<(std::ostream& out, const BFloat16& value) { + out << (float)value; + return out; +} + +/// Arithmetic + +inline BFloat16 operator+(const BFloat16& a, const BFloat16& b) { + return static_cast(a) + static_cast(b); +} + +inline BFloat16 operator-(const BFloat16& a, const BFloat16& b) { + return static_cast(a) - static_cast(b); +} + +inline BFloat16 operator*(const BFloat16& a, const BFloat16& b) { + return static_cast(a) * static_cast(b); +} + +inline BFloat16 operator/(const BFloat16& a, const BFloat16& b) { + return static_cast(a) / static_cast(b); +} + +inline BFloat16 operator-(const BFloat16& a) { + return -static_cast(a); +} + +inline BFloat16& operator+=(BFloat16& a, const BFloat16& b) { + a = a + b; + return a; +} + +inline BFloat16& operator-=(BFloat16& a, const BFloat16& b) { + a = a - b; + return a; +} + +inline BFloat16& operator*=(BFloat16& a, const BFloat16& b) { + a = a * b; + return a; +} + +inline BFloat16& operator/=(BFloat16& a, const BFloat16& b) { + a = a / b; + return a; +} + +inline BFloat16& operator|(BFloat16& a, const BFloat16& b) { + a.x = a.x | b.x; + return a; +} + +inline BFloat16& operator^(BFloat16& a, const BFloat16& b) { + a.x = a.x ^ b.x; + return a; +} + +inline BFloat16& operator&(BFloat16& a, const BFloat16& b) { + a.x = a.x & b.x; + return a; +} + +/// Arithmetic with floats + +inline float operator+(BFloat16 a, float b) { + return static_cast(a) + b; +} +inline float operator-(BFloat16 a, float b) { + return static_cast(a) - b; +} +inline float operator*(BFloat16 a, 
float b) { + return static_cast(a) * b; +} +inline float operator/(BFloat16 a, float b) { + return static_cast(a) / b; +} + +inline float operator+(float a, BFloat16 b) { + return a + static_cast(b); +} +inline float operator-(float a, BFloat16 b) { + return a - static_cast(b); +} +inline float operator*(float a, BFloat16 b) { + return a * static_cast(b); +} +inline float operator/(float a, BFloat16 b) { + return a / static_cast(b); +} + +inline float& operator+=(float& a, const BFloat16& b) { + return a += static_cast(b); +} +inline float& operator-=(float& a, const BFloat16& b) { + return a -= static_cast(b); +} +inline float& operator*=(float& a, const BFloat16& b) { + return a *= static_cast(b); +} +inline float& operator/=(float& a, const BFloat16& b) { + return a /= static_cast(b); +} + +/// Arithmetic with doubles + +inline double operator+(BFloat16 a, double b) { + return static_cast(a) + b; +} +inline double operator-(BFloat16 a, double b) { + return static_cast(a) - b; +} +inline double operator*(BFloat16 a, double b) { + return static_cast(a) * b; +} +inline double operator/(BFloat16 a, double b) { + return static_cast(a) / b; +} + +inline double operator+(double a, BFloat16 b) { + return a + static_cast(b); +} +inline double operator-(double a, BFloat16 b) { + return a - static_cast(b); +} +inline double operator*(double a, BFloat16 b) { + return a * static_cast(b); +} +inline double operator/(double a, BFloat16 b) { + return a / static_cast(b); +} + +/// Arithmetic with ints + +inline BFloat16 operator+(BFloat16 a, int b) { + return a + static_cast(b); +} +inline BFloat16 operator-(BFloat16 a, int b) { + return a - static_cast(b); +} +inline BFloat16 operator*(BFloat16 a, int b) { + return a * static_cast(b); +} +inline BFloat16 operator/(BFloat16 a, int b) { + return a / static_cast(b); +} + +inline BFloat16 operator+(int a, BFloat16 b) { + return static_cast(a) + b; +} +inline BFloat16 operator-(int a, BFloat16 b) { + return static_cast(a) - b; +} 
+inline BFloat16 operator*(int a, BFloat16 b) { + return static_cast(a) * b; +} +inline BFloat16 operator/(int a, BFloat16 b) { + return static_cast(a) / b; +} + +//// Arithmetic with int64_t + +inline BFloat16 operator+(BFloat16 a, int64_t b) { + return a + static_cast(b); +} +inline BFloat16 operator-(BFloat16 a, int64_t b) { + return a - static_cast(b); +} +inline BFloat16 operator*(BFloat16 a, int64_t b) { + return a * static_cast(b); +} +inline BFloat16 operator/(BFloat16 a, int64_t b) { + return a / static_cast(b); +} + +inline BFloat16 operator+(int64_t a, BFloat16 b) { + return static_cast(a) + b; +} +inline BFloat16 operator-(int64_t a, BFloat16 b) { + return static_cast(a) - b; +} +inline BFloat16 operator*(int64_t a, BFloat16 b) { + return static_cast(a) * b; +} +inline BFloat16 operator/(int64_t a, BFloat16 b) { + return static_cast(a) / b; +} + +// Overloading < and > operators, because std::max and std::min use them. + +inline bool operator>(BFloat16& lhs, BFloat16& rhs) { + return float(lhs) > float(rhs); +} + +inline bool operator<(BFloat16& lhs, BFloat16& rhs) { + return float(lhs) < float(rhs); +} + } // namespace executor } // namespace torch + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_signed = true; + static constexpr bool is_specialized = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = + numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 8; + static constexpr int digits10 = 2; + static constexpr int 
max_digits10 = 4; + static constexpr int radix = 2; + static constexpr int min_exponent = -125; + static constexpr int min_exponent10 = -37; + static constexpr int max_exponent = 128; + static constexpr int max_exponent10 = 38; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = + numeric_limits::tinyness_before; + + static constexpr torch::executor::BFloat16 min() { + return torch::executor::BFloat16( + 0x0080, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 lowest() { + return torch::executor::BFloat16( + 0xFF7F, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 max() { + return torch::executor::BFloat16( + 0x7F7F, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 epsilon() { + return torch::executor::BFloat16( + 0x3C00, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 round_error() { + return torch::executor::BFloat16( + 0x3F00, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 infinity() { + return torch::executor::BFloat16( + 0x7F80, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 quiet_NaN() { + return torch::executor::BFloat16( + 0x7FC0, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 signaling_NaN() { + return torch::executor::BFloat16( + 0x7F80, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 denorm_min() { + return torch::executor::BFloat16( + 0x0001, torch::executor::BFloat16::from_bits()); + } +}; + +} // namespace std diff --git a/runtime/core/portable_type/bfloat16_math.h b/runtime/core/portable_type/bfloat16_math.h new file mode 100644 index 0000000000..68ee77cf34 --- /dev/null +++ b/runtime/core/portable_type/bfloat16_math.h @@ -0,0 +1,290 @@ +/* + * Copyright (c) Meta Platforms, 
Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace std { + +template +struct is_reduced_floating_point + : std::integral_constant< + bool, + std::is_same::value || + std::is_same::value> {}; + +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T acos(T a) { + return std::acos(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T asin(T a) { + return std::asin(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T atan(T a) { + return std::atan(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T atanh(T a) { + return std::atanh(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T erf(T a) { + return std::erf(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T erfc(T a) { + return std::erfc(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T exp(T a) { + return std::exp(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T expm1(T a) { + return std::expm1(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline bool isfinite(T a) { + return std::isfinite(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T log(T a) { + return std::log(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T log10(T a) { + return std::log10(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T log1p(T a) { + return std::log1p(float(a)); +} +template < + typename T, + 
typename std::enable_if::value, int>::type = 0> +inline T log2(T a) { + return std::log2(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T ceil(T a) { + return std::ceil(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T cos(T a) { + return std::cos(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T floor(T a) { + return std::floor(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T nearbyint(T a) { + return std::nearbyint(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T sin(T a) { + return std::sin(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T tan(T a) { + return std::tan(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T sinh(T a) { + return std::sinh(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T cosh(T a) { + return std::cosh(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T tanh(T a) { + return std::tanh(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T trunc(T a) { + return std::trunc(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T lgamma(T a) { + return std::lgamma(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T sqrt(T a) { + return std::sqrt(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T rsqrt(T a) { + return 1.0 / std::sqrt(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T abs(T a) { + return std::abs(float(a)); +} +#if defined(_MSC_VER) && defined(__CUDACC__) 
+template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T pow(T a, double b) { + return std::pow(float(a), float(b)); +} +#else +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T pow(T a, double b) { + return std::pow(float(a), b); +} +#endif +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T pow(T a, T b) { + return std::pow(float(a), float(b)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T fmod(T a, T b) { + return std::fmod(float(a), float(b)); +} + +/* + The following function is inspired from the implementation in `musl` + Link to License: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT + ---------------------------------------------------------------------- + Copyright © 2005-2020 Rich Felker, et al. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ ---------------------------------------------------------------------- + */ +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T nextafter(T from, T to) { + // Reference: + // https://git.musl-libc.org/cgit/musl/tree/src/math/nextafter.c + using int_repr_t = uint16_t; + constexpr uint8_t bits = 16; + union { + T f; + int_repr_t i; + } ufrom = {from}, uto = {to}; + + // get a mask to get the sign bit i.e. MSB + int_repr_t sign_mask = int_repr_t{1} << (bits - 1); + + // short-circuit: if either is NaN, return NaN + if (from != from || to != to) { + return from + to; + } + + // short-circuit: if they are exactly the same. + if (ufrom.i == uto.i) { + return from; + } + + // mask the sign-bit to zero i.e. positive + // equivalent to abs(x) + int_repr_t abs_from = ufrom.i & ~sign_mask; + int_repr_t abs_to = uto.i & ~sign_mask; + if (abs_from == 0) { + // if both are zero but with different sign, + // preserve the sign of `to`. + if (abs_to == 0) { + return to; + } + // smallest subnormal with sign of `to`. 
+ ufrom.i = (uto.i & sign_mask) | int_repr_t{1}; + return ufrom.f; + } + + // if abs(from) > abs(to) or sign(from) != sign(to) + if (abs_from > abs_to || ((ufrom.i ^ uto.i) & sign_mask)) { + ufrom.i--; + } else { + ufrom.i++; + } + + return ufrom.f; +} + +} // namespace std diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index 0d65ef36b8..b8ccbe602e 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -43,6 +43,7 @@ def define_common_targets(): name = "scalar_type", exported_headers = [ "bfloat16.h", + "bfloat16_math.h", "complex.h", "half.h", "scalar_type.h", diff --git a/runtime/core/portable_type/test/CMakeLists.txt b/runtime/core/portable_type/test/CMakeLists.txt index 21eb4feae0..58a69f656e 100644 --- a/runtime/core/portable_type/test/CMakeLists.txt +++ b/runtime/core/portable_type/test/CMakeLists.txt @@ -24,7 +24,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) include(${EXECUTORCH_ROOT}/build/Test.cmake) set(_test_srcs optional_test.cpp tensor_test.cpp half_test.cpp scalar_test.cpp - tensor_impl_test.cpp + tensor_impl_test.cpp bfloat16_test.cpp ) et_cxx_test(runtime_core_portable_type_test SOURCES ${_test_srcs} EXTRA_LIBS) diff --git a/runtime/core/portable_type/test/bfloat16_test.cpp b/runtime/core/portable_type/test/bfloat16_test.cpp new file mode 100644 index 0000000000..9ea53e6cba --- /dev/null +++ b/runtime/core/portable_type/test/bfloat16_test.cpp @@ -0,0 +1,191 @@ +#include + +#include + +using torch::executor::BFloat16; + +namespace { +float float_from_bytes(uint32_t sign, uint32_t exponent, uint32_t fraction) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + uint32_t bytes; + bytes = 0; + bytes |= sign; + bytes <<= 8; + bytes |= exponent; + bytes <<= 23; + bytes |= fraction; + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + float res; + std::memcpy(&res, &bytes, sizeof(res)); + return res; +} + +TEST(BFloat16Conversion, 
FloatToBFloat16AndBack) { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + float in[100]; + for (int i = 0; i < 100; ++i) { + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) + in[i] = i + 1.25; + } + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + BFloat16 bfloats[100]; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + float out[100]; + + for (int i = 0; i < 100; ++i) { + bfloats[i].x = torch::executor::internal::bits_from_f32(in[i]); + out[i] = torch::executor::internal::f32_from_bits(bfloats[i].x); + + // The relative error should be less than 1/(2^7) since BFloat16 + // has 7 bits mantissa. + EXPECT_LE(std::fabs(out[i] - in[i]) / in[i], 1.0 / 128); + } +} + +TEST(BFloat16Conversion, FloatToBFloat16RNEAndBack) { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + float in[100]; + for (int i = 0; i < 100; ++i) { + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) + in[i] = i + 1.25; + } + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + BFloat16 bfloats[100]; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + float out[100]; + + for (int i = 0; i < 100; ++i) { + bfloats[i].x = torch::executor::internal::round_to_nearest_even(in[i]); + out[i] = torch::executor::internal::f32_from_bits(bfloats[i].x); + + // The relative error should be less than 1/(2^7) since BFloat16 + // has 7 bits mantissa. 
+ EXPECT_LE(std::fabs(out[i] - in[i]) / in[i], 1.0 / 128); + } +} + +TEST(BFloat16Conversion, NaN) { + float inNaN = float_from_bytes(0, 0xFF, 0x7FFFFF); + EXPECT_TRUE(std::isnan(inNaN)); + + BFloat16 a = BFloat16(inNaN); + float out = torch::executor::internal::f32_from_bits(a.x); + + EXPECT_TRUE(std::isnan(out)); +} + +TEST(BFloat16Conversion, Inf) { + float inInf = float_from_bytes(0, 0xFF, 0); + EXPECT_TRUE(std::isinf(inInf)); + + BFloat16 a = BFloat16(inInf); + float out = torch::executor::internal::f32_from_bits(a.x); + + EXPECT_TRUE(std::isinf(out)); +} + +TEST(BFloat16Conversion, SmallestDenormal) { + float in = std::numeric_limits::denorm_min(); // The smallest non-zero + // subnormal number + BFloat16 a = BFloat16(in); + float out = torch::executor::internal::f32_from_bits(a.x); + + EXPECT_FLOAT_EQ(in, out); +} + +TEST(BFloat16Math, Addition) { + // This test verifies that if only first 7 bits of float's mantissa are + // changed after addition, we should have no loss in precision. + + // input bits + // S | Exponent | Mantissa + // 0 | 10000000 | 10010000000000000000000 = 3.125 + float input = float_from_bytes(0, 0, 0x40480000); + + // expected bits + // S | Exponent | Mantissa + // 0 | 10000001 | 10010000000000000000000 = 6.25 + float expected = float_from_bytes(0, 0, 0x40c80000); + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + BFloat16 b; + b.x = torch::executor::internal::bits_from_f32(input); + b = b + b; + + float res = torch::executor::internal::f32_from_bits(b.x); + EXPECT_EQ(res, expected); +} + +TEST(BFloat16Math, Subtraction) { + // This test verifies that if only first 7 bits of float's mantissa are + // changed after subtraction, we should have no loss in precision. 
+ + // input bits + // S | Exponent | Mantissa + // 0 | 10000001 | 11101000000000000000000 = 7.625 + float input = float_from_bytes(0, 0, 0x40f40000); + + // expected bits + // S | Exponent | Mantissa + // 0 | 10000000 | 01010000000000000000000 = 2.625 + float expected = float_from_bytes(0, 0, 0x40280000); + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + BFloat16 b; + b.x = torch::executor::internal::bits_from_f32(input); + b = b - 5; + + float res = torch::executor::internal::f32_from_bits(b.x); + EXPECT_EQ(res, expected); +} + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +TEST(BFloat16Math, NextAfterZero) { + const BFloat16 zero{0}; + + auto check_nextafter = [](BFloat16 from, BFloat16 to, BFloat16 expected) { + BFloat16 actual = std::nextafter(from, to); + // Check for bitwise equality! + ASSERT_EQ(actual.x ^ expected.x, uint16_t{0}); + }; + check_nextafter(zero, zero, /*expected=*/zero); + check_nextafter(zero, -zero, /*expected=*/-zero); + check_nextafter(-zero, zero, /*expected=*/zero); + check_nextafter(-zero, -zero, /*expected=*/-zero); +} + +float BinaryToFloat(uint32_t bytes) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + float res; + std::memcpy(&res, &bytes, sizeof(res)); + return res; +} + +struct BFloat16TestParam { + uint32_t input; + uint16_t rne; +}; + +class BFloat16Test : public ::testing::Test, + public ::testing::WithParamInterface {}; + +TEST_P(BFloat16Test, BFloat16RNETest) { + float value = BinaryToFloat(GetParam().input); + uint16_t rounded = torch::executor::internal::round_to_nearest_even(value); + EXPECT_EQ(GetParam().rne, rounded); +} + +INSTANTIATE_TEST_SUITE_P( + BFloat16TestInstantiation, + BFloat16Test, + ::testing::Values( + BFloat16TestParam{0x3F848000, 0x3F84}, + BFloat16TestParam{0x3F848010, 0x3F85}, + BFloat16TestParam{0x3F850000, 0x3F85}, + BFloat16TestParam{0x3F858000, 0x3F86}, + BFloat16TestParam{0x3FFF8000, 0x4000})); + +} // namespace diff --git 
a/runtime/core/portable_type/test/targets.bzl b/runtime/core/portable_type/test/targets.bzl index af55f95e45..c0b4ef00c7 100644 --- a/runtime/core/portable_type/test/targets.bzl +++ b/runtime/core/portable_type/test/targets.bzl @@ -6,6 +6,14 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. """ + runtime.cxx_test( + name = "bfloat16_test", + srcs = ["bfloat16_test.cpp"], + deps = [ + "//executorch/runtime/core/portable_type:portable_type", + ], + ) + runtime.cxx_test( name = "optional_test", srcs = ["optional_test.cpp"], diff --git a/runtime/executor/test/method_test.cpp b/runtime/executor/test/method_test.cpp index 06b84d338e..0163c8ceef 100644 --- a/runtime/executor/test/method_test.cpp +++ b/runtime/executor/test/method_test.cpp @@ -59,11 +59,9 @@ class MethodTest : public ::testing::Test { load_program(std::getenv("ET_MODULE_INDEX_PATH"), "index"); load_program( std::getenv("ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH"), "cat"); + load_program(std::getenv("ET_MODULE_LINEAR_PATH"), "linear"); load_program( - std::getenv("ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH"), - "linear_constant_segment"); - load_program( - std::getenv("ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"), + std::getenv("DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"), "linear_constant_buffer"); } @@ -274,7 +272,7 @@ TEST_F(MethodTest, ConstantSegmentTest) { // Execute model with constants stored in segment. ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); Result method = - programs_["linear_constant_segment"]->load_method("forward", &mmm.get()); + programs_["linear"]->load_method("forward", &mmm.get()); ASSERT_EQ(method.error(), Error::Ok); // Can execute the method. 
diff --git a/runtime/executor/test/program_test.cpp b/runtime/executor/test/program_test.cpp index 00e8b0e234..80f91f1af6 100644 --- a/runtime/executor/test/program_test.cpp +++ b/runtime/executor/test/program_test.cpp @@ -379,11 +379,32 @@ TEST_F(ProgramTest, DEPRECATEDLoad) { EXPECT_EQ(program_res.error(), Error::Ok); } +TEST_F(ProgramTest, LoadConstantSegmentWithNoConstantSegment) { + Result program = + Program::load(add_loader_.get(), kDefaultVerification); + ASSERT_EQ(program.error(), Error::Ok); + + // Load constant segment data should fail. + const auto segment_info = DataLoader::SegmentInfo( + DataLoader::SegmentInfo::Type::Constant, + /*segment_index=*/0); + Result segment = + ProgramTestFriend::LoadSegment(&program.get(), segment_info); + EXPECT_NE(segment.error(), Error::Ok); + + const executorch_flatbuffer::Program* flatbuffer_program = + ProgramTestFriend::GetInternalProgram(&program.get()); + + // The constant buffer should be empty. + EXPECT_EQ(flatbuffer_program->constant_buffer()->size(), 0); + + // Expect 1 constant segment, placeholder for non-const tensors. + EXPECT_EQ(flatbuffer_program->segments()->size(), 1); +} + TEST_F(ProgramTest, LoadConstantSegment) { - // Load the serialized ModuleLinear data, with constants in the segment and no - // constants in the flatbuffer. - const char* linear_path = - std::getenv("ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH"); + // Load the serialized ModuleLinear data, with constants in the segment. 
+ const char* linear_path = std::getenv("ET_MODULE_LINEAR_PATH"); Result linear_loader = FileDataLoader::from(linear_path); ASSERT_EQ(linear_loader.error(), Error::Ok); @@ -424,11 +445,11 @@ TEST_F(ProgramTest, LoadConstantSegment) { EXPECT_GE(flatbuffer_program->constant_segment()->offsets()->size(), 1); } -TEST_F(ProgramTest, LoadConstantSegmentWithNoConstantSegment) { +TEST_F(ProgramTest, LoadConstantSegmentWhenConstantBufferExists) { // Load the serialized ModuleLinear data, with constants in the flatbuffer and // no constants in the segment. const char* linear_path = - std::getenv("ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"); + std::getenv("DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"); Result linear_loader = FileDataLoader::from(linear_path); ASSERT_EQ(linear_loader.error(), Error::Ok); @@ -505,8 +526,8 @@ TEST_F(ProgramTest, LoadFromMutableSegment) { const executorch_flatbuffer::Program* flatbuffer_program = ProgramTestFriend::GetInternalProgram(&program.get()); - // Expect 1 segment. 1 mutable segment and no constant segment. - EXPECT_EQ(flatbuffer_program->segments()->size(), 1); + // Expect 2 segments. 1 mutable segment and 1 constant segment. + EXPECT_EQ(flatbuffer_program->segments()->size(), 2); // Expect a mutable data segment. EXPECT_EQ(flatbuffer_program->mutable_data_segments()->size(), 1); diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl index d6e3bc3d89..72923e9868 100644 --- a/runtime/executor/test/targets.bzl +++ b/runtime/executor/test/targets.bzl @@ -97,6 +97,8 @@ def define_common_targets(is_fbcode = False): # file in fbcode. See https://fburl.com/9esapdmd if not runtime.is_oss and is_fbcode: modules_env = { + # Deprecated model that still works with ExecuTorch runtime. + "DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH": "$(location fbcode//executorch/test/models/deprecated:ModuleLinear-no-constant-segment.pte)", # The tests use this var to find the program file to load. 
This uses # an fbcode target path because the authoring/export tools # intentionally don't work in xplat (since they're host-only tools). @@ -104,8 +106,7 @@ def define_common_targets(is_fbcode = False): "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])", "ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleDynamicCatUnallocatedIO.pte])", "ET_MODULE_INDEX_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleIndex.pte])", - "ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleLinear-no-constant-segment.pte])", - "ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleLinear.pte])", + "ET_MODULE_LINEAR_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleLinear.pte])", "ET_MODULE_MULTI_ENTRY_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleMultipleEntry.pte])", "ET_MODULE_SIMPLE_TRAIN_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleSimpleTrain.pte])", } diff --git a/schema/program.fbs b/schema/program.fbs index cbdda2d360..e3c7597fcd 100644 --- a/schema/program.fbs +++ b/schema/program.fbs @@ -429,6 +429,7 @@ table Program { // Each constant is assigned an index into the table which are each individually aligned. // 0 index is reserved to be pointed to by non-constant Tensors. // If this field is non-empty, constant_segment.offsets must be empty. + // DEPRECATED: After D61996249 on 2024-09-05, no new PTE files will use this field. constant_buffer:[Buffer]; // List of delegate data. Pointed to by BackendDelegateDataReference. 
diff --git a/test/end2end/exported_module.py b/test/end2end/exported_module.py index 656b570512..6e6b97b718 100644 --- a/test/end2end/exported_module.py +++ b/test/end2end/exported_module.py @@ -67,7 +67,6 @@ def export( ignore_to_out_var_failure: bool = False, dynamic_memory_planning_mode: DynamicMemoryPlanningMode = DynamicMemoryPlanningMode.UPPER_BOUND, capture_config=None, - extract_constant_segment: bool = True, skip_type_promotion: bool = False, export_joint_graph: bool = False, ) -> "ExportedModule": @@ -206,7 +205,6 @@ def __init__(self, method): dynamic_memory_planning_mode=dynamic_memory_planning_mode, memory_planning_pass=memory_planning_pass, to_out_var_pass=ToOutVarPass(ignore_to_out_var_failure), - extract_constant_segment=extract_constant_segment, ) ) diff --git a/test/models/deprecated/ModuleLinear-no-constant-segment.pte b/test/models/deprecated/ModuleLinear-no-constant-segment.pte new file mode 100644 index 0000000000000000000000000000000000000000..42b8643fb91a6709d40a70b0ac68d0a194d9e878 GIT binary patch literal 1040 zcmaJ=y-osQ5F8NBlMr(;giugi!WBnC!4V5XMWG#)xyo2bg2qB6ArKQ|Vc{eA7(Ruu zpvqi8F`s$h zkmQi46HkoWS)`BY%>2wVj&dzRybJ2k4DvPOXga|j&%xyKuG<|9hxDKS<=jqh$f1N} zr$C9P(OqKfk(dxx(^2XN+%w&r;&ay5efdQ~I`k}GF(FR*W%@2ZikfeeXCRqbcLi&< zu1?ME5Z8Rr-e&Zry+x=;f01a)tB07HLwwz}_F;4s#eXw=N57Zd;VEObmoBQO#jD{L z^{xR2d<%DZ>W!3nj3nmxJSv2GAg!nRs0Mt5b#zv|CY z&C>F2d`H@}dpp-}P31LB;_E-RoYqVB#c)<@+s?Ed{nF}ZcPy^vP~EuGh#KuMjv7%I VG+W2*u-Rld>2)<4-w(z^`~V(NT@e5P literal 0 HcmV?d00001 diff --git a/test/models/deprecated/README.md b/test/models/deprecated/README.md new file mode 100644 index 0000000000..f1d47d0326 --- /dev/null +++ b/test/models/deprecated/README.md @@ -0,0 +1,14 @@ +## Deprecated Models + +This readme documents deprecated models that remain compatible with versions of the ExecuTorch runtime. + +ModuleLinear-no-constant-segment.pte +- This file contains constants stored in the constant_buffer, which was deprecated in D61996249 on 2024-09-05. 
Now, constants are stored in a separate segment. +- This .pte file was generated internally using hg commit hash rFBS5e49dc0319b1d2d9969bbcef92857ab76a899c34, with command: + ``` + buck2 build fbcode//executorch/test/models:exported_programs[ModuleLinear-no-constant-segment.pte] --show-output + ``` +- In OSS, the same .pte file can be generated with https://github.com/pytorch/executorch/commit/cea5abbcdded, via: + ``` + python -m test.models.export_program --modules "ModuleLinear" --outdir . + ``` diff --git a/test/models/deprecated/TARGETS b/test/models/deprecated/TARGETS new file mode 100644 index 0000000000..369fc3c406 --- /dev/null +++ b/test/models/deprecated/TARGETS @@ -0,0 +1,12 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.export_file( + name = "ModuleLinear-no-constant-segment.pte", + src = "ModuleLinear-no-constant-segment.pte", + visibility = [ + "//executorch/runtime/executor/test/...", + "//executorch/test/...", + ], +) diff --git a/test/models/export_program.py b/test/models/export_program.py index 7941af376f..d753475b82 100644 --- a/test/models/export_program.py +++ b/test/models/export_program.py @@ -190,7 +190,6 @@ def export_joint(): def export_module_to_program( module_class: Type[nn.Module], - extract_constant_segment: bool, skip_type_promotion: bool, ): """Exports the module and returns the serialized program data.""" @@ -211,7 +210,6 @@ def export_module_to_program( module = ExportedModule.export( module_class, methods, - extract_constant_segment=extract_constant_segment, skip_type_promotion=skip_type_promotion, export_joint_graph=export_joint, **export_kwargs, @@ -259,18 +257,15 @@ def main() -> None: # Skip type promotion to keep the model in fp16. # Type promotion will convert to fp32. 
skip_type_promotion = True - for extract_constant_segment in (True, False): - suffix = "" if extract_constant_segment else "-no-constant-segment" - outfile = os.path.join(args.outdir, f"{module_name}{suffix}.pte") - with open(outfile, "wb") as fp: - fp.write( - export_module_to_program( - module_class, - extract_constant_segment=extract_constant_segment, - skip_type_promotion=skip_type_promotion, - ) + outfile = os.path.join(args.outdir, f"{module_name}.pte") + with open(outfile, "wb") as fp: + fp.write( + export_module_to_program( + module_class, + skip_type_promotion=skip_type_promotion, ) - print(f"Exported {module_name} and wrote program data to {outfile}") + ) + print(f"Exported {module_name} and wrote program data to {outfile}") if __name__ == "__main__": diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh index 3693700e83..078196bfc1 100755 --- a/test/run_oss_cpp_tests.sh +++ b/test/run_oss_cpp_tests.sh @@ -56,23 +56,23 @@ export_test_model() { python3 -m test.models.export_program --modules "ModuleAdd,ModuleAddHalf,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleLinear,ModuleMultipleEntry,ModuleSimpleTrain" --outdir "cmake-out" 2> /dev/null python3 -m test.models.export_delegated_program --modules "ModuleAddMul" --backend_id "StubBackend" --outdir "cmake-out" || true + DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH="$(realpath test/models/deprecated/ModuleLinear-no-constant-segment.pte)" ET_MODULE_ADD_HALF_PATH="$(realpath cmake-out/ModuleAddHalf.pte)" ET_MODULE_ADD_PATH="$(realpath cmake-out/ModuleAdd.pte)" ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH="$(realpath cmake-out/ModuleDynamicCatUnallocatedIO.pte)" ET_MODULE_INDEX_PATH="$(realpath cmake-out/ModuleIndex.pte)" - ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH="$(realpath cmake-out/ModuleLinear-no-constant-segment.pte)" - ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH="$(realpath cmake-out/ModuleLinear.pte)" + ET_MODULE_LINEAR_PATH="$(realpath cmake-out/ModuleLinear.pte)" 
ET_MODULE_MULTI_ENTRY_PATH="$(realpath cmake-out/ModuleMultipleEntry.pte)" ET_MODULE_ADD_MUL_NOSEGMENTS_DA1024_PATH="$(realpath cmake-out/ModuleAddMul-nosegments-da1024.pte)" ET_MODULE_ADD_MUL_NOSEGMENTS_PATH="$(realpath cmake-out/ModuleAddMul-nosegments.pte)" ET_MODULE_ADD_MUL_PATH="$(realpath cmake-out/ModuleAddMul.pte)" ET_MODULE_SIMPLE_TRAIN_PATH="$(realpath cmake-out/ModuleSimpleTrain.pte)" + export DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH export ET_MODULE_ADD_HALF_PATH export ET_MODULE_ADD_PATH export ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH export ET_MODULE_INDEX_PATH - export ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH - export ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH + export ET_MODULE_LINEAR_PATH export ET_MODULE_MULTI_ENTRY_PATH export ET_MODULE_ADD_MUL_NOSEGMENTS_DA1024_PATH export ET_MODULE_ADD_MUL_NOSEGMENTS_PATH diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index 93ae82acc3..dca2a7bbbc 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -52,8 +52,7 @@ { "directory": "extension/runner_util/test", "sources": [ - "inputs_test.cpp", - "managed_tensor_test.cpp" + "inputs_test.cpp" ], "additional_libs": [ "extension_data_loader",