From 0601dd618099af9956e56a236c7ed3ac098fc402 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Sat, 7 Sep 2024 11:35:10 +0000 Subject: [PATCH] 2024-09-07 nightly release (1cc8503056eab95eaf2f753c5a1bf237102a26ba) --- .ci/scripts/build-qnn-sdk.sh | 1 + .ci/scripts/build_llama_android.sh | 3 +- .ci/scripts/test_llama.sh | 3 +- .ci/scripts/test_llava.sh | 145 +++-- .github/workflows/android-perf.yml | 4 +- .github/workflows/android.yml | 2 + .github/workflows/apple-perf.yml | 1 + .../workflows/upload-android-test-specs.yml | 2 +- backends/qualcomm/scripts/build.sh | 2 + backends/vulkan/docs/android_demo.md | 3 +- .../vulkan/runtime/api/containers/Tensor.cpp | 129 ++++- .../vulkan/runtime/api/containers/Tensor.h | 65 ++- backends/vulkan/runtime/graph/ComputeGraph.h | 4 + .../runtime/graph/ops/glsl/image_to_nchw.glsl | 9 +- .../runtime/graph/ops/glsl/indexing_utils.h | 93 +++ .../ops/glsl/int8_image_to_nchw_noint8.glsl | 9 +- .../runtime/graph/ops/glsl/nchw_to_image.glsl | 9 +- .../ops/glsl/nchw_to_int8_image_noint8.glsl | 15 +- .../runtime/graph/ops/impl/Convolution.cpp | 2 +- .../vulkan/runtime/graph/ops/impl/Staging.cpp | 8 +- .../runtime/graph/ops/impl/utils/DimUtils.h | 3 +- backends/vulkan/test/utils/test_utils.cpp | 7 +- .../vulkan/test/vulkan_compute_api_test.cpp | 25 +- backends/xnnpack/README.md | 3 +- build/build_android_llm_demo.sh | 5 +- build/build_apple_frameworks.sh | 1 + ...d-run-qualcomm-ai-engine-direct-backend.md | 2 + docs/source/llm/getting-started.md | 14 +- .../tutorial-xnnpack-delegate-lowering.md | 3 +- .../android/ExecuTorchDemo/README.md | 2 + .../demo-apps/android/ExecuTorchDemo/setup.sh | 1 + .../android-llm-device-farm-test-spec.yml | 22 + .../LlmBenchmarkRunner.java | 22 +- .../executorchllamademo/MainActivity.java | 16 +- .../android/LlamaDemo/setup-with-qnn.sh | 1 + examples/demo-apps/android/LlamaDemo/setup.sh | 1 + examples/llm_manual/CMakeLists.txt | 2 + examples/llm_manual/main.cpp | 7 +- examples/llm_manual/managed_tensor.h | 44 -- 
.../cross_attention/cross_attention_mask.cpp | 12 +- .../cross_attention/cross_attention_mask.h | 8 +- .../cross_attention_mask_test.cpp | 30 +- .../flamingo/cross_attention/targets.bzl | 2 +- examples/models/llama2/README.md | 6 +- examples/models/llama2/export_llama_lib.py | 4 +- examples/models/llama2/runner/CMakeLists.txt | 4 +- examples/models/llama2/runner/runner.cpp | 25 +- examples/models/llama2/runner/runner.h | 3 +- examples/models/llama2/runner/targets.bzl | 2 +- examples/models/llava/CMakeLists.txt | 18 +- examples/models/llava/README.md | 3 +- examples/models/llava/export_llava.py | 9 +- examples/models/llava/install_requirements.sh | 2 +- examples/models/llava/main.cpp | 15 + examples/models/llava/runner/CMakeLists.txt | 4 +- .../llava/runner/llava_image_prefiller.h | 14 +- examples/models/llava/runner/llava_runner.cpp | 96 +++- examples/models/llava/runner/llava_runner.h | 42 ++ .../llava/runner/llava_text_decoder_runner.h | 11 +- examples/models/llava/runner/targets.bzl | 2 +- examples/models/phi-3-mini/CMakeLists.txt | 3 +- examples/models/phi-3-mini/README.md | 3 +- examples/models/phi-3-mini/runner.cpp | 14 +- examples/models/test/test_export.py | 2 +- .../oss_scripts/llama2/CMakeLists.txt | 1 + .../oss_scripts/llama2/qnn_llama_runner.cpp | 1 - .../oss_scripts/llama2/runner/runner.cpp | 77 ++- .../oss_scripts/llama2/runner/runner.h | 14 +- .../qaihub_scripts/llama/CMakeLists.txt | 2 + .../llama/llama2/qaihub_llama2_7b_runner.cpp | 1 - .../llama/llama3/qaihub_llama3_8b_runner.cpp | 1 - .../qaihub_scripts/llama/runner/runner.cpp | 1 - .../qaihub_scripts/llama/runner/runner.h | 1 - .../stable_diffusion/CMakeLists.txt | 1 + .../stable_diffusion/runner/runner.cpp | 42 +- examples/xnnpack/README.md | 6 +- exir/_serialize/_program.py | 31 +- exir/_serialize/test/test_program.py | 27 + exir/capture/_config.py | 6 - exir/program/_program.py | 5 - extension/android/CMakeLists.txt | 1 + extension/android/jni/BUCK | 4 +- extension/android/jni/jni_layer.cpp | 
23 +- extension/android/jni/jni_layer_llama.cpp | 84 ++- .../org/pytorch/executorch/LlamaModule.java | 87 ++- .../apple/Benchmark/App/App.entitlements | 12 + extension/apple/Benchmark/App/App.swift | 16 + .../Benchmark.xcodeproj/project.pbxproj | 535 ++++++++++++++++++ .../xcshareddata/xcschemes/Benchmark.xcscheme | 107 ++++ extension/apple/Benchmark/Tests/Tests.mm | 105 ++++ .../apple/Benchmark/Tests/Tests.xcconfig | 26 + .../apple/Benchmark/Tests/Tests.xctestplan | 28 + extension/aten_util/test/targets.bzl | 1 - extension/llm/export/builder.py | 25 +- extension/llm/export/partitioner_lib.py | 2 +- extension/llm/runner/CMakeLists.txt | 4 +- extension/llm/runner/multimodal_runner.h | 45 +- extension/llm/runner/targets.bzl | 6 +- extension/llm/runner/text_decoder_runner.cpp | 14 +- extension/llm/runner/text_decoder_runner.h | 6 +- extension/llm/runner/text_prefiller.cpp | 35 +- extension/llm/runner/text_prefiller.h | 2 +- extension/llm/runner/text_token_generator.h | 15 +- extension/llm/runner/util.h | 25 + extension/llm/tokenizer/tiktoken.cpp | 10 + extension/module/test/module_test.cpp | 105 ++-- extension/module/test/resources/README.md | 4 + extension/module/test/resources/add.pte | Bin 0 -> 728 bytes extension/module/test/resources/model.pte | Bin 1600 -> 0 bytes extension/runner_util/managed_tensor.h | 107 ---- extension/runner_util/targets.bzl | 15 - extension/runner_util/test/CMakeLists.txt | 2 +- .../runner_util/test/managed_tensor_test.cpp | 86 --- extension/runner_util/test/targets.bzl | 12 - kernels/README.md | 2 +- kernels/optimized/cpu/binary_ops.h | 3 +- kernels/optimized/cpu/op_mul.cpp | 17 +- kernels/portable/cpu/op_masked_fill.cpp | 3 + kernels/portable/cpu/op_max.cpp | 18 + kernels/portable/cpu/op_maximum.cpp | 3 + kernels/portable/cpu/op_mean.cpp | 5 + kernels/portable/cpu/op_min.cpp | 18 + kernels/portable/cpu/op_minimum.cpp | 3 + kernels/portable/cpu/op_mm.cpp | 5 + kernels/portable/cpu/op_mul.cpp | 24 +- 
kernels/portable/cpu/op_native_batch_norm.cpp | 22 + kernels/portable/cpu/op_native_group_norm.cpp | 25 + kernels/portable/cpu/op_native_layer_norm.cpp | 27 + kernels/portable/cpu/op_ne.cpp | 6 + kernels/portable/cpu/op_neg.cpp | 3 + kernels/portable/cpu/op_pdist_forward.cpp | 5 + kernels/portable/cpu/op_permute_copy.cpp | 3 + kernels/portable/cpu/op_pixel_shuffle.cpp | 4 + kernels/portable/cpu/op_to_copy.cpp | 9 +- kernels/portable/cpu/scalar_utils.h | 18 +- kernels/test/op_mul_test.cpp | 158 ++++-- kernels/test/op_to_copy_test.cpp | 27 +- runtime/core/exec_aten/exec_aten.h | 2 + .../exec_aten/testing_util/tensor_util.cpp | 22 +- .../core/exec_aten/util/genScalarTypeTable.py | 41 +- .../core/exec_aten/util/scalar_type_util.h | 206 +++++-- runtime/core/exec_aten/util/tensor_util.h | 9 + .../util/test/scalar_type_util_test.cpp | 54 +- runtime/core/portable_type/bfloat16.h | 311 ++++++++++ runtime/core/portable_type/bfloat16_math.h | 290 ++++++++++ runtime/core/portable_type/targets.bzl | 1 + .../core/portable_type/test/CMakeLists.txt | 2 +- .../core/portable_type/test/bfloat16_test.cpp | 191 +++++++ runtime/core/portable_type/test/targets.bzl | 8 + runtime/executor/test/method_test.cpp | 8 +- runtime/executor/test/program_test.cpp | 37 +- runtime/executor/test/targets.bzl | 5 +- schema/program.fbs | 1 + test/end2end/exported_module.py | 2 - .../ModuleLinear-no-constant-segment.pte | Bin 0 -> 1040 bytes test/models/deprecated/README.md | 14 + test/models/deprecated/TARGETS | 12 + test/models/export_program.py | 21 +- test/run_oss_cpp_tests.sh | 8 +- test/utils/OSSTestConfig.json | 3 +- 160 files changed, 3501 insertions(+), 945 deletions(-) delete mode 100644 examples/llm_manual/managed_tensor.h create mode 100644 extension/apple/Benchmark/App/App.entitlements create mode 100644 extension/apple/Benchmark/App/App.swift create mode 100644 extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj create mode 100644 
extension/apple/Benchmark/Benchmark.xcodeproj/xcshareddata/xcschemes/Benchmark.xcscheme create mode 100644 extension/apple/Benchmark/Tests/Tests.mm create mode 100644 extension/apple/Benchmark/Tests/Tests.xcconfig create mode 100644 extension/apple/Benchmark/Tests/Tests.xctestplan create mode 100644 extension/module/test/resources/README.md create mode 100644 extension/module/test/resources/add.pte delete mode 100644 extension/module/test/resources/model.pte delete mode 100644 extension/runner_util/managed_tensor.h delete mode 100644 extension/runner_util/test/managed_tensor_test.cpp create mode 100644 runtime/core/portable_type/bfloat16_math.h create mode 100644 runtime/core/portable_type/test/bfloat16_test.cpp create mode 100644 test/models/deprecated/ModuleLinear-no-constant-segment.pte create mode 100644 test/models/deprecated/README.md create mode 100644 test/models/deprecated/TARGETS diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh index ec3a8a39e3..c48ac2056a 100644 --- a/.ci/scripts/build-qnn-sdk.sh +++ b/.ci/scripts/build-qnn-sdk.sh @@ -29,6 +29,7 @@ set_up_aot() { -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh index 644fc4c2bb..7d3370ee56 100644 --- a/.ci/scripts/build_llama_android.sh +++ b/.ci/scripts/build_llama_android.sh @@ -22,8 +22,9 @@ install_executorch_and_backend_lib() { -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ 
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 4fa8c94905..290ece7b8e 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -107,8 +107,9 @@ cmake_install_executorch_libraries() { retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 3543ea3fa5..7dc6d15e40 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -9,47 +9,97 @@ set -exu # shellcheck source=/dev/null BUILD_TYPE=${1:-Debug} +TARGET_OS=${2:-Native} +BUILD_DIR=${3:-cmake-out} -echo "Building with BUILD_TYPE: $BUILD_TYPE" +echo "Building with BUILD_TYPE: $BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR" if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 + PYTHON_EXECUTABLE=python3 fi +TARGET_OS_lower="$(echo "${TARGET_OS}" | awk '{print tolower($0)}')" +if [[ "${TARGET_OS_lower}" == "android" ]]; then + if [[ -z "${ANDROID_NDK}" ]]; then + echo "Set ANDROID_NDK environment variable to build for Android." 
+ exit 1 + fi +fi + +# Number of processes for a parallel build +NPROC=8 +if hash nproc &> /dev/null; then NPROC=$(nproc); fi + +EXECUTORCH_COMMON_CMAKE_ARGS=" \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON" + cmake_install_executorch_libraries() { - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \ - -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ - -Bcmake-out . - - - cmake --build cmake-out -j9 --target install --config ${BUILD_TYPE} + cmake \ + ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + -B${BUILD_DIR} . + + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} +} + +cmake_install_executorch_libraries_for_android() { + cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + -B${BUILD_DIR} . 
+ + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} } + +LLAVA_COMMON_CMAKE_ARGS=" \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON" + cmake_build_llava_runner() { dir=examples/models/llava python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_PREFIX_PATH="$python_lib" \ - -Bcmake-out/${dir} \ + cmake \ + ${LLAVA_COMMON_CMAKE_ARGS} \ + -DCMAKE_PREFIX_PATH="$python_lib" \ + -B${BUILD_DIR}/${dir} \ ${dir} + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} +} + - cmake --build cmake-out/${dir} -j9 --config ${BUILD_TYPE} +cmake_build_llava_runner_for_android() { + dir=examples/models/llava + python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') + + cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + ${LLAVA_COMMON_CMAKE_ARGS} \ + -DCMAKE_PREFIX_PATH="$python_lib" \ + -DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON \ + -B${BUILD_DIR}/${dir} \ + ${dir} + + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} } # only export the one without custom op for now since it's @@ -61,7 +111,7 @@ export_llava() { # Download a new image with different size, to test if the model can handle different image sizes prepare_image_tensor() { echo "Downloading image" - curl -o basketball.jpg https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg + curl -o basketball.jpg 
https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg $PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util --image-path basketball.jpg --output-path image.pt } @@ -80,13 +130,24 @@ run_and_verify() { echo "tokenizer.bin is missing." exit 1 fi - RUNTIME_ARGS="--model_path=llava.pte \ - --tokenizer_path=tokenizer.bin \ - --image_path=image.pt \ - --prompt=ASSISTANT: \ - --temperature=0 \ - --seq_len=650" - cmake-out/examples/models/llava/llava_main ${RUNTIME_ARGS} > result.txt + + + + RUNTIME_ARGS="--model_path=llava.pte \ + --tokenizer_path=tokenizer.bin \ + --image_path=image.pt \ + --prompt=ASSISTANT: \ + --temperature=0 \ + --seq_len=650" + + if [[ "${TARGET_OS_lower}" == "android" ]]; then + echo "Transfer relevant files to the phone via ADB and run llava_main with following args," + echo "$ llava_main ${RUNTIME_ARGS} " + exit 0; + fi + + ${BUILD_DIR}/examples/models/llava/llava_main ${RUNTIME_ARGS} > result.txt + # verify result.txt RESULT=$(cat result.txt) # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. @@ -109,8 +170,20 @@ run_and_verify() { fi } -cmake_install_executorch_libraries -cmake_build_llava_runner +# Step1. Build stuff +if [[ "${TARGET_OS_lower}" == "android" ]]; then + cmake_install_executorch_libraries_for_android + cmake_build_llava_runner_for_android +elif [[ "${TARGET_OS_lower}" == "native" ]]; then + cmake_install_executorch_libraries + cmake_build_llava_runner +else + echo "Invalid TARGET_OS ($2): ${TARGET_OS}" +fi + +# Step2. Generate the PTE export_llava + +# Step3. 
Run prepare_image_tensor run_and_verify diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index c44de95533..11950623ea 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -230,9 +230,10 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh fi - + # TODO: This needs to be replaced with a generic loader .apk # Build LLM Demo for Android + export ANDROID_ABIS="arm64-v8a" bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} # Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat @@ -278,6 +279,7 @@ jobs: model: ${{ fromJson(needs.set-parameters.outputs.models) }} delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + fail-fast: false with: device-type: android runner: linux.2xlarge diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index 4c693a90e6..1ea7f398ce 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -5,6 +5,8 @@ on: branches: - main - release/* + tags: + - ciflow/android/* pull_request: paths: - .ci/docker/** diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 41e2868bfb..8da58653a8 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -290,6 +290,7 @@ jobs: model: ${{ fromJson(needs.set-parameters.outputs.models) }} delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + fail-fast: false with: device-type: ios # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS diff --git a/.github/workflows/upload-android-test-specs.yml b/.github/workflows/upload-android-test-specs.yml index 5a468da44f..04f7cf40d7 100644 --- 
a/.github/workflows/upload-android-test-specs.yml +++ b/.github/workflows/upload-android-test-specs.yml @@ -41,7 +41,7 @@ jobs: with: # Just use a small model here with a minimal amount of configuration to test the spec models: stories110M - devices: samsung_galaxy_s2x + devices: samsung_galaxy_s22 delegates: xnnpack test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/android-llm-device-farm-test-spec.yml diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index 61b363f1a7..5f77a74740 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -81,6 +81,7 @@ if [ "$BUILD_AARCH64" = true ]; then -DEXECUTORCH_BUILD_QNN=ON \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ @@ -124,6 +125,7 @@ if [ "$BUILD_X86_64" = true ]; then -DEXECUTORCH_BUILD_QNN=ON \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ -S $PRJ_ROOT \ diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index aaff7a7a72..8570859ed3 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -94,8 +94,9 @@ binary using the Android NDK toolchain. cmake . 
-DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=$ANDROID_ABI \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_VULKAN=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DPYTHON_EXECUTABLE=python \ diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 7b9d30ef65..6fe6746ec0 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -80,6 +80,42 @@ std::vector calculate_strides( return strides; } +/* + * Axis mapping is somewhat analogous to strides for texture backed tensors. + * + * The axis mapping is normalized to 4 dimensions, similar to the padded sizes. + * The first 3 values of the axis mapping indicate the (X,Y,Z) image texture + * axis that corresponds to the width, height, and channels dimension of the + * tensor. Thus the axis mapping can be considered to be in WHCN dimension + * order. + * + * The last value `axis_mapping.at(3)` indicates the WHCN index of the tensor + * dimension along which batches will be concatenated. This dimension can be + * referred to as the "inner dimension" To determine which image texture axis is + * used for the concatenation, a double lookup will need to be performed + * (axis_mapping.at(axis_mapping.at(3))). + * + * The reason for strucuring axis mapping this way is because for the batch dim, + * two things need to be easily derived: + * + * 1. The dim idx of the inner dimension, so that the size of the inner + * dimension can be easily determined. + * 2. The texture axis used to concatenate batches + * + * By storing the dim index of the inner dimension instead of the texture axis + * it maps to, both pieces of information are readily available. 
+ * + * The axis mapping allows for permuted views of texture-backed tensors. + */ +std::vector default_axis_mapping() { + // Currently, all compute shaders have an assumption that the channels dim is + // used to combine with the batch dim of a tensor. However, once dim mapping + // is integrated into the tensor indexing logic for each compute shader, we + // can be more flexible with mapping the batch dim to different texture axes + // in order to improve performance or memory footprint. + return {0, 1, 2, 2}; +} + bool dim_order_is_valid(const std::vector& dim_order) { int64_t sum = 0; for (size_t i = 0; i < dim_order.size(); ++i) { @@ -137,30 +173,44 @@ std::vector calculate_padded_sizes( utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, + const std::vector& axis_mapping, const utils::GPUMemoryLayout memory_layout) { VK_CHECK_COND(padded_sizes.size() == 4); + VK_CHECK_COND(axis_mapping.size() == 4); + + utils::uvec3 extents({1, 1, 1}); + // First three elements of axis_mapping indicate which (X,Y,Z) image axis the + // width, height, and channels dim of the tensor maps to. + for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) { + const int64_t axis = axis_mapping.at(whcn_dim); + const int64_t dim = padded_sizes.size() - 1 - whcn_dim; + extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); + } - uint32_t N = utils::safe_downcast(padded_sizes.at(0)); - uint32_t C = utils::safe_downcast(padded_sizes.at(1)); - uint32_t H = utils::safe_downcast(padded_sizes.at(2)); - uint32_t W = utils::safe_downcast(padded_sizes.at(3)); + // axis_mapping[3] indicates the WHCN index of the dimension used for batch + // concatenation. Thus a double lookup is required to determine the image axis + // used for batch concatenation. + const int64_t concatted_whcn_dim = axis_mapping.at(3); + const int64_t batch_axis = axis_mapping.at(concatted_whcn_dim); + // Multiply the extents of the batch axis by the batch size. 
+ extents[batch_axis] *= padded_sizes.at(0); switch (memory_layout) { case utils::kWidthPacked: - VK_CHECK_COND(W % 4 == 0); - W /= 4; + VK_CHECK_COND(extents[0] % 4 == 0); + extents[0] /= 4; break; case utils::kHeightPacked: - VK_CHECK_COND(H % 4 == 0); - H /= 4; + VK_CHECK_COND(extents[1] % 4 == 0); + extents[1] /= 4; break; case utils::kChannelsPacked: - VK_CHECK_COND(C % 4 == 0); - C /= 4; + VK_CHECK_COND(extents[2] % 4 == 0); + extents[2] /= 4; break; } - return {W, H, C * N}; + return extents; } // @@ -176,9 +226,10 @@ vTensor::vTensor( const bool allocate_memory) : dtype_(dtype), memory_layout_(memory_layout), - // Calculate tensor size metadata + // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)), + axis_mapping_(default_axis_mapping()), strides_(calculate_strides(sizes, dim_order_)), numel_(utils::multiply_integers(sizes_)), padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, @@ -189,12 +240,14 @@ vTensor::vTensor( sizes_uniform_(), strides_uniform_(), numel_uniform_(), + axis_mapping_uniform_(), texture_limits_uniform_(), // Construct Tensor storage storage_( context, storage_type, memory_layout_, + axis_mapping_, padded_sizes_, dtype_, allocate_memory) { @@ -222,6 +275,7 @@ vTensor::vTensor(const vTensor& other) // Copy tensor size metadata sizes_(other.sizes_.begin(), other.sizes_.end()), dim_order_(other.dim_order_.begin(), other.dim_order_.end()), + axis_mapping_(other.axis_mapping_.begin(), other.axis_mapping_.end()), strides_(other.strides_.begin(), other.strides_.end()), numel_(other.numel_), padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, @@ -234,6 +288,7 @@ vTensor::vTensor(const vTensor& other) sizes_uniform_(), strides_uniform_(), numel_uniform_(), + axis_mapping_uniform_(), texture_limits_uniform_(), // Copy Tensor storage storage_(other.storage_) {} @@ -248,6 +303,7 @@ vTensor::vTensor( // Copy tensor size metadata 
sizes_(sizes.begin(), sizes.end()), dim_order_(dim_order.begin(), dim_order.end()), + axis_mapping_(default_axis_mapping()), strides_(calculate_strides(sizes_, dim_order_)), numel_(utils::multiply_integers(sizes_)), padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, @@ -258,6 +314,7 @@ vTensor::vTensor( sizes_uniform_(), strides_uniform_(), numel_uniform_(), + axis_mapping_uniform_(), texture_limits_uniform_(), // Copy Tensor storage storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) { @@ -315,6 +372,14 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() { return vkapi::BufferBindInfo(strides_uniform_.buffer()); } +const vkapi::BufferBindInfo vTensor::axis_mapping_ubo() { + if (!axis_mapping_uniform_.buffer()) { + axis_mapping_uniform_ = + ParamsBuffer(storage_.context_, utils::make_ivec4(axis_mapping_)); + } + return vkapi::BufferBindInfo(axis_mapping_uniform_.buffer()); +} + const vkapi::BufferBindInfo vTensor::texture_limits_ubo() { if (!texture_limits_uniform_.buffer()) { texture_limits_uniform_ = ParamsBuffer(storage_.context_, texture_limits_); @@ -376,11 +441,7 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { } } -void vTensor::update_metadata( - const std::vector& new_sizes, - const std::vector& new_dim_order) { - sizes_ = new_sizes; - dim_order_ = new_dim_order; +void vTensor::update_metadata() { strides_ = calculate_strides(sizes_, dim_order_); // Only update the memory layout for buffer-backed tensors. Strides are // meaningless for texture-backed tensors and do not impact the memory layout. @@ -396,7 +457,7 @@ void vTensor::update_metadata( // Calculate the extents of the image texture that would have been required // for a tensor of the new sizes. utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, memory_layout_); + calculate_image_extents(padded_sizes_, axis_mapping_, memory_layout_); // Update the texture limits to reflect the new virtual extents. 
texture_limits_.limits = utils::ivec3{ @@ -407,15 +468,18 @@ void vTensor::update_metadata( if (sizes_uniform_.buffer()) { sizes_uniform_.update(utils::make_whcn_ivec4(sizes_)); } - if (texture_limits_uniform_.buffer()) { - texture_limits_uniform_.update(texture_limits_); - } if (strides_uniform_.buffer()) { strides_uniform_.update(utils::make_whcn_ivec4(unsqueezed_strides_)); } if (numel_uniform_.buffer()) { numel_uniform_.update(numel_); } + if (axis_mapping_uniform_.buffer()) { + axis_mapping_uniform_.update(utils::make_ivec4(axis_mapping_)); + } + if (texture_limits_uniform_.buffer()) { + texture_limits_uniform_.update(texture_limits_); + } } void vTensor::check_sizes(const std::vector& sizes) const { @@ -423,7 +487,7 @@ void vTensor::check_sizes(const std::vector& sizes) const { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, memory_layout_); + calculate_image_extents(padded_sizes_, axis_mapping_, memory_layout_); bool valid_resize = virtual_extents[0] <= image_extents()[0]; valid_resize = valid_resize && virtual_extents[1] <= image_extents()[1]; @@ -454,7 +518,9 @@ void vTensor::virtual_reconfigure( VK_CHECK_COND(dim_order_is_valid(new_dim_order)); check_sizes(new_sizes); - update_metadata(new_sizes, new_dim_order); + sizes_ = new_sizes; + dim_order_ = new_dim_order; + update_metadata(); } void vTensor::virtual_resize(const std::vector& new_sizes) { @@ -463,13 +529,16 @@ void vTensor::virtual_resize(const std::vector& new_sizes) { "new sizes cannot modify the dimensionality of the tensor "); check_sizes(new_sizes); - update_metadata(new_sizes, dim_order_); + sizes_ = new_sizes; + update_metadata(); } void vTensor::reallocate(const std::vector& new_sizes) { - update_metadata(new_sizes, dim_order_); + sizes_ = new_sizes; + update_metadata(); storage_.discard_and_reallocate( calculate_padded_sizes(new_sizes, memory_layout_), + 
axis_mapping_, memory_layout_, dtype_); } @@ -547,12 +616,16 @@ vTensorStorage::vTensorStorage( Context* const context, const utils::StorageType storage_type, const utils::GPUMemoryLayout gpu_memory_layout, + const std::vector& axis_mapping, const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory) : context_(context), storage_type_{storage_type}, - image_extents_(calculate_image_extents(padded_sizes, gpu_memory_layout)), + image_extents_(calculate_image_extents( + padded_sizes, + axis_mapping, + gpu_memory_layout)), buffer_length_{utils::multiply_integers(padded_sizes)}, buffer_offset_{0}, image_(allocate_image( @@ -665,6 +738,7 @@ bool vTensorStorage::is_copy_of(const vTensorStorage& other) const { void vTensorStorage::discard_and_reallocate( const std::vector& padded_sizes, + const std::vector& axis_mapping, const utils::GPUMemoryLayout gpu_memory_layout, const vkapi::ScalarType dtype) { const bool image_owns_memory = image_.owns_memory(); @@ -672,7 +746,8 @@ void vTensorStorage::discard_and_reallocate( flush(); - image_extents_ = calculate_image_extents(padded_sizes, gpu_memory_layout); + image_extents_ = + calculate_image_extents(padded_sizes, axis_mapping, gpu_memory_layout); image_ = allocate_image( context_, image_extents_, diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index d37628e4ad..70f363796f 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -60,11 +60,11 @@ std::vector calculate_padded_sizes( const utils::GPUMemoryLayout memory_layout); /* - * Given the padded sizes of a tensor and the GPU memory layout, calculate the - * 3D image extents required to store the tensor data as an image texture. + * Calculate the image extents required of a texture backed tensor. 
*/ utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, + const std::vector& axis_mapping, const utils::GPUMemoryLayout memory_layout); struct LastAccess { @@ -90,7 +90,8 @@ class vTensorStorage final { Context* context, const utils::StorageType storage_type, const utils::GPUMemoryLayout gpu_memory_layout, - const std::vector& sizes, + const std::vector& axis_mapping, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); @@ -159,6 +160,7 @@ class vTensorStorage final { void discard_and_reallocate( const std::vector& padded_sizes, + const std::vector& axis_mapping, const utils::GPUMemoryLayout gpu_memory_layout, const vkapi::ScalarType dtype); }; @@ -218,21 +220,58 @@ class vTensor final { vTensor& operator=(vTensor&& other) = default; private: + /* + * "Core" tensor metadata. They are the minimum amount of information required + * to construct a tensor. + */ + + // Whether the tensor has elements of type float, int, etc. vkapi::ScalarType dtype_; + // Describes which dimension is "tightly packed". For texture backed tensors, + // this describes which dimension is packed along a texel. For buffer backed + // tensors, this describes which dimension has a stride of 1 (i.e. is last in + // the dim order). utils::GPUMemoryLayout memory_layout_; - // sizes of the tensor in NCHW dimension order std::vector sizes_; + + /* + * "Layout" metadata. These describe with further detail how tensor data is + * laid out in memory. However, they are considered secondary to the "core" + * metadata members above because defaults can be assumed based on a given + * memory layout. When permuting the tensor without performing a copy, these + * metadata members are the ones that will be changed. All other metadata is + * derived from a combination of sizes, memory layout, and the below members. + */ + // dim order of the tensor; dimension indices are in NCHW dimension order // i.e. 
0 is N, 1 is C, 2 is H, 3 is W for a 4D tensor. The dims with larger
   // strides precede the dims with smaller strides in the dim order. The last
   // dim is always the fastest moving dim with a stride of 1.
   std::vector dim_order_;
 
+  // Describes which axis of an image texture each dimension of the tensor maps
+  // to. The axis mapping allows texture based tensors to be permuted and
+  // transposed without modifying the underlying texture storage. For a more in
+  // depth explanation of axis mapping, see the `default_axis_mapping()`
+  // function.
+  std::vector axis_mapping_;
+
+  /*
+   * The below can be considered "layout" metadata as well, but are derived
+   * from the above data members.
+   */
+
   // strides of the tensor in NCHW dimension order
   std::vector strides_;
   // Contains the number of elements in the tensor according to the canonical
   // sizes.
   size_t numel_;
+
+  /*
+   * The below metadata members are derived from the above, and are typically
+   * used to, e.g., pass tensor metadata to compute shaders.
+   */
+
   // padded sizes of the tensor in NCHW dimension order. See the
   // calculate_padded_sizes() function for more context. Note that padded sizes
   // are only used for texture storage, and not for buffer storage.
@@ -260,6 +299,7 @@ class vTensor final {
   ParamsBuffer sizes_uniform_;
   ParamsBuffer strides_uniform_;
   ParamsBuffer numel_uniform_;
+  ParamsBuffer axis_mapping_uniform_;
   ParamsBuffer texture_limits_uniform_;
 
   vTensorStorage storage_;
@@ -365,14 +405,18 @@ class vTensor final {
    */
   const vkapi::BufferBindInfo strides_ubo();
 
+  /*
+   * Returns a GPU buffer containing the texture axis mapping for each dimension
+   * of the tensor, in WHCN dimension order.
+   */
+  const vkapi::BufferBindInfo axis_mapping_ubo();
+
   /*
    * Returns a GPU buffer containing the virtual image extents of the tensor.
    * Since a tensor can be resized with the virtual_resize() function, this
    * GPU buffer contains the image extents of the tensor calculated using the
    * virtual_resize() function. 
This allows shaders to exit early if they are * working outside the limits of the texture. - * - * This buffer should only be used to */ const vkapi::BufferBindInfo texture_limits_ubo(); @@ -423,13 +467,10 @@ class vTensor final { private: /* - * Given new sizes and new strides of the dim order, update the sizes and dim - * order metadata of the vTensor. New strides are computed using the new sizes - * and new dim order. + * Assuming sizes, dim order, or axis mapping was modified, recompute all + * derived metadata and update metadata UBO with new values. */ - void update_metadata( - const std::vector& new_sizes, - const std::vector& new_dim_order); + void update_metadata(); /* * Check that tensor sizes are valid given the current storage resource's diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 210b03c4ca..afdc8290cd 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -327,6 +327,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().numel_ubo(); } + inline vkapi::BufferBindInfo axis_mapping_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().axis_mapping_ubo(); + } + inline vkapi::BufferBindInfo texture_limits_ubo(const ValueRef idx) { return values_.at(idx).toTensor().texture_limits_ubo(); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl index b51d5a3f6e..8f113bd2cc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_buffer(0, "w", "nchw_out", DTYPE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec4", "sizes")} +${layout_declare_buffer(B, "w", "nchw_out", DTYPE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} 
+${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_mapping")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -51,7 +52,7 @@ void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); + const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim); if (any(greaterThanEqual(tensor_idx, sizes))) { return; diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 21eadff0b3..9dc06bd855 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -183,6 +183,42 @@ ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) { return tensor_idx; } +/* + * Derive (w,h,c,n) tensor indices from (x,y,z) texture position using axis + * mapping. + */ +ivec4 to_tensor_idx( + ivec3 pos, + ivec4 sizes, + const ivec4 axis_mapping, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + // Packed dim contains 4 elements per texel, so moving 1 unit traverses 4 + // elements in the tensor. + pos[axis_mapping[packed_dim]] *= 4; + + ivec4 tensor_idx; + for (int dim = 0; dim < 3; ++dim) { + tensor_idx[dim] = pos[axis_mapping[dim]]; + } + + // Early return if batch is 1. Batch index will be 0. + if (sizes.w == 1) { + tensor_idx.w = 0; + return tensor_idx; + } + + // Else, adjust the dim that's concatenated with batch. Note that the axis + // mapping for the batch dim indicates WHCN dim index of the dim that it is + // concatenated with, not a texture axis. 
+ tensor_idx.w = tensor_idx[axis_mapping[3]] / sizes[axis_mapping[3]]; + tensor_idx[axis_mapping[3]] %= sizes[axis_mapping[3]]; + + return tensor_idx; +} + /* * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim * is packed along a texel @@ -199,6 +235,34 @@ ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) { return pos; } +/* + * Derive (x,y,z) texture position from (w,h,c,n) tensor indices using axis + * mapping. + */ +ivec3 to_texture_pos( + const ivec4 idx, + ivec4 sizes, + const ivec4 axis_mapping, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec3 pos; + for (int dim = 0; dim < 3; ++dim) { + pos[axis_mapping[dim]] = idx[dim]; + } + + // Adjust batch dim if needed + if (sizes.w > 1) { + pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes.w; + } + + // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4 + // tensor elements in that dim. + pos[axis_mapping[packed_dim]] /= 4; + return pos; +} + /* * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which dim * is packed along a texel @@ -218,6 +282,35 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) { return pos; } +/* + * Derive (x,y,z,i) texel element position from the (w,h,c,n) tensor index using + * the axis mapping. + */ +ivec4 to_texture_elem_pos( + const ivec4 idx, + ivec4 sizes, + const ivec4 axis_mapping, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec4 pos; + for (int dim = 0; dim < 3; ++dim) { + pos[axis_mapping[dim]] = idx[dim]; + } + + // Adjust batch dim if needed + if (sizes.w > 1) { + pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes.w; + } + + // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4 + // tensor elements in that dim. 
+ pos[axis_mapping[packed_dim]] /= 4; + pos.w = idx[packed_dim] % 4; + return pos; +} + // // Texel Access and Storage // diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl index b1e3a0abdf..3ef984bfc9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl @@ -16,10 +16,11 @@ layout(std430) buffer; #extension GL_EXT_control_flow_attributes : require -${layout_declare_buffer(0, "w", "nchw_out", "int")} -${layout_declare_tensor(1, "r", "t_in", "int8", "texture3d")} -${layout_declare_ubo(2, "ivec4", "tensor_sizes")} -${layout_declare_ubo(3, "int", "out_numel")} +${layout_declare_buffer(B, "w", "nchw_out", "int")} +${layout_declare_tensor(B, "r", "t_in", "int8", "texture3d")} +${layout_declare_ubo(B, "ivec4", "tensor_sizes")} +${layout_declare_ubo(B, "ivec4", "axis_mapping")} +${layout_declare_ubo(B, "int", "out_numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl index abe9390480..04b6a26cc4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_buffer(1, "r", "nchw_in", DTYPE)} -${layout_declare_ubo(2, "ivec4", "sizes")} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_buffer(B, "r", "nchw_in", DTYPE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_mapping")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -53,7 +54,7 @@ VEC4_T read_texel(ivec4 tensor_idx) { void main() { const 
ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); + const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim); if (any(greaterThanEqual(tensor_idx, sizes))) { return; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl index 378cf09d12..813a174d2a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl @@ -16,9 +16,10 @@ layout(std430) buffer; #extension GL_EXT_control_flow_attributes : require -${layout_declare_tensor(0, "w", "t_out", "int8", "texture3d")} -${layout_declare_buffer(1, "r", "nchw_in", "int")} -${layout_declare_ubo(2, "ivec4", "tensor_sizes")} +${layout_declare_tensor(B, "w", "t_out", "int8", "texture3d")} +${layout_declare_buffer(B, "r", "nchw_in", "int")} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_mapping")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -36,7 +37,7 @@ int extend_sign(int x) { ivec4 read_texel(ivec4 tensor_idx) { const ivec4 buf_indices = get_texel_nchw_buffer_ixs( - tensor_idx, tensor_sizes, packed_dim); + tensor_idx, sizes, packed_dim); int shift = (1 << 8) - 1; ivec4 masks; @@ -51,7 +52,7 @@ ivec4 read_texel(ivec4 tensor_idx) { ivec4 out_tex = ivec4(0); [[unroll]] for (int i = 0; i < 4; ++i) { - if (tensor_idx[packed_dim] + i < tensor_sizes[packed_dim]) { + if (tensor_idx[packed_dim] + i < sizes[packed_dim]) { int in_texel = nchw_in[buf_indices[i] / 4]; int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4)); extracted_val = extend_sign(extracted_val); @@ -64,9 +65,9 @@ ivec4 read_texel(ivec4 tensor_idx) { void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, tensor_sizes, packed_dim); + const ivec4 tensor_idx 
= to_tensor_idx(pos, sizes, axis_mapping, packed_dim); - if (any(greaterThanEqual(tensor_idx, tensor_sizes))) { + if (any(greaterThanEqual(tensor_idx, sizes))) { return; } diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 74113197d4..dcdd2dccfa 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -106,7 +106,7 @@ ValueRef prepack_biases( graph.create_local_wg_size(v), vref, v, - {t->sizes_ubo()}, + {t->sizes_ubo(), t->axis_mapping_ubo()}, // Specialization constants {SV(t->packed_dim_whcn_idx())})); diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 9df5b73c1a..6a759e0fd2 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -31,7 +31,8 @@ void add_staging_to_tensor_node( graph.strides_ubo(out_tensor), graph.numel_ubo(out_tensor)}); } else { - ubos.append(graph.sizes_ubo(out_tensor)); + ubos.append( + {graph.sizes_ubo(out_tensor), graph.axis_mapping_ubo(out_tensor)}); } graph.execute_nodes().emplace_back(new ExecuteNode( @@ -69,7 +70,8 @@ void add_tensor_to_staging_node( graph.strides_ubo(in_tensor), graph.numel_ubo(in_tensor)}); } else { - ubos.append(graph.sizes_ubo(in_tensor)); + ubos.append( + {graph.sizes_ubo(in_tensor), graph.axis_mapping_ubo(in_tensor)}); } // Normally, the image_to_nchw shader is structured so that each thread reads @@ -113,7 +115,7 @@ ValueRef prepack( if (graph.is_buffer_storage(v)) { ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)}); } else { - ubos.append(graph.sizes_ubo(v)); + ubos.append({graph.sizes_ubo(v), graph.axis_mapping_ubo(v)}); } graph.prepack_nodes().emplace_back(new PrepackNode( diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h 
b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h index 45dfceb3f0..4bd8e9b900 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h +++ b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h @@ -32,7 +32,8 @@ constexpr DimIndex kChannel4D = DimIndex::DIM_3RD_LAST; constexpr DimIndex kBatch4D = DimIndex::DIM_4TH_LAST; inline DimIndex normalize_to_dim_index(const api::vTensor& v_in, int32_t dim) { - return static_cast(dim - v_in.dim()); + return dim < 0 ? static_cast(dim) + : static_cast(dim - v_in.dim()); } /* diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index a469a44dc1..4feaecced5 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -85,7 +85,8 @@ void record_nchw_to_image_op( vkapi::PipelineStage::COMPUTE, vkapi::MemoryAccessType::WRITE), src_buffer, - v_dst.sizes_ubo()); + v_dst.sizes_ubo(), + v_dst.axis_mapping_ubo()); } void record_image_to_nchw_op( @@ -106,7 +107,8 @@ void record_image_to_nchw_op( 0, dst_buffer, v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_src.sizes_ubo()); + v_src.sizes_ubo(), + v_src.axis_mapping_ubo()); } void record_int8_image_to_nchw_noint8_op( @@ -127,6 +129,7 @@ void record_int8_image_to_nchw_noint8_op( dst_buffer.buffer(), v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), v_src.sizes_ubo(), + v_src.axis_mapping_ubo(), v_src.numel_ubo()); } diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 1112548b85..53d0c820f4 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1007,10 +1007,16 @@ TEST_F(VulkanComputeAPITest, print_object_sizes) { // The actual sizes of each object is dependent on the platform. 
However, we // can alert ourselves to any significant changes in the sizes of these // objects by checking the `sizeof()` the class against some loose thresholds. - EXPECT_TRUE(sizeof(vTensor) < 1800); - EXPECT_TRUE(sizeof(Value) < 2400); + + // Current known size on 64 bit system: 1824 B + EXPECT_TRUE(sizeof(vTensor) < 2000); + // Current known size on 64 bit system: 1840 B + EXPECT_TRUE(sizeof(Value) < 2200); + // Current known size on 64 bit system: 240 B EXPECT_TRUE(sizeof(StagingBuffer) < 500); + // Current known size on 64 bit system: 384 B EXPECT_TRUE(sizeof(ComputeGraph) < 500); + // Current known size on 64 bit system: 248 B EXPECT_TRUE(sizeof(ExecuteNode) < 500); } @@ -1227,8 +1233,8 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { GraphConfig config; ComputeGraph graph(config); - std::vector size_big = {8, 64, 124}; - std::vector size_small = {8, 1, 124}; + std::vector size_big = {1, 8, 8}; + std::vector size_small = {1, 1, 8}; // Build graph @@ -1409,8 +1415,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { /*shared_object_idx = */ 4); // +2: t.sizes_ubo() for each staging shader + // +2: t.axis_mapping_ubo() for each staging shader // +2: staging buffer for each input tensor - EXPECT_TRUE(get_vma_allocation_count() == 4); + EXPECT_TRUE(get_vma_allocation_count() == 6); ValueRef c = graph.add_tensor( size_big, @@ -1427,8 +1434,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { // +2: alpha UBO, broadcast UBO for arithmetic shader // +1: t.sizes_ubo() uniform buffer for staging shader + // +1: t.axis_mapping_ubo() uniform buffer for staging shader // +1: staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 9); + EXPECT_TRUE(get_vma_allocation_count() == 12); ValueRef e = graph.add_tensor( size_big, @@ -1444,14 +1452,15 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { // +2: alpha UBO, broadcast UBO for arithmetic shader // +1: t.sizes_ubo() for 
staging shader + // +1: t.axis_mapping_ubo() for staging shader // +1 staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 13); + EXPECT_TRUE(get_vma_allocation_count() == 17); graph.prepare(); graph.encode_execute(); // +3: shared memory allocations for tensors - EXPECT_TRUE(get_vma_allocation_count() == 16); + EXPECT_TRUE(get_vma_allocation_count() == 20); // Run graph diff --git a/backends/xnnpack/README.md b/backends/xnnpack/README.md index 33a0bfaf30..0c3d7e1442 100644 --- a/backends/xnnpack/README.md +++ b/backends/xnnpack/README.md @@ -105,9 +105,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 5a17c8745d..4d1a0ac123 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -38,6 +38,7 @@ build_android_native_library() { -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ @@ -139,7 +140,9 @@ collect_artifacts_to_be_uploaded() { BUILD_AAR_DIR="$(mktemp -d)" export BUILD_AAR_DIR -ANDROID_ABIS=("arm64-v8a" "x86_64") +if [ -z "$ANDROID_ABIS" ]; then + ANDROID_ABIS=("arm64-v8a" "x86_64") +fi export ANDROID_ABIS ARTIFACTS_DIR_NAME="$1" diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh index 8bd9e0539f..348111e2b4 100755 --- a/build/build_apple_frameworks.sh +++ b/build/build_apple_frameworks.sh @@ -163,6 +163,7 @@ cmake_build() { 
-DEXECUTORCH_BUILD_COREML=$COREML \ -DEXECUTORCH_BUILD_MPS=$MPS \ -DEXECUTORCH_BUILD_XNNPACK=$XNNPACK \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ -DEXECUTORCH_BUILD_EXTENSION_APPLE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index 5abaaeb7ce..230f007d3f 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -136,6 +136,7 @@ cmake .. \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF @@ -167,6 +168,7 @@ cmake .. \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index a086581146..9c03399444 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -201,9 +201,9 @@ Create a file called main.cpp with the following contents: #include "basic_sampler.h" #include "basic_tokenizer.h" -#include "managed_tensor.h" #include +#include #include #include #include @@ -244,14 +244,13 @@ std::string generate( for (auto i = 0u; i < max_output_length; i++) { // Convert the input_tokens from a vector of int64_t to EValue. // EValue is a unified data type in the ExecuTorch runtime. 
- ManagedTensor tensor_tokens( + auto inputs = from_blob( input_tokens.data(), {1, static_cast(input_tokens.size())}, ScalarType::Long); - std::vector inputs = {tensor_tokens.get_tensor()}; // Run the model. It will return a tensor of logits (log-probabilities). - Result> logits_evalue = llm_model.forward(inputs); + auto logits_evalue = llm_model.forward(inputs); // Convert the output logits from EValue to std::vector, which is what // the sampler expects. @@ -339,7 +338,6 @@ Finally, download the following files into the same directory as main.h: ``` curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_sampler.h curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_tokenizer.h -curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/managed_tensor.h ``` To learn more, see the [Runtime APIs Tutorial](../extension-module.md). @@ -364,6 +362,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) # Include the executorch subdirectory. @@ -377,6 +376,7 @@ target_link_libraries( PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib) # Provides baseline cross-platform kernels ``` @@ -386,7 +386,6 @@ At this point, the working directory should contain the following files: - main.cpp - basic_tokenizer.h - basic_sampler.h -- managed_tensor.h - export_nanogpt.py - model.py - vocab.json @@ -518,6 +517,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. 
option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend @@ -534,6 +534,7 @@ target_link_libraries( PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels xnnpack_backend) # Provides the XNNPACK CPU acceleration backend ``` @@ -548,7 +549,6 @@ At this point, the working directory should contain the following files: - main.cpp - basic_tokenizer.h - basic_sampler.h -- managed_tensor.h - export_nanogpt.py - model.py - vocab.json diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index 4491a6e8c8..8afa6d6fe7 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -149,9 +149,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 807561f44b..9af1f5266e 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -78,6 +78,7 @@ cmake . 
-DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DEXECUTORCH_BUILD_XNNPACK=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -Bcmake-android-out

 cmake --build cmake-android-out -j16 --target install
@@ -119,6 +120,7 @@ cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -Bcmake-android-out

 cmake --build cmake-android-out -j16 --target install
diff --git a/examples/demo-apps/android/ExecuTorchDemo/setup.sh b/examples/demo-apps/android/ExecuTorchDemo/setup.sh
index 05dc3e4492..00d9201b09 100644
--- a/examples/demo-apps/android/ExecuTorchDemo/setup.sh
+++ b/examples/demo-apps/android/ExecuTorchDemo/setup.sh
@@ -15,6 +15,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
     -DEXECUTORCH_BUILD_XNNPACK=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DCMAKE_BUILD_TYPE=Release \
     -B"${CMAKE_OUT}"
diff --git a/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml b/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml
index cac83b8e6f..896e7b73fb 100644
--- a/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml
+++ b/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml
@@ -73,8 +73,30 @@ phases:
           fi
         fi;

+      # Run the new generic benchmark activity https://developer.android.com/tools/adb#am
+      - echo "Run LLM benchmark"
+      - |
+        adb -s $DEVICEFARM_DEVICE_UDID shell am start -W -n com.example.executorchllamademo/.LlmBenchmarkRunner \
+          --es "model_dir" "/data/local/tmp/llama" \
+          --es "tokenizer_path" "/data/local/tmp/llama/tokenizer.bin"
+
   post_test:
     commands:
+      - echo "Gather LLM benchmark results"
+      - |
+        BENCHMARK_RESULTS=""
        
ATTEMPT=0 + MAX_ATTEMPT=10 + while [ -z "${BENCHMARK_RESULTS}" ] && [ $ATTEMPT -lt $MAX_ATTEMPT ]; do + echo "Waiting for benchmark results..." + BENCHMARK_RESULTS=$(adb -s $DEVICEFARM_DEVICE_UDID shell run-as com.example.executorchllamademo cat files/benchmark_results.json) + sleep 30 + ((ATTEMPT++)) + done + + adb -s $DEVICEFARM_DEVICE_UDID shell run-as com.example.executorchllamademo ls -la files/ + # Trying to pull the file using adb ends up with permission error, but this works too, so why not + echo "${BENCHMARK_RESULTS}" > $DEVICEFARM_LOG_DIR/benchmark_results.json artifacts: # By default, Device Farm will collect your artifacts from the $DEVICEFARM_LOG_DIR directory. diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java index 33b230b1df..cee623507f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java @@ -14,8 +14,11 @@ import android.util.Log; import android.widget.TextView; import androidx.annotation.NonNull; +import com.google.gson.Gson; +import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.util.Arrays; public class LlmBenchmarkRunner extends Activity implements ModelRunnerCallback { ModelRunner mModelRunner; @@ -32,7 +35,12 @@ protected void onCreate(Bundle savedInstanceState) { Intent intent = getIntent(); - String modelPath = intent.getStringExtra("model_path"); + File modelDir = new File(intent.getStringExtra("model_dir")); + File model = + Arrays.stream(modelDir.listFiles()) + .filter(file -> file.getName().endsWith(".pte")) + .findFirst() + .get(); String tokenizerPath = intent.getStringExtra("tokenizer_path"); float temperature = 
intent.getFloatExtra("temperature", 0.8f); @@ -42,7 +50,7 @@ protected void onCreate(Bundle savedInstanceState) { } mStatsDump = new StatsDump(); - mModelRunner = new ModelRunner(modelPath, tokenizerPath, temperature, this); + mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this); mStatsDump.loadStart = System.currentTimeMillis(); } @@ -79,11 +87,21 @@ public void onGenerationStopped() { mTextView.append(mStatsDump.toString()); }); + // TODO (huydhn): Remove txt files here once the JSON format is ready try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) { writer.write(mStatsDump.toString()); } catch (IOException e) { e.printStackTrace(); } + + // TODO (huydhn): Figure out on what the final JSON results looks like, we need something + // with the same number of fields as https://github.com/pytorch/pytorch/pull/135042 + try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { + Gson gson = new Gson(); + writer.write(gson.toJson(mStatsDump)); + } catch (IOException e) { + e.printStackTrace(); + } } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index f24254efb3..96b200303c 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -73,8 +73,15 @@ public class MainActivity extends AppCompatActivity implements Runnable, LlamaCa @Override public void onResult(String result) { - mResultMessage.appendText(result); - run(); + if (result.equals("\n\n")) { + if (!mResultMessage.getText().isEmpty()) { + mResultMessage.appendText(result); + run(); + } + } else { + mResultMessage.appendText(result); + run(); + } } @Override @@ -614,6 +621,7 @@ 
public void run() { ModelUtils.VISION_MODEL_IMAGE_CHANNELS, prompt, ModelUtils.VISION_MODEL_SEQ_LEN, + false, MainActivity.this); } else { // no image selected, we pass in empty int array @@ -624,10 +632,12 @@ public void run() { ModelUtils.VISION_MODEL_IMAGE_CHANNELS, prompt, ModelUtils.VISION_MODEL_SEQ_LEN, + false, MainActivity.this); } } else { - mModule.generate(prompt, ModelUtils.TEXT_MODEL_SEQ_LEN, MainActivity.this); + mModule.generate( + prompt, ModelUtils.TEXT_MODEL_SEQ_LEN, false, MainActivity.this); } long generateDuration = System.currentTimeMillis() - generateStartTime; diff --git a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh index 5e3ac6fc01..87d0f47c95 100644 --- a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh +++ b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh @@ -16,6 +16,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_QNN=ON \ diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index ccb2a788d6..91a68d4b88 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -16,6 +16,7 @@ cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ diff --git a/examples/llm_manual/CMakeLists.txt b/examples/llm_manual/CMakeLists.txt index 185665180f..e5054a683a 100644 --- a/examples/llm_manual/CMakeLists.txt +++ b/examples/llm_manual/CMakeLists.txt @@ -13,6 +13,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend @@ -29,6 +30,7 @@ target_link_libraries( nanogpt_runner PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib # Provides baseline cross-platform # kernels xnnpack_backend diff --git a/examples/llm_manual/main.cpp b/examples/llm_manual/main.cpp index c0fc482542..3c4ecd71af 100644 --- a/examples/llm_manual/main.cpp +++ b/examples/llm_manual/main.cpp @@ -10,9 +10,9 @@ #include "basic_sampler.h" #include "basic_tokenizer.h" -#include "managed_tensor.h" #include +#include #include #include #include @@ -42,14 +42,13 @@ std::string generate( for (auto i = 0u; i < max_output_length; i++) { // Convert the input_tokens from a vector of int64_t to EValue. // EValue is a unified data type in the ExecuTorch runtime. - ManagedTensor tensor_tokens( + auto inputs = from_blob( input_tokens.data(), {1, static_cast(input_tokens.size())}, ScalarType::Long); - std::vector inputs = {tensor_tokens.get_tensor()}; // Run the model. It will return a tensor of logits (log-probabilities). 
- Result> logits_evalue = llm_model.forward(inputs); + auto logits_evalue = llm_model.forward(inputs); // Convert the output logits from EValue to std::vector, which is what // the sampler expects. diff --git a/examples/llm_manual/managed_tensor.h b/examples/llm_manual/managed_tensor.h deleted file mode 100644 index 204b38aa4e..0000000000 --- a/examples/llm_manual/managed_tensor.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -/** - * Creates and owns the necessary metadata for a Tensor instance. Does not own - * the data pointer. - */ -class ManagedTensor { - public: - ManagedTensor( - void* data, - const std::vector& sizes, - exec_aten::ScalarType dtype) - : sizes_(sizes), - tensor_impl_( - /*type=*/dtype, - /*dim=*/sizes_.size(), - /*sizes=*/sizes_.data(), - /*data=*/data, - /*dim_order=*/nullptr, - /*strides=*/nullptr, - /*dynamism=*/ - executorch::runtime::TensorShapeDynamism::DYNAMIC_BOUND) {} - - /** - * Get the Tensor object managed by this class. - */ - exec_aten::Tensor get_tensor() { - return exec_aten::Tensor(&tensor_impl_); - } - - private: - std::vector sizes_; - exec_aten::TensorImpl tensor_impl_; -}; diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask.cpp b/examples/models/flamingo/cross_attention/cross_attention_mask.cpp index b2a2a6a806..06887ec473 100644 --- a/examples/models/flamingo/cross_attention/cross_attention_mask.cpp +++ b/examples/models/flamingo/cross_attention/cross_attention_mask.cpp @@ -6,12 +6,11 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include #include -#include -#include - namespace torch::executor { // Fowrward declaration needed for ARM compilers. 
@@ -97,7 +96,7 @@ std::vector> _get_image_attention_intervals( return vision_masks; } -std::vector cross_attention_mask( +std::vector cross_attention_mask( const std::vector& tokens, const std::vector& images, size_t tile_size, @@ -121,7 +120,7 @@ std::vector cross_attention_mask( // Create mask for each individual image based on its number of tokens, // which can vary based on number of tiles since they are not yet tile padded. // The masks are padded and concatenated together in the batch collator. - std::vector cross_attention_masks; + std::vector cross_attention_masks; size_t text_seq_len = tokens.size(); for (size_t image_idx = 0; image_idx < image_intervals.size(); ++image_idx) { size_t n_tiles = images[image_idx].size(0); @@ -140,7 +139,8 @@ std::vector cross_attention_mask( size_t stride = image_seq_len; std::vector mask_data(num_elements); - ManagedTensor mask(mask_data.data(), sizes, ScalarType::Int); + auto mask = executorch::extension::from_blob( + mask_data.data(), sizes, ScalarType::Int); cross_attention_masks.emplace_back(std::move(mask)); // Add the allocated data to the output vector. diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask.h b/examples/models/flamingo/cross_attention/cross_attention_mask.h index 6998d91ad4..ccbc9eb171 100644 --- a/examples/models/flamingo/cross_attention/cross_attention_mask.h +++ b/examples/models/flamingo/cross_attention/cross_attention_mask.h @@ -8,11 +8,11 @@ #pragma once -#include -#include - #include +#include +#include + namespace torch { namespace executor { @@ -59,7 +59,7 @@ namespace executor { * * @returns A vector of cross attention masks, as Tensors, one for each image. 
*/ -std::vector cross_attention_mask( +std::vector<::executorch::extension::TensorPtr> cross_attention_mask( const std::vector& tokens, const std::vector& images, size_t tile_size, diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp b/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp index 5b9e58c216..b232212fa3 100644 --- a/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp +++ b/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp @@ -7,10 +7,10 @@ */ #include + #include using namespace ::testing; -using torch::executor::ManagedTensor; using torch::executor::ScalarType; using torch::executor::Tensor; using torch::executor::TensorImpl; @@ -41,29 +41,27 @@ TEST(CrossAttentxnMaskTest, TestCrossAttentionMask) { std::vector images = {a, b, c}; std::vector> mask_data; - std::vector output_masks = - torch::executor::cross_attention_mask( - tokens, - images, - /*tile_size=*/1, - /*patch_size=*/1, - /*image_token_id=*/1, - /*out=*/mask_data); + auto output_masks = torch::executor::cross_attention_mask( + tokens, + images, + /*tile_size=*/1, + /*patch_size=*/1, + /*image_token_id=*/1, + /*out=*/mask_data); // Check contents of the mask. 
std::vector> expected_intervals = { {0, 7}, {1, 7}, {7, 12}}; for (size_t mask_idx = 0; mask_idx < output_masks.size(); ++mask_idx) { - ManagedTensor& output_mask = output_masks[mask_idx]; - Tensor output_tensor = output_mask.get_aliasing_tensor(); - for (size_t i = 0; i < output_tensor.size(0); ++i) { - for (size_t j = 0; j < output_tensor.strides()[0]; ++j) { - size_t unrolled_index = i * output_tensor.strides()[0] + j; + auto& output_tensor = output_masks[mask_idx]; + for (size_t i = 0; i < output_tensor->size(0); ++i) { + for (size_t j = 0; j < output_tensor->strides()[0]; ++j) { + size_t unrolled_index = i * output_tensor->strides()[0] + j; if (i >= expected_intervals[mask_idx][0] && i < expected_intervals[mask_idx][1]) { - EXPECT_EQ(output_tensor.const_data_ptr()[unrolled_index], 1); + EXPECT_EQ(output_tensor->const_data_ptr()[unrolled_index], 1); } else { - EXPECT_EQ(output_tensor.const_data_ptr()[unrolled_index], 0); + EXPECT_EQ(output_tensor->const_data_ptr()[unrolled_index], 0); } } } diff --git a/examples/models/flamingo/cross_attention/targets.bzl b/examples/models/flamingo/cross_attention/targets.bzl index 7bc13270aa..c3d9da0156 100644 --- a/examples/models/flamingo/cross_attention/targets.bzl +++ b/examples/models/flamingo/cross_attention/targets.bzl @@ -12,8 +12,8 @@ def define_common_targets(): srcs = ["cross_attention_mask.cpp"], exported_headers = ["cross_attention_mask.h"], exported_deps = [ + "//executorch/extension/tensor:tensor", "//executorch/runtime/core/exec_aten:lib", - "//executorch/extension/runner_util:managed_tensor", "//executorch/runtime/core/exec_aten/util:tensor_util", ], ) diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index ea95c7f965..09ada515a1 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -200,8 +200,9 @@ The Wikitext results generated above used: `{max_seq_len: 2048, limit: 1000}` -DCMAKE_INSTALL_PREFIX=cmake-out \ -DEXECUTORCH_ENABLE_LOGGING=1 \ 
-DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ @@ -251,8 +252,9 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX=cmake-out-android \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_LOGGING=1 \ -DPYTHON_EXECUTABLE=python \ -DEXECUTORCH_BUILD_XNNPACK=ON \ diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 1dac12cc85..c19ddd58a2 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -313,7 +313,6 @@ def build_args_parser() -> argparse.ArgumentParser: def canonical_path(path: Union[str, Path], *, dir: bool = False) -> str: - path = str(path) if verbose_export(): @@ -424,6 +423,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: verbose=args.verbose, max_seq_len=args.max_seq_length, metadata_str=args.metadata, + args=args, ) .set_output_dir(output_dir_path) .to_dtype(dtype_override) @@ -633,6 +633,7 @@ def _load_llama_model( verbose: bool = False, max_seq_len: int = 128, metadata_str: Optional[str] = None, + args, ) -> "LLMEdgeManager": """ A helper util that builds a Llama2 model. 
It returns a LLMEdgeManager that @@ -694,4 +695,5 @@ def _load_llama_model( model.params, metadata_str, ), + args=args, ) diff --git a/examples/models/llama2/runner/CMakeLists.txt b/examples/models/llama2/runner/CMakeLists.txt index abad63a3b5..79fcd267af 100644 --- a/examples/models/llama2/runner/CMakeLists.txt +++ b/examples/models/llama2/runner/CMakeLists.txt @@ -75,8 +75,8 @@ add_subdirectory( ) set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) -set(llama_runner_deps executorch extension_module extension_data_loader - re2::re2 +set(llama_runner_deps executorch extension_data_loader extension_module + extension_tensor re2::re2 ) target_link_libraries(llama_runner PUBLIC ${llama_runner_deps}) diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index 0a5d773092..1e17c75400 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -14,7 +14,6 @@ #include #include -#include #include #include @@ -144,7 +143,8 @@ Error Runner::generate( const std::string& prompt, int32_t seq_len, std::function token_callback, - std::function stats_callback) { + std::function stats_callback, + bool echo) { // Prepare the inputs. // Use ones-initialized inputs. ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); @@ -154,6 +154,11 @@ Error Runner::generate( stats_.model_load_end_ms = util::time_in_ms(); } + ET_LOG( + Info, + "RSS after loading model: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); + // Wrap the token_callback with print function std::function wrapped_callback = [token_callback](const std::string& piece) { @@ -204,9 +209,11 @@ Error Runner::generate( // after the prompt. After that we will enter generate loop. 
// print prompts - wrapped_callback(prompt); - - auto prefill_res = text_prefiller_->prefill(prompt_tokens, 0); + if (echo) { + wrapped_callback(prompt); + } + int64_t pos = 0; + auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos); stats_.first_token_ms = util::time_in_ms(); stats_.prompt_eval_end_ms = util::time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); @@ -214,6 +221,10 @@ Error Runner::generate( // print the first token from prefill. No prev_token so use cur_token for it. wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token))); + ET_LOG( + Info, + "RSS after prompt prefill: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); // start the main loop prompt_tokens.push_back(cur_token); @@ -222,6 +233,10 @@ Error Runner::generate( stats_.inference_end_ms = util::time_in_ms(); printf("\n"); + ET_LOG( + Info, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); if (num_prompt_tokens + num_generated_tokens == seq_len) { ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h index 4e3c1daef7..cec8c61157 100644 --- a/examples/models/llama2/runner/runner.h +++ b/examples/models/llama2/runner/runner.h @@ -40,7 +40,8 @@ class Runner { const std::string& prompt, int32_t seq_len = 128, std::function token_callback = {}, - std::function stats_callback = {}); + std::function stats_callback = {}, + bool echo = true); void stop(); private: diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama2/runner/targets.bzl index 475c5d92ab..9ee3f99567 100644 --- a/examples/models/llama2/runner/targets.bzl +++ b/examples/models/llama2/runner/targets.bzl @@ -34,8 +34,8 @@ def define_common_targets(): "//executorch/extension/llm/runner:text_prefiller" + aten_suffix, "//executorch/extension/llm/runner:text_token_generator" + aten_suffix, 
"//executorch/extension/evalue_util:print_evalue" + aten_suffix, - "//executorch/extension/runner_util:managed_tensor" + aten_suffix, "//executorch/extension/module:module" + aten_suffix, + "//executorch/extension/tensor:tensor" + aten_suffix, "//executorch/kernels/quantized:generated_lib" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index 444f6b3389..c36e39a04c 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -21,6 +21,9 @@ project(llava) # Duplicating options as root CMakeLists.txt option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF) +# This is a temporary hack to get around Torch dep so we can test this on android +option(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE "Hack option to feed dummy image to remove torch.load dep" OFF) + include(CMakeDependentOption) # # pthreadpool: build pthreadpool library. Disable on unsupported platforms @@ -70,7 +73,14 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags) find_package(gflags REQUIRED) -find_package(Torch CONFIG REQUIRED) +# Avoid torch dep from torch.load()-ing the image. +# This is a temporary hack. 
+if(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE) + add_definitions(-DLLAVA_NO_TORCH_DUMMY_IMAGE=1) + message("Buidling the runner without Torch, feeding a dummy image!") +else() + find_package(Torch CONFIG REQUIRED) +endif() add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) # @@ -95,7 +105,11 @@ endif() # llava_runner library add_subdirectory(runner) -set(link_libraries gflags torch) +set(LINK_LIBS gflags) +if(NOT LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE) + list(APPEND LINK_LIBS torch) +endif() +set(link_libraries ${LINK_LIBS}) set(_srcs main.cpp) if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) diff --git a/examples/models/llava/README.md b/examples/models/llava/README.md index 807e1b3cee..8cb605d75f 100644 --- a/examples/models/llava/README.md +++ b/examples/models/llava/README.md @@ -34,8 +34,9 @@ Run the following cmake commands from `executorch/`: cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 4f8a403bb3..bdeaef15fe 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -89,6 +89,7 @@ def forward(self, input_pos, embeddings): use_kv_cache=True, example_inputs=(torch.tensor([0], dtype=torch.int64), embeddings), dynamic_shapes=dynamic_shapes, + args=llava.text_model_args, ) dtype_override = DType.fp32 @@ -145,6 +146,7 @@ def forward(self, images): use_kv_cache=True, example_inputs=(resized,), dynamic_shapes=dynamic_shapes, + args=None, ) .capture_pre_autograd_graph() .pt2e_quantize([quantizer]) @@ -211,10 +213,15 @@ def export_all(llava_model: LlavaModel): partitioner={ "image_encoder": [XnnpackPartitioner()], "text_model": [ + # First 
partition the DQLinear nodes, then partition the rest of the nodes, + # to avoid multiple DQLinear nodes in the same partition, + # to avoid holding multiple unpacked and packed weight buffers in memory, + # to reduce peak memory footprint. XnnpackPartitioner( config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, per_op_mode=True, - ) + ), + XnnpackPartitioner(), ], }, compile_config=EdgeCompileConfig(_check_ir_validity=False), diff --git a/examples/models/llava/install_requirements.sh b/examples/models/llava/install_requirements.sh index 3bf803b356..931d63b391 100644 --- a/examples/models/llava/install_requirements.sh +++ b/examples/models/llava/install_requirements.sh @@ -7,6 +7,6 @@ set -x -pip install transformers accelerate +pip install transformers accelerate sentencepiece pip list diff --git a/examples/models/llava/main.cpp b/examples/models/llava/main.cpp index 171eb77077..53f6329b4d 100644 --- a/examples/models/llava/main.cpp +++ b/examples/models/llava/main.cpp @@ -8,7 +8,11 @@ #include #include +#ifndef LLAVA_NO_TORCH_DUMMY_IMAGE #include +#else +#include // std::fill +#endif #if defined(ET_USE_THREADPOOL) #include @@ -80,6 +84,15 @@ int32_t main(int32_t argc, char** argv) { // read image and resize the longest edge to 336 std::vector image_data; + +#ifdef LLAVA_NO_TORCH_DUMMY_IMAGE + // Work without torch using a random data + image_data.resize(3 * 240 * 336); + std::fill(image_data.begin(), image_data.end(), 0); // black + std::array image_shape = {3, 240, 336}; + std::vector images = { + {.data = image_data, .width = image_shape[2], .height = image_shape[1]}}; +#else // LLAVA_NO_TORCH_DUMMY_IMAGE // cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR); // int longest_edge = std::max(image.rows, image.cols); // float scale_factor = 336.0f / longest_edge; @@ -102,6 +115,8 @@ int32_t main(int32_t argc, char** argv) { {.data = image_data, .width = static_cast(image_tensor.size(2)), .height = static_cast(image_tensor.size(1))}}; +#endif // 
LLAVA_NO_TORCH_DUMMY_IMAGE + // generate runner.generate(std::move(images), prompt, seq_len); return 0; diff --git a/examples/models/llava/runner/CMakeLists.txt b/examples/models/llava/runner/CMakeLists.txt index 564d31f8e7..2d0c30a620 100644 --- a/examples/models/llava/runner/CMakeLists.txt +++ b/examples/models/llava/runner/CMakeLists.txt @@ -40,8 +40,8 @@ add_subdirectory( add_library(llava_runner STATIC ${_llava_runner__srcs}) -set(llava_runner_deps executorch extension_module extension_data_loader - extension_llm_runner +set(llava_runner_deps executorch extension_data_loader extension_llm_runner + extension_module extension_tensor ) target_link_libraries(llava_runner PUBLIC ${llava_runner_deps}) diff --git a/examples/models/llava/runner/llava_image_prefiller.h b/examples/models/llava/runner/llava_image_prefiller.h index 50c981026a..3597ff82ef 100644 --- a/examples/models/llava/runner/llava_image_prefiller.h +++ b/examples/models/llava/runner/llava_image_prefiller.h @@ -11,7 +11,7 @@ #pragma once #include -#include +#include namespace torch::executor { @@ -26,18 +26,18 @@ class LlavaImagePrefiller : public ImagePrefiller { */ inline Result prefill(Image& image, int64_t& start_pos) override { - ManagedTensor managed_images( + auto image_tensor = executorch::extension::from_blob( image.data.data(), {3, image.height, image.width}, ScalarType::Byte); // Run image encoder - std::vector image_encoder_outputs = ET_UNWRAP(module_->execute( - kImageEncoderMethod, managed_images.get_aliasing_tensor())); + auto image_encoder_outputs = + ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor)); // inputs:[start_pos, embeds] - ManagedTensor managed_start_pos(&start_pos, {1}, ScalarType::Long); - auto start_pos_tensor = managed_start_pos.get_aliasing_tensor(); + auto start_pos_tensor = + executorch::extension::from_blob(&start_pos, {1}, ScalarType::Long); // Run text model - std::vector outputs_res = ET_UNWRAP(module_->execute( + auto outputs_res = 
ET_UNWRAP(module_->execute( kTextModelMethod, {start_pos_tensor, image_encoder_outputs[0]})); ET_CHECK_MSG( outputs_res[0].isTensor(), diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index 0fc06da0c5..64763c7257 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -72,6 +72,54 @@ Error LlavaRunner::load() { return Error::Ok; } +Error LlavaRunner::prefill_images( + std::vector& images, + int64_t& start_pos) { + for (auto& image : images) { + // pos is updated inside image prefill. + ET_UNWRAP(image_prefiller_->prefill(image, start_pos)); + } + return Error::Ok; +} + +Result LlavaRunner::prefill_prompt( + const std::string& prompt, + int64_t& start_pos, + int8_t bos, + int8_t eos) { + std::vector prompt_tokens = + ET_UNWRAP(tokenizer_->encode(prompt, bos, eos)); + + return text_prefiller_->prefill(prompt_tokens, start_pos); +} + +Error LlavaRunner::generate_from_pos( + const std::string& prompt, + int32_t seq_len, + int64_t start_pos, + std::function token_callback, + std::function + stats_callback) { + // prefill user prompt. No BOS because preset prompt already has it. 
+ token_callback(prompt); + + uint64_t prefill_next_token = + ET_UNWRAP(prefill_prompt(prompt, start_pos, /*bos=*/0, /*eos*/ 0)); + stats_.num_prompt_tokens = start_pos; + + // Generate tokens + int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( + {prefill_next_token}, start_pos, seq_len, token_callback)); + + // Bookkeeping + stats_.num_generated_tokens = num_generated_tokens; + ::executorch::llm::print_report(stats_); + if (stats_callback) { + stats_callback(stats_); + } + return Error::Ok; +} + Error LlavaRunner::generate( std::vector images, const std::string& prompt, @@ -83,6 +131,11 @@ Error LlavaRunner::generate( ET_CHECK_OK_OR_RETURN_ERROR(load()); } + ET_LOG( + Info, + "RSS after loading model: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); + // Wrap the token_callback with print function std::function wrapped_callback = [token_callback](const std::string& piece) { @@ -96,43 +149,26 @@ Error LlavaRunner::generate( int64_t pos = 0; // prefill preset prompt - std::vector preset_prompt_tokens = - ET_UNWRAP(tokenizer_->encode(kPresetPrompt, /*bos=*/1, /*eos=*/0)); - size_t num_preset_tokens = preset_prompt_tokens.size(); - - ET_UNWRAP(text_prefiller_->prefill(preset_prompt_tokens, pos)); - pos += num_preset_tokens; + prefill_prompt(kPresetPrompt, pos, /*bos=*/1, /*eos*/ 0); // prefill images - for (auto& image : images) { - // pos is updated inside image prefill. - ET_UNWRAP(image_prefiller_->prefill(image, pos)); - } + prefill_images(images, pos); - // prefill user prompt. No BOS because preset prompt already has it. 
- wrapped_callback(prompt); - - std::vector user_prompt_tokens = - ET_UNWRAP(tokenizer_->encode(prompt, /*bos=*/0, /*eos=*/0)); - size_t num_user_tokens = user_prompt_tokens.size(); - - uint64_t prefill_next_token = - ET_UNWRAP(text_prefiller_->prefill(user_prompt_tokens, pos)); - pos += num_user_tokens; + ET_LOG( + Info, + "RSS after prompt and image prefill: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); // Generate tokens - int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( - {prefill_next_token}, pos, seq_len, wrapped_callback)); + Error err = + generate_from_pos(prompt, seq_len, pos, wrapped_callback, stats_callback); - // Bookkeeping - stats_.num_prompt_tokens = num_preset_tokens + num_user_tokens; - stats_.num_generated_tokens = num_generated_tokens; - ::executorch::llm::print_report(stats_); - if (stats_callback) { - stats_callback(stats_); - } + ET_LOG( + Info, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); - return Error::Ok; + return err; } } // namespace torch::executor diff --git a/examples/models/llava/runner/llava_runner.h b/examples/models/llava/runner/llava_runner.h index 9b14bc9283..923f8180a8 100644 --- a/examples/models/llava/runner/llava_runner.h +++ b/examples/models/llava/runner/llava_runner.h @@ -38,6 +38,48 @@ class LlavaRunner : public MultimodalRunner { std::function stats_callback = {}); + /** + * Prefill an LLaVA Module with the given images input. + * @param images The image input to LLaVA. + * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. + * @return The error status of prefilling images. + */ + Error prefill_images(std::vector& images, int64_t& start_pos); + + /** + * Prefill an LLaVA Module with the given text input. + * @param prompt The text prompt to LLaVA. 
+ * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. + * @param bos The number of BOS (begin of sequence) token. + * @param eos The number of EOS (end of sequence) token. + * @return The generated token of the LLaVA Module after prefill prompt. + */ + Result prefill_prompt( + const std::string& prompt, + int64_t& start_pos, + int8_t bos = 0, + int8_t eos = 0); + + /** + * Generate tokens from the given prompt, starting from the given position. + * @param prompt The text prompt to LLaVA. + * @param seq_len The total sequence length, including the prompt tokens and + * new tokens. + * @param start_pos The starting position in KV cache of the input in the LLM. + * @param token_callback What to do after a token is generated. + * @param stats_callback What to do with Stats. + * @return The error code. + */ + Error generate_from_pos( + const std::string& prompt, + int32_t seq_len = 1024, + int64_t start_pos = 0, + std::function token_callback = {}, + std::function + stats_callback = {}); + private: inline static const std::string kPresetPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. 
USER: "; diff --git a/examples/models/llava/runner/llava_text_decoder_runner.h b/examples/models/llava/runner/llava_text_decoder_runner.h index e70ba59d51..a58bcc47e0 100644 --- a/examples/models/llava/runner/llava_text_decoder_runner.h +++ b/examples/models/llava/runner/llava_text_decoder_runner.h @@ -20,17 +20,14 @@ class LlavaTextDecoderRunner : public TextDecoderRunner { : TextDecoderRunner(module, true, vocab_size, temperature){}; inline Result step( - ManagedTensor& managed_tokens, - ManagedTensor& managed_start_pos) override { - auto tokens = managed_tokens.get_aliasing_tensor(); - auto start_pos = managed_start_pos.get_aliasing_tensor(); - + executorch::extension::TensorPtr& tokens, + executorch::extension::TensorPtr& start_pos) override { // run token embedding - std::vector token_embedding_outputs = + auto token_embedding_outputs = ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, tokens)); // run text model - std::vector outputs_res = ET_UNWRAP(module_->execute( + auto outputs_res = ET_UNWRAP(module_->execute( kTextModelMethod, {start_pos, token_embedding_outputs[0]})); ET_CHECK_MSG( diff --git a/examples/models/llava/runner/targets.bzl b/examples/models/llava/runner/targets.bzl index 72942acf16..c7523d6cc4 100644 --- a/examples/models/llava/runner/targets.bzl +++ b/examples/models/llava/runner/targets.bzl @@ -16,8 +16,8 @@ def define_common_targets(): "//executorch/extension/llm/runner:runner_lib", "//executorch/extension/llm/tokenizer:bpe_tokenizer", "//executorch/extension/evalue_util:print_evalue", - "//executorch/extension/runner_util:managed_tensor", "//executorch/extension/module:module", + "//executorch/extension/tensor:tensor", "//executorch/kernels/quantized:generated_lib", "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/core/exec_aten/util:tensor_util", diff --git a/examples/models/phi-3-mini/CMakeLists.txt b/examples/models/phi-3-mini/CMakeLists.txt index 39358e088e..e1ffd0da05 100644 --- 
a/examples/models/phi-3-mini/CMakeLists.txt +++ b/examples/models/phi-3-mini/CMakeLists.txt @@ -23,6 +23,7 @@ set(CMAKE_BUILD_TYPE Release) # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) option(EXECUTORCH_BUILD_XNNPACK "" ON) @@ -47,6 +48,6 @@ target_include_directories( PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags/src ) target_link_libraries( - phi_3_mini_runner PRIVATE executorch extension_module_static + phi_3_mini_runner PRIVATE executorch extension_module_static extension_tensor optimized_native_cpu_ops_lib xnnpack_backend gflags ) diff --git a/examples/models/phi-3-mini/README.md b/examples/models/phi-3-mini/README.md index 6619a111a2..1926971621 100644 --- a/examples/models/phi-3-mini/README.md +++ b/examples/models/phi-3-mini/README.md @@ -26,8 +26,9 @@ python -m examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-m -DCMAKE_INSTALL_PREFIX=cmake-out \ -DEXECUTORCH_ENABLE_LOGGING=1 \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ diff --git a/examples/models/phi-3-mini/runner.cpp b/examples/models/phi-3-mini/runner.cpp index a6cee57ea8..9da323278f 100644 --- a/examples/models/phi-3-mini/runner.cpp +++ b/examples/models/phi-3-mini/runner.cpp @@ -12,7 +12,7 @@ #include #include -#include +#include #include namespace torch::executor { @@ -81,23 +81,17 @@ uint64_t Runner::logits_to_token(const exec_aten::Tensor& logits_tensor) { } uint64_t Runner::prefill(std::vector& tokens) { - ManagedTensor input_tokens( + auto result = module_->forward(from_blob( tokens.data(), {1, 
static_cast(tokens.size())}, - ScalarType::Long); - std::vector inputs = {input_tokens.get_aliasing_tensor()}; - - auto result = module_->forward(inputs); + ScalarType::Long)); ET_CHECK_MSG(result.error() == Error::Ok, "Failed to prefill tokens"); return logits_to_token(result.get()[0].toTensor()); } uint64_t Runner::run_model_step(uint64_t token) { - ManagedTensor input_token(&token, {1, 1}, ScalarType::Long); - std::vector inputs = {input_token.get_aliasing_tensor()}; - - auto result = module_->forward(inputs); + auto result = module_->forward(from_blob(&token, {1, 1}, ScalarType::Long)); ET_CHECK_MSG( result.error() == Error::Ok, "Failed to run forward() for token %" PRIu64, diff --git a/examples/models/test/test_export.py b/examples/models/test/test_export.py index f258cc2139..b3030c24fe 100644 --- a/examples/models/test/test_export.py +++ b/examples/models/test/test_export.py @@ -29,7 +29,7 @@ def collect_executorch_and_eager_outputs( Returns a tuple containing the outputs of the eager mode model and the executorch mode model. 
""" eager_model = eager_model.eval() - model = torch._export.capture_pre_autograd_graph(eager_model, example_inputs) + model = torch.export.export_for_training(eager_model, example_inputs).module() edge_model = export_to_edge(model, example_inputs) executorch_prog = edge_model.to_executorch() diff --git a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt index 006e0f7517..9799508633 100644 --- a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt @@ -28,6 +28,7 @@ target_link_libraries( full_portable_ops_lib extension_data_loader extension_module + extension_tensor gflags re2::re2 ) diff --git a/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp index 7340672c9e..599accfd1e 100644 --- a/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp index d452336175..0ccaefa79e 100644 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp @@ -13,9 +13,9 @@ #include #include #include -#include #include #include +#include #include #include @@ -26,6 +26,7 @@ namespace torch { namespace executor { namespace { +using namespace executorch::extension; static constexpr auto kTopp = 0.9f; void printReport(const Runner::Stats& stats); std::string statsToJsonString(const Runner::Stats& stats); @@ -136,32 +137,30 @@ int32_t Runner::logitsToToken(const exec_aten::Tensor& logits_tensor) { // step. Returning the logits tensor. 
Result Runner::run_model_step( int64_t input_token, - Tensor& token, - Tensor& start_pos, - Tensor& atten_mask, - std::vector& kv_tensors, - std::vector& kv_outputs) { - token.mutable_data_ptr()[0] = input_token; + TensorPtr& token, + TensorPtr& start_pos, + TensorPtr& atten_mask, + std::vector& kv_tensors, + std::vector& kv_outputs) { + token->mutable_data_ptr()[0] = input_token; // inputs:[tokens, start_pos, atten_mask, k_cache, v_cache] - std::vector inputs = {token, start_pos, atten_mask}; - inputs.insert(inputs.end(), kv_tensors.begin(), kv_tensors.end()); - Result> outputs_res = module_->forward(inputs); + auto outputs_res = module_->forward({*token, *start_pos, *atten_mask}); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); // TODO: need to handle batch size != 1 - size_t v_offset = kv_outputs[0].nbytes(); - size_t el_size = kv_outputs[0].element_size(); + size_t v_offset = kv_outputs[0]->nbytes(); + size_t el_size = kv_outputs[0]->element_size(); size_t k_input_step = (max_seq_len_ - 1) * el_size; int k_tensors_end = kv_tensors.size() / 2; // update k caches for (int j = 0; j < k_tensors_end; ++j) { uint8_t* input_addr = - static_cast(kv_tensors[j].mutable_data_ptr()); + static_cast(kv_tensors[j]->mutable_data_ptr()); uint8_t* output_addr = - static_cast(kv_outputs[j].mutable_data_ptr()); + static_cast(kv_outputs[j]->mutable_data_ptr()); // fill the output k values back - for (int src = 0, dst = k_input_step; src < kv_outputs[j].nbytes(); + for (int src = 0, dst = k_input_step; src < kv_outputs[j]->nbytes(); src += el_size, dst += k_input_step) { input_addr[dst] = output_addr[src]; } @@ -169,7 +168,7 @@ Result Runner::run_model_step( // inputs ET_CHECK_MSG( internal::set_tensor_data( - kv_tensors[j], new_inp_addr, kv_tensors[j].nbytes()) == Error::Ok, + *kv_tensors[j], new_inp_addr, kv_tensors[j]->nbytes()) == Error::Ok, "Failed to set input tensor when updating k_cache"); } // update v caches @@ -179,25 +178,25 @@ Result Runner::run_model_step( 
ET_CHECK_MSG( internal::set_tensor_data( - kv_tensors[j], new_inp_addr, kv_tensors[j].nbytes()) == Error::Ok, + *kv_tensors[j], new_inp_addr, kv_tensors[j]->nbytes()) == Error::Ok, "Failed to set input tensor when updating v_cache"); // outputs char* new_out_addr = io_mem_mgr_.update_v_caches_write(v_idx, v_offset); ET_CHECK_MSG( internal::set_tensor_data( - kv_outputs[j], new_out_addr, kv_outputs[j].nbytes()) == Error::Ok, + *kv_outputs[j], new_out_addr, kv_outputs[j]->nbytes()) == Error::Ok, "Failed to set output tensor when updating v_cache"); ET_CHECK_MSG( - module_->set_output_data_ptr(kv_outputs[j], j + 1) == Error::Ok, + module_->set_output_data_ptr(*kv_outputs[j], j + 1) == Error::Ok, "Failed to set llama output data pointer"); } // Bump start_pos by 1 - start_pos.mutable_data_ptr()[0]++; + start_pos->mutable_data_ptr()[0]++; // update atten_mask - atten_mask.mutable_data_ptr() - [atten_mask.numel() - 1 - start_pos.const_data_ptr()[0]] = 0; + atten_mask->mutable_data_ptr() + [atten_mask->numel() - 1 - start_pos->const_data_ptr()[0]] = 0; return outputs_res.get()[0].toTensor(); } // TODO: add overloaded method for on-device tokenize @@ -253,19 +252,14 @@ Error Runner::generate( std::vector hidden_states_data_shape = {1, 1, dim_}; // initialize tensor wrappers - ManagedTensor managed_token( + auto token = from_blob( io_mem_mgr_.get_input_token_ptr(), token_shape, ScalarType::Int); - ManagedTensor managed_pos_id( + auto start_pos = from_blob( io_mem_mgr_.get_pos_idx_ptr(), start_pos_shape, ScalarType::Int); - ManagedTensor managed_atten_mask( + auto atten_mask = from_blob( io_mem_mgr_.get_atten_mask_ptr(), atten_mask_shape, ScalarType::Float); - Tensor token = managed_token.get_aliasing_tensor(); - Tensor atten_mask = managed_atten_mask.get_aliasing_tensor(); - Tensor start_pos = managed_pos_id.get_aliasing_tensor(); - - std::vector managed_kv_inputs, managed_kv_outputs; - std::vector kv_tensors, kv_outputs; + std::vector kv_tensors, kv_outputs; Result 
method_meta = get_method_meta(); size_t num_inputs = method_meta->num_inputs(); @@ -282,22 +276,20 @@ Error Runner::generate( auto tensor_shape = tensor_meta->sizes(); std::vector sizes( tensor_shape.data(), tensor_shape.data() + tensor_shape.size()); - managed_kv_inputs.emplace_back(ManagedTensor( + kv_tensors.emplace_back(from_blob( io_mem_mgr_.get_k_caches_read_ptr(i), sizes, tensor_meta->scalar_type())); - kv_tensors.emplace_back(managed_kv_inputs.back().get_aliasing_tensor()); // outpus Result out_tensor_meta = method_meta->output_tensor_meta(i + 1); tensor_shape = out_tensor_meta->sizes(); sizes = std::vector{ tensor_shape.data(), tensor_shape.data() + tensor_shape.size()}; - managed_kv_outputs.emplace_back(ManagedTensor( + kv_outputs.emplace_back(from_blob( io_mem_mgr_.get_k_caches_write_ptr(i), sizes, - kv_tensors.back().scalar_type())); - kv_outputs.emplace_back(managed_kv_outputs.back().get_aliasing_tensor()); + kv_tensors.back()->scalar_type())); ET_CHECK_MSG( module_->set_output_data_ptr(kv_outputs.back(), i + 1) == Error::Ok, "Failed to set output tensor for kv cache"); @@ -314,11 +306,10 @@ Error Runner::generate( std::vector sizes( tensor_shape.data(), tensor_shape.data() + tensor_shape.size()); - managed_kv_inputs.emplace_back(ManagedTensor( + kv_tensors.emplace_back(from_blob( io_mem_mgr_.get_v_caches_read_ptr(i), sizes, tensor_meta->scalar_type())); - kv_tensors.push_back(managed_kv_inputs.back().get_aliasing_tensor()); // outputs Result out_tensor_meta = @@ -327,22 +318,20 @@ Error Runner::generate( sizes = std::vector{ tensor_shape.data(), tensor_shape.data() + tensor_shape.size()}; - managed_kv_outputs.push_back(ManagedTensor( + kv_outputs.push_back(from_blob( io_mem_mgr_.get_v_caches_write_ptr(i), sizes, - kv_tensors.back().scalar_type())); - kv_outputs.push_back(managed_kv_outputs.back().get_aliasing_tensor()); + kv_tensors.back()->scalar_type())); ET_CHECK_MSG( module_->set_output_data_ptr(kv_outputs.back(), output_index) == Error::Ok, 
"Failed to set output tensor for llama block"); } - ManagedTensor affine_managed_logits( + auto affine_logits = from_blob( reinterpret_cast(io_mem_mgr_.get_logit_ptr()), logits_data_shape, ScalarType::Float); - Tensor affine_logits = affine_managed_logits.get_aliasing_tensor(); ET_CHECK_MSG( module_->set_output_data_ptr(affine_logits, 0) == Error::Ok, "Failed to set output tensor for affine module - logits"); @@ -351,7 +340,7 @@ Error Runner::generate( std::string final_output; while (pos < seq_len - 1) { // Run the model - Result logits_res = run_model_step( + auto logits_res = run_model_step( cur_token, token, start_pos, atten_mask, kv_tensors, kv_outputs); if (pos == num_prompt_tokens) { stats_.first_token_ms = util::time_in_ms(); diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.h b/examples/qualcomm/oss_scripts/llama2/runner/runner.h index cdbb2cdd2e..1c35c821ce 100644 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama2/runner/runner.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include class RpcMemAllocator { public: @@ -248,13 +248,13 @@ class Runner { T getMetadataHelper(std::string method_name, T default_val); template int32_t logitsToToken(const exec_aten::Tensor& logits_tensor); - Result run_model_step( + Result run_model_step( int64_t input_token, - Tensor& token, - Tensor& start_pos, - Tensor& atten_mask, - std::vector& kv_tensors, - std::vector& kv_outputs); + ::executorch::extension::TensorPtr& token, + ::executorch::extension::TensorPtr& start_pos, + ::executorch::extension::TensorPtr& atten_mask, + std::vector<::executorch::extension::TensorPtr>& kv_tensors, + std::vector<::executorch::extension::TensorPtr>& kv_outputs); // metadata int32_t vocab_size_; int64_t bos_id_; diff --git a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt index c1fd5dc653..1a9406ca95 100644 --- 
a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt @@ -35,6 +35,7 @@ target_link_libraries( executorch_no_prim_ops extension_data_loader extension_module + extension_tensor gflags re2::re2 ) @@ -89,6 +90,7 @@ target_link_libraries( executorch_no_prim_ops extension_data_loader extension_module + extension_tensor gflags re2::re2 ) diff --git a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp index 49782cf878..d69aa0aa7a 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp index aae18434c6..9d06e8118d 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp index ec13cec37c..d6d9911293 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h index b9849a2132..bd24ea6beb 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h @@ -21,7 +21,6 @@ #include #include #include -#include namespace torch { namespace executor { diff 
--git a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt index e6af95595b..c59cea32b9 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt @@ -24,6 +24,7 @@ target_link_libraries( executorch_no_prim_ops extension_data_loader extension_module + extension_tensor gflags re2::re2 ) diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp index 3d3d99d707..b6c211d8ac 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include @@ -22,6 +22,8 @@ #include #include +using namespace ::executorch::extension; + namespace torch { namespace executor { @@ -350,31 +352,27 @@ Error Runner::generate(std::string prompt) { MethodMeta encoder_method_meta = method_metas[0].get(); // Initialize text_encoder input tensors: cond/uncond tokenized_input[1,77] - ManagedTensor managed_cond_tokens( + auto cond_tokens_tensor = from_blob( cond_tokens.data(), {1, 77}, encoder_method_meta.input_tensor_meta(0)->scalar_type()); - ManagedTensor managed_uncond_tokens( + auto uncond_tokens_tensor = from_blob( uncond_tokens.data(), {1, 77}, encoder_method_meta.input_tensor_meta(0)->scalar_type()); - Tensor cond_tokens_tensor = managed_cond_tokens.get_aliasing_tensor(); - Tensor uncond_tokens_tensor = managed_uncond_tokens.get_aliasing_tensor(); // Initialize text_encoder output tensors: cond/uncond embedding[1, 77, 1024] constexpr int emb_size = 1 * 77 * 1024; std::vector cond_emb_vec(emb_size); std::vector uncond_emb_vec(emb_size); std::vector fp_emb_vec(emb_size); - ManagedTensor managed_cond_emb( + auto cond_emb_tensor = from_blob( cond_emb_vec.data(), {1, 77, 1024}, 
encoder_method_meta.output_tensor_meta(0)->scalar_type()); - ManagedTensor managed_uncond_emb( + auto uncond_emb_tensor = from_blob( uncond_emb_vec.data(), {1, 77, 1024}, encoder_method_meta.output_tensor_meta(0)->scalar_type()); - Tensor cond_emb_tensor = managed_cond_emb.get_aliasing_tensor(); - Tensor uncond_emb_tensor = managed_uncond_emb.get_aliasing_tensor(); modules_[0]->set_output_data_ptr(cond_emb_tensor, 0); long encoder_start = util::time_in_ms(); auto cond_res = modules_[0]->forward(cond_tokens_tensor); @@ -403,22 +401,17 @@ Error Runner::generate(std::string prompt) { // 3. cond/uncond embedding[1,77,1024] std::vector latent_model_input(latent.size()); std::vector fp_latent_model_input(latent.size()); - ManagedTensor managed_latent( + auto latent_tensor = from_blob( latent_model_input.data(), {1, 64, 64, 4}, unet_method_meta.input_tensor_meta(0)->scalar_type()); - Tensor latent_tensor = managed_latent.get_aliasing_tensor(); - std::vector managed_time_emb_tensors; - std::vector time_emb_tensors; - managed_time_emb_tensors.reserve(num_time_steps_); + std::vector time_emb_tensors; time_emb_tensors.reserve(num_time_steps_); - for (int step_index = 0; step_index < num_time_steps_; step_index++) { - managed_time_emb_tensors.emplace_back(ManagedTensor( + for (auto step_index = 0; step_index < num_time_steps_; step_index++) { + time_emb_tensors.emplace_back(from_blob( time_emb_list_[step_index].data(), {1, 1280}, unet_method_meta.input_tensor_meta(1)->scalar_type())); - time_emb_tensors.emplace_back( - managed_time_emb_tensors.back().get_aliasing_tensor()); } // requantize text encoders output dequant_tensor( @@ -447,17 +440,14 @@ Error Runner::generate(std::string prompt) { std::vector noise_pred_uncond(latent.size()); std::vector fp_noise_pred_text(noise_pred_text.size()); std::vector fp_noise_pred_uncond(noise_pred_uncond.size()); - ManagedTensor managed_noise_pred_text( + auto noise_pred_text_tensor = from_blob( noise_pred_text.data(), {1, 64, 64, 4}, 
unet_method_meta.output_tensor_meta(0)->scalar_type()); - Tensor noise_pred_text_tensor = managed_noise_pred_text.get_aliasing_tensor(); - ManagedTensor managed_noise_pred_uncond( + auto noise_pred_uncond_tensor = from_blob( noise_pred_uncond.data(), {1, 64, 64, 4}, unet_method_meta.output_tensor_meta(0)->scalar_type()); - Tensor noise_pred_uncond_tensor = - managed_noise_pred_uncond.get_aliasing_tensor(); // Execute unet for (int step_index = 0; step_index < num_time_steps_; step_index++) { @@ -514,20 +504,18 @@ Error Runner::generate(std::string prompt) { MethodMeta vae_method_meta = method_metas[2].get(); // Initialize vae input tensor : latent[1,64,64,4] std::vector vae_input(latent.size()); - ManagedTensor managed_vae_input( + auto vae_input_tensor = from_blob( vae_input.data(), {1, 64, 64, 4}, vae_method_meta.input_tensor_meta(0)->scalar_type()); - Tensor vae_input_tensor = managed_vae_input.get_aliasing_tensor(); // Intialize vae output tensor: output[1,512,512,3] constexpr int image_size = 1 * 512 * 512 * 3; std::vector q_out(image_size); std::vector out(image_size); - ManagedTensor managed_output( + auto output_tensor = from_blob( q_out.data(), {1, 512, 512, 3}, vae_method_meta.output_tensor_meta(0)->scalar_type()); - Tensor output_tensor = managed_output.get_aliasing_tensor(); quant_tensor(latent, vae_input, vae_input_scale_, vae_input_offset_); diff --git a/examples/xnnpack/README.md b/examples/xnnpack/README.md index 61c14b5c7e..dcd5b9c5d7 100644 --- a/examples/xnnpack/README.md +++ b/examples/xnnpack/README.md @@ -38,9 +38,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . 
@@ -92,9 +93,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py index 24ee6bd21a..2256d5fcc9 100644 --- a/exir/_serialize/_program.py +++ b/exir/_serialize/_program.py @@ -347,7 +347,6 @@ def serialize_pte_binary( *, mutable_data: Optional[List[Buffer]] = None, extract_delegate_segments: bool = False, - extract_constant_segment: bool = True, segment_alignment: int = 128, constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, @@ -363,8 +362,6 @@ def serialize_pte_binary( and the starting segment offset. - Update the Program.segments field with the offsets and lengths of each segment. - extract_constant_segment: Whether to move the constant data from the Program - into a separate segment. segment_alignment: Alignment in bytes. The starting offset of each segment will be aligned to this value in the output data. constant_tensor_alignment: The minimum alignment of tensor @@ -387,19 +384,23 @@ def serialize_pte_binary( # Store extracted segment data; this may be constant data or delegate data. segments: List[Cord] = [] - if extract_constant_segment: - constant_segment_data, constant_segment_offsets = _extract_constant_segment( - program.constant_buffer, tensor_alignment=constant_tensor_alignment + constant_segment_data, constant_segment_offsets = _extract_constant_segment( + program.constant_buffer, tensor_alignment=constant_tensor_alignment + ) + + # If there are no constants, len(constant_segment_data) = 0. However, there may + # be non-constants, in which case len(constant_segment_offsets) = 1, containing + # the placeholder value 0. 
Ensure the placeholder value is put into + # program.constant_segment.offsets. + if len(constant_segment_offsets) > 0: + # Update program.constant_segment with constant subsegment offset information. + program.constant_segment = SubsegmentOffsets( + segment_index=len(segments), offsets=constant_segment_offsets ) - if len(constant_segment_data) > 0: - # Update program.constant_segment with constant subsegment offset information. - program.constant_segment = SubsegmentOffsets( - segment_index=len(segments), offsets=constant_segment_offsets - ) - # Clear the constant buffer, as constant data will be stored in segments. - program.constant_buffer = [] - # Add to the aggregate segments cord. - segments.append(constant_segment_data) + # Clear the constant buffer, as constant data will be stored in segments. + program.constant_buffer = [] + # Add to the aggregate segments cord. + segments.append(constant_segment_data) if mutable_data is not None: mutable_segment_data, mutable_segment_offsets = _extract_constant_segment( diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py index c4f4df0d0b..afd8e3d282 100644 --- a/exir/_serialize/test/test_program.py +++ b/exir/_serialize/test/test_program.py @@ -583,6 +583,33 @@ def test_round_trip_with_segments(self) -> None: program2 = deserialize_pte_binary(pte_data) self.assert_programs_equal(program, program2) + def test_no_constants(self) -> None: + program = get_test_program() + # Insert placeholder for non-const tensors. + add_constant_data(program, [b""]) + + pte_data = bytes( + serialize_pte_binary( + program, + extract_delegate_segments=True, + segment_alignment=SEGMENT_ALIGNMENT, + constant_tensor_alignment=CONSTANT_TENSOR_ALIGNMENT, + ) + ) + # The input Program should not be modified. + self.assertEqual(program.segments, []) + + # Peek inside the actual flatbuffer data to see the segments. 
+ flatbuffer_program = _json_to_program(_program_flatbuffer_to_json(pte_data)) + + # Constant buffer should be empty. + self.assertEqual(len(flatbuffer_program.constant_buffer), 0) + + # Constant segment should contain the placeholder. + self.assertEqual(flatbuffer_program.constant_segment.segment_index, 0) + self.assertEqual(len(flatbuffer_program.constant_segment.offsets), 1) + self.assertEqual(flatbuffer_program.constant_segment.offsets[0], 0) + def test_unused_inline_delegate_blobs_with_segments(self) -> None: # Create a program with some delegate data blobs. program = get_test_program() diff --git a/exir/capture/_config.py b/exir/capture/_config.py index 7b91464bdc..2d0a6c4ca8 100644 --- a/exir/capture/_config.py +++ b/exir/capture/_config.py @@ -65,12 +65,6 @@ class ExecutorchBackendConfig: # This makes it possible to free those blobs at runtime. extract_delegate_segments: bool = True - # Whether to extract constants from the Program into separate segments, - # rather than encoding those constants in the flatbuffer data. - # This reduces the memory overhead of creating the .pte file for models with - # large constant data. - extract_constant_segment: bool = True - # When extracting segments, the starting offset of each segment will be # aligned to this value (in bytes). Must be a power of two. 
segment_alignment: int = 128 diff --git a/exir/program/_program.py b/exir/program/_program.py index 849eae4f6f..1339760f21 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -439,7 +439,6 @@ def to_executorch( new_prog, emit_stacktrace=config.emit_stacktrace, extract_delegate_segments=config.extract_delegate_segments, - extract_constant_segment=config.extract_constant_segment, segment_alignment=config.segment_alignment, constant_tensor_alignment=config.constant_tensor_alignment, delegate_alignment=config.delegate_alignment, @@ -468,7 +467,6 @@ def __init__( exir_exported_program: ExirExportedProgram, emit_stacktrace: bool, extract_delegate_segments: bool, - extract_constant_segment: bool, segment_alignment: int, constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, @@ -483,7 +481,6 @@ def __init__( self._emitter_output: Optional[EmitterOutput] = None self._emit_stacktrace: bool = emit_stacktrace self._extract_delegate_segments: bool = extract_delegate_segments - self._extract_constant_segment: bool = extract_constant_segment self._segment_alignment: int = segment_alignment self._constant_tensor_alignment: Optional[int] = constant_tensor_alignment self._delegate_alignment: Optional[int] = delegate_alignment @@ -493,7 +490,6 @@ def _get_pte_data(self) -> Cord: self._pte_data = _serialize_pte_binary( program=self.program, extract_delegate_segments=self._extract_delegate_segments, - extract_constant_segment=self._extract_constant_segment, segment_alignment=self._segment_alignment, constant_tensor_alignment=self._constant_tensor_alignment, delegate_alignment=self._delegate_alignment, @@ -1351,7 +1347,6 @@ def __init__( program=self._emitter_output.program, mutable_data=self._emitter_output.mutable_data, extract_delegate_segments=backend_config.extract_delegate_segments, - extract_constant_segment=backend_config.extract_constant_segment, segment_alignment=backend_config.segment_alignment, 
constant_tensor_alignment=backend_config.constant_tensor_alignment, delegate_alignment=backend_config.delegate_alignment, diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 6827ae7904..74f9896000 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -39,6 +39,7 @@ list( extension_data_loader extension_module extension_runner_util + extension_tensor extension_threadpool fbjni ) diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK index 7afd9f8a94..7cdf8ef7ec 100644 --- a/extension/android/jni/BUCK +++ b/extension/android/jni/BUCK @@ -40,7 +40,7 @@ fb_android_cxx_library( "//third-party/glog:glog", "//xplat/executorch/extension/module:module_static", "//xplat/executorch/extension/runner_util:inputs_static", - "//xplat/executorch/extension/runner_util:managed_tensor_static", + "//xplat/executorch/extension/tensor:tensor_static", ], ) @@ -64,7 +64,7 @@ fb_android_cxx_library( "//xplat/executorch/backends/xnnpack:xnnpack_backend_static", "//xplat/executorch/extension/module:module_static", "//xplat/executorch/extension/runner_util:inputs_static", - "//xplat/executorch/extension/runner_util:managed_tensor_static", + "//xplat/executorch/extension/tensor:tensor_static", ], ) diff --git a/extension/android/jni/jni_layer.cpp b/extension/android/jni/jni_layer.cpp index 79c6ebc516..ef74d6480b 100644 --- a/extension/android/jni/jni_layer.cpp +++ b/extension/android/jni/jni_layer.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include @@ -55,6 +55,7 @@ void et_pal_emit_log_message( } #endif +using namespace executorch::extension; using namespace torch::executor; namespace executorch::extension { @@ -167,7 +168,7 @@ class JEValue : public facebook::jni::JavaClass { evalue.tag); } - static ManagedTensor JEValueToTensorImpl( + static TensorPtr JEValueToTensorImpl( facebook::jni::alias_ref JEValue) { static const auto typeCodeField = 
JEValue::javaClassStatic()->getField("mTypeCode"); @@ -221,7 +222,7 @@ class JEValue : public facebook::jni::JavaClass { numel, dataCapacity); } - return ManagedTensor( + return from_blob( jni->GetDirectBufferAddress(jbuffer.get()), shape_vec, scalar_type); } facebook::jni::throwNewJavaException( @@ -293,9 +294,8 @@ class ExecuTorchJni : public facebook::jni::HybridClass { facebook::jni::alias_ref< facebook::jni::JArrayClass::javaobject> jinputs) { - std::vector evalues = {}; - - std::vector managed_tensors = {}; + std::vector evalues; + std::vector tensors; static const auto typeCodeField = JEValue::javaClassStatic()->getField("mTypeCode"); @@ -304,18 +304,17 @@ class ExecuTorchJni : public facebook::jni::HybridClass { auto jevalue = jinputs->getElement(i); const auto typeCode = jevalue->getFieldValue(typeCodeField); if (typeCode == JEValue::kTypeCodeTensor) { - managed_tensors.emplace_back(JEValue::JEValueToTensorImpl(jevalue)); - evalues.emplace_back( - EValue(managed_tensors.back().get_aliasing_tensor())); + tensors.emplace_back(JEValue::JEValueToTensorImpl(jevalue)); + evalues.emplace_back(tensors.back()); } else if (typeCode == JEValue::kTypeCodeInt) { int64_t value = jevalue->getFieldValue(typeCodeField); - evalues.emplace_back(EValue(value)); + evalues.emplace_back(value); } else if (typeCode == JEValue::kTypeCodeDouble) { double value = jevalue->getFieldValue(typeCodeField); - evalues.emplace_back(EValue(value)); + evalues.emplace_back(value); } else if (typeCode == JEValue::kTypeCodeBool) { bool value = jevalue->getFieldValue(typeCodeField); - evalues.emplace_back(EValue(value)); + evalues.emplace_back(value); } } diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index dda9ece589..0d43317c3c 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -150,6 +150,7 @@ class ExecuTorchLlamaJni jint channels, facebook::jni::alias_ref prompt, jint seq_len, + 
jboolean echo, facebook::jni::alias_ref callback) { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { auto image_size = image->size(); @@ -175,11 +176,92 @@ class ExecuTorchLlamaJni prompt->toStdString(), seq_len, [callback](std::string result) { callback->onResult(result); }, - [callback](const Stats& result) { callback->onStats(result); }); + [callback](const Stats& result) { callback->onStats(result); }, + echo); } return 0; } + // Returns a tuple of (error, start_pos) + // Contract is valid within an AAR (JNI + corresponding Java code) + // If the first element is not Error::Ok, the other element is undefined. + facebook::jni::local_ref prefill_prompt( + facebook::jni::alias_ref prompt, + jlong start_pos, + jint bos, + jint eos) { + facebook::jni::local_ref tuple_result = + facebook::jni::make_long_array(2); + if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { + tuple_result->pin()[0] = static_cast(Error::NotSupported); + return tuple_result; + } + + auto&& result = multi_modal_runner_->prefill_prompt( + prompt->toStdString(), start_pos, bos, eos); + tuple_result->pin()[0] = static_cast(Error::Ok); + if (result.ok()) { + tuple_result->pin()[1] = static_cast(start_pos); + } + return tuple_result; + } + + // Returns a tuple of (error, start_pos) + // Contract is valid within an AAR (JNI + corresponding Java code) + // If the first element is not Error::Ok, the other element is undefined. 
+ + facebook::jni::local_ref prefill_images( + facebook::jni::alias_ref image, + jint width, + jint height, + jint channels, + jlong start_pos) { + facebook::jni::local_ref tuple_result = + facebook::jni::make_long_array(2); + + if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { + tuple_result->pin()[0] = static_cast(Error::NotSupported); + return tuple_result; + } + + auto image_size = image->size(); + std::vector images; + if (image_size != 0) { + std::vector image_data_jint(image_size); + std::vector image_data(image_size); + image->getRegion(0, image_size, image_data_jint.data()); + for (int i = 0; i < image_size; i++) { + image_data[i] = image_data_jint[i]; + } + Image image_runner{image_data, width, height, channels}; + images.push_back(image_runner); + } + // TODO(hsz): make start_pos a reference and update it here + jint result = static_cast( + multi_modal_runner_->prefill_images(images, start_pos)); + tuple_result->pin()[0] = result; + tuple_result->pin()[1] = static_cast(start_pos); + return tuple_result; + } + + jint generate_from_pos( + facebook::jni::alias_ref prompt, + jint seq_len, + jlong start_pos, + facebook::jni::alias_ref callback) { + if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { + return static_cast(Error::NotSupported); + } + return static_cast(multi_modal_runner_->generate_from_pos( + prompt->toStdString(), + seq_len, + start_pos, + [callback](const std::string& result) { callback->onResult(result); }, + [callback](const ::executorch::extension::llm::Stats& stats) { + callback->onStats(stats); + })); + } + void stop() { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { multi_modal_runner_->stop(); diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java index bdc8506aa9..c4de23df0e 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java +++ 
b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java @@ -33,6 +33,7 @@ public class LlamaModule { private final HybridData mHybridData; private static final int DEFAULT_SEQ_LEN = 128; + private static final boolean DEFAULT_ECHO = true; @DoNotStrip private static native HybridData initHybrid( @@ -59,7 +60,7 @@ public void resetNative() { * @param llamaCallback callback object to receive results. */ public int generate(String prompt, LlamaCallback llamaCallback) { - return generate(prompt, DEFAULT_SEQ_LEN, llamaCallback); + return generate(prompt, DEFAULT_SEQ_LEN, DEFAULT_ECHO, llamaCallback); } /** @@ -70,7 +71,30 @@ public int generate(String prompt, LlamaCallback llamaCallback) { * @param llamaCallback callback object to receive results. */ public int generate(String prompt, int seqLen, LlamaCallback llamaCallback) { - return generate(null, 0, 0, 0, prompt, seqLen, llamaCallback); + return generate(null, 0, 0, 0, prompt, seqLen, DEFAULT_ECHO, llamaCallback); + } + + /** + * Start generating tokens from the module. + * + * @param prompt Input prompt + * @param echo indicate whether to echo the input prompt or not (text completion vs chat) + * @param llamaCallback callback object to receive results. + */ + public int generate(String prompt, boolean echo, LlamaCallback llamaCallback) { + return generate(null, 0, 0, 0, prompt, DEFAULT_SEQ_LEN, echo, llamaCallback); + } + + /** + * Start generating tokens from the module. + * + * @param prompt Input prompt + * @param seqLen sequence length + * @param echo indicate whether to echo the input prompt or not (text completion vs chat) + * @param llamaCallback callback object to receive results. 
+ */ + public int generate(String prompt, int seqLen, boolean echo, LlamaCallback llamaCallback) { + return generate(null, 0, 0, 0, prompt, seqLen, echo, llamaCallback); } /** @@ -82,6 +106,7 @@ public int generate(String prompt, int seqLen, LlamaCallback llamaCallback) { * @param channels Input image number of channels * @param prompt Input prompt * @param seqLen sequence length + * @param echo indicate whether to echo the input prompt or not (text completion vs chat) * @param llamaCallback callback object to receive results. */ @DoNotStrip @@ -92,8 +117,66 @@ public native int generate( int channels, String prompt, int seqLen, + boolean echo, LlamaCallback llamaCallback); + /** + * Prefill an LLaVA Module with the given images input. + * + * @param image Input image as a byte array + * @param width Input image width + * @param height Input image height + * @param channels Input image number of channels + * @param startPos The starting position in KV cache of the input in the LLM. + * @return The updated starting position in KV cache of the input in the LLM. + * @throws RuntimeException if the prefill failed + */ + public long prefillImages(int[] image, int width, int height, int channels, long startPos) { + long[] nativeResult = prefillImagesNative(image, width, height, channels, startPos); + if (nativeResult[0] != 0) { + throw new RuntimeException("Prefill failed with error code: " + nativeResult[0]); + } + return nativeResult[1]; + } + + // returns a tuple of (status, updated startPos) + private native long[] prefillImagesNative( + int[] image, int width, int height, int channels, long startPos); + + /** + * Prefill an LLaVA Module with the given text input. + * + * @param prompt The text prompt to LLaVA. + * @param startPos The starting position in KV cache of the input in the LLM. It's passed as + * reference and will be updated inside this function. + * @param bos The number of BOS (begin of sequence) token. 
+ * @param eos The number of EOS (end of sequence) token. + * @return The updated starting position in KV cache of the input in the LLM. + * @throws RuntimeException if the prefill failed + */ + public long prefillPrompt(String prompt, long startPos, int bos, int eos) { + long[] nativeResult = prefillPromptNative(prompt, startPos, bos, eos); + if (nativeResult[0] != 0) { + throw new RuntimeException("Prefill failed with error code: " + nativeResult[0]); + } + return nativeResult[1]; + } + + // returns a tuple of (status, updated startPos) + private native long[] prefillPromptNative(String prompt, long startPos, int bos, int eos); + + /** + * Generate tokens from the given prompt, starting from the given position. + * + * @param prompt The text prompt to LLaVA. + * @param seqLen The total sequence length, including the prompt tokens and new tokens. + * @param startPos The starting position in KV cache of the input in the LLM. + * @param llamaCallback callback object to receive results. + * @return The error code. + */ + public native int generateFromPos( + String prompt, int seqLen, long startPos, LlamaCallback callback); + /** Stop current generate() before it finishes. */ @DoNotStrip public native void stop(); diff --git a/extension/apple/Benchmark/App/App.entitlements b/extension/apple/Benchmark/App/App.entitlements new file mode 100644 index 0000000000..e461e7f22f --- /dev/null +++ b/extension/apple/Benchmark/App/App.entitlements @@ -0,0 +1,12 @@ + + + + + com.apple.security.app-sandbox + + com.apple.security.files.user-selected.read-only + + com.apple.developer.kernel.increased-memory-limit + + + diff --git a/extension/apple/Benchmark/App/App.swift b/extension/apple/Benchmark/App/App.swift new file mode 100644 index 0000000000..30fbd221dc --- /dev/null +++ b/extension/apple/Benchmark/App/App.swift @@ -0,0 +1,16 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import SwiftUI + +@main +struct BenchmarkApp: App { + var body: some Scene { + WindowGroup {} + } +} diff --git a/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj new file mode 100644 index 0000000000..4dcffaffbf --- /dev/null +++ b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -0,0 +1,535 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + 03B2D3682C8A515A0046936E /* App.swift in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3672C8A515A0046936E /* App.swift */; }; + 03B2D37A2C8A515C0046936E /* Tests.mm in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3792C8A515C0046936E /* Tests.mm */; }; + 03C7FA382C8AA3EC00E6E9AE /* Models in Resources */ = {isa = PBXBuildFile; fileRef = 03C7FA322C8AA24200E6E9AE /* Models */; }; + 03ED6CFF2C8AAFB300F2D6EE /* backend_coreml.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */; }; + 03ED6D012C8AAFB300F2D6EE /* backend_mps.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */; }; + 03ED6D032C8AAFB300F2D6EE /* backend_xnnpack.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */; }; + 03ED6D052C8AAFB300F2D6EE /* executorch.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */; }; + 03ED6D072C8AAFB300F2D6EE /* kernels_custom.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */; }; + 03ED6D092C8AAFB300F2D6EE /* kernels_optimized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 
03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */; }; + 03ED6D0B2C8AAFB300F2D6EE /* kernels_portable.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */; }; + 03ED6D0D2C8AAFB300F2D6EE /* kernels_quantized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */; }; + 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */; }; + 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */; }; + 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */; }; + 03ED6D152C8AAFFF00F2D6EE /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D142C8AAFFF00F2D6EE /* Metal.framework */; }; + 03ED6D172C8AB00500F2D6EE /* CoreML.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D162C8AB00500F2D6EE /* CoreML.framework */; }; + 03ED6D192C8AB00A00F2D6EE /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D182C8AB00A00F2D6EE /* Accelerate.framework */; }; +/* End PBXBuildFile section */ + +/* Begin PBXContainerItemProxy section */ + 03B2D3762C8A515C0046936E /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 03B2D35C2C8A515A0046936E /* Project object */; + proxyType = 1; + remoteGlobalIDString = 03B2D3632C8A515A0046936E; + remoteInfo = Benchmark; + }; +/* End PBXContainerItemProxy section */ + +/* Begin PBXFileReference section */ + 037C96A02C8A570B00B3DF38 /* Tests.xctestplan */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = Tests.xctestplan; sourceTree = ""; }; + 03B019502C8A80D30044D558 
/* Tests.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = Tests.xcconfig; sourceTree = ""; }; + 03B2D3642C8A515A0046936E /* Benchmark.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Benchmark.app; sourceTree = BUILT_PRODUCTS_DIR; }; + 03B2D3672C8A515A0046936E /* App.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = App.swift; sourceTree = ""; }; + 03B2D36D2C8A515B0046936E /* App.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = App.entitlements; sourceTree = ""; }; + 03B2D3752C8A515C0046936E /* Tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = Tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; + 03B2D3792C8A515C0046936E /* Tests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = Tests.mm; sourceTree = ""; }; + 03C7FA322C8AA24200E6E9AE /* Models */ = {isa = PBXFileReference; lastKnownFileType = folder; path = Models; sourceTree = SOURCE_ROOT; }; + 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_coreml.xcframework; path = Frameworks/backend_coreml.xcframework; sourceTree = ""; }; + 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_mps.xcframework; path = Frameworks/backend_mps.xcframework; sourceTree = ""; }; + 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_xnnpack.xcframework; path = Frameworks/backend_xnnpack.xcframework; sourceTree = ""; }; + 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = executorch.xcframework; path = 
Frameworks/executorch.xcframework; sourceTree = ""; }; + 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_custom.xcframework; path = Frameworks/kernels_custom.xcframework; sourceTree = ""; }; + 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_optimized.xcframework; path = Frameworks/kernels_optimized.xcframework; sourceTree = ""; }; + 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_portable.xcframework; path = Frameworks/kernels_portable.xcframework; sourceTree = ""; }; + 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_quantized.xcframework; path = Frameworks/kernels_quantized.xcframework; sourceTree = ""; }; + 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libsqlite3.0.tbd; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/usr/lib/libsqlite3.0.tbd; sourceTree = DEVELOPER_DIR; }; + 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShadersGraph.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShadersGraph.framework; sourceTree = DEVELOPER_DIR; }; + 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShaders.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShaders.framework; sourceTree = DEVELOPER_DIR; }; + 03ED6D142C8AAFFF00F2D6EE /* 
Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/Metal.framework; sourceTree = DEVELOPER_DIR; }; + 03ED6D162C8AB00500F2D6EE /* CoreML.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreML.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/CoreML.framework; sourceTree = DEVELOPER_DIR; }; + 03ED6D182C8AB00A00F2D6EE /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/Accelerate.framework; sourceTree = DEVELOPER_DIR; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 03B2D3612C8A515A0046936E /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 03B2D3722C8A515C0046936E /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 03ED6D192C8AB00A00F2D6EE /* Accelerate.framework in Frameworks */, + 03ED6D172C8AB00500F2D6EE /* CoreML.framework in Frameworks */, + 03ED6D152C8AAFFF00F2D6EE /* Metal.framework in Frameworks */, + 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */, + 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */, + 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */, + 03ED6CFF2C8AAFB300F2D6EE /* backend_coreml.xcframework in Frameworks */, + 03ED6D032C8AAFB300F2D6EE /* backend_xnnpack.xcframework in Frameworks */, + 03ED6D092C8AAFB300F2D6EE /* kernels_optimized.xcframework in Frameworks */, + 03ED6D012C8AAFB300F2D6EE /* backend_mps.xcframework in Frameworks */, + 03ED6D0D2C8AAFB300F2D6EE /* 
kernels_quantized.xcframework in Frameworks */, + 03ED6D0B2C8AAFB300F2D6EE /* kernels_portable.xcframework in Frameworks */, + 03ED6D052C8AAFB300F2D6EE /* executorch.xcframework in Frameworks */, + 03ED6D072C8AAFB300F2D6EE /* kernels_custom.xcframework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 03B2D35B2C8A515A0046936E = { + isa = PBXGroup; + children = ( + 03B2D3662C8A515A0046936E /* App */, + 03ED6CEB2C8AAF5300F2D6EE /* Frameworks */, + 03C7FA322C8AA24200E6E9AE /* Models */, + 03B2D3782C8A515C0046936E /* Tests */, + 03B2D3652C8A515A0046936E /* Products */, + ); + sourceTree = ""; + }; + 03B2D3652C8A515A0046936E /* Products */ = { + isa = PBXGroup; + children = ( + 03B2D3642C8A515A0046936E /* Benchmark.app */, + 03B2D3752C8A515C0046936E /* Tests.xctest */, + ); + name = Products; + sourceTree = ""; + }; + 03B2D3662C8A515A0046936E /* App */ = { + isa = PBXGroup; + children = ( + 03B2D3672C8A515A0046936E /* App.swift */, + 03B2D36D2C8A515B0046936E /* App.entitlements */, + ); + path = App; + sourceTree = SOURCE_ROOT; + }; + 03B2D3782C8A515C0046936E /* Tests */ = { + isa = PBXGroup; + children = ( + 03B2D3792C8A515C0046936E /* Tests.mm */, + 03B019502C8A80D30044D558 /* Tests.xcconfig */, + 037C96A02C8A570B00B3DF38 /* Tests.xctestplan */, + ); + path = Tests; + sourceTree = SOURCE_ROOT; + }; + 03ED6CEB2C8AAF5300F2D6EE /* Frameworks */ = { + isa = PBXGroup; + children = ( + 03ED6D182C8AB00A00F2D6EE /* Accelerate.framework */, + 03ED6D162C8AB00500F2D6EE /* CoreML.framework */, + 03ED6D142C8AAFFF00F2D6EE /* Metal.framework */, + 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */, + 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */, + 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */, + 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */, + 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */, + 03ED6D022C8AAFB300F2D6EE /* 
backend_xnnpack.xcframework */, + 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */, + 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */, + 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */, + 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */, + 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */, + ); + name = Frameworks; + sourceTree = SOURCE_ROOT; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + 03B2D3632C8A515A0046936E /* App */ = { + isa = PBXNativeTarget; + buildConfigurationList = 03B2D3892C8A515C0046936E /* Build configuration list for PBXNativeTarget "App" */; + buildPhases = ( + 03B2D3602C8A515A0046936E /* Sources */, + 03B2D3612C8A515A0046936E /* Frameworks */, + 03B2D3622C8A515A0046936E /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = App; + productName = Benchmark; + productReference = 03B2D3642C8A515A0046936E /* Benchmark.app */; + productType = "com.apple.product-type.application"; + }; + 03B2D3742C8A515C0046936E /* Tests */ = { + isa = PBXNativeTarget; + buildConfigurationList = 03B2D38C2C8A515C0046936E /* Build configuration list for PBXNativeTarget "Tests" */; + buildPhases = ( + 03B2D3712C8A515C0046936E /* Sources */, + 03B2D3722C8A515C0046936E /* Frameworks */, + 03B2D3732C8A515C0046936E /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + 03B2D3772C8A515C0046936E /* PBXTargetDependency */, + ); + name = Tests; + productName = BenchmarkTests; + productReference = 03B2D3752C8A515C0046936E /* Tests.xctest */; + productType = "com.apple.product-type.bundle.unit-test"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 03B2D35C2C8A515A0046936E /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1540; + LastUpgradeCheck = 1540; + TargetAttributes = { + 03B2D3632C8A515A0046936E = { + CreatedOnToolsVersion = 15.4; + }; + 03B2D3742C8A515C0046936E = 
{ + CreatedOnToolsVersion = 15.4; + TestTargetID = 03B2D3632C8A515A0046936E; + }; + }; + }; + buildConfigurationList = 03B2D35F2C8A515A0046936E /* Build configuration list for PBXProject "Benchmark" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = 03B2D35B2C8A515A0046936E; + productRefGroup = 03B2D3652C8A515A0046936E /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 03B2D3632C8A515A0046936E /* App */, + 03B2D3742C8A515C0046936E /* Tests */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + 03B2D3622C8A515A0046936E /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 03B2D3732C8A515C0046936E /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 03C7FA382C8AA3EC00E6E9AE /* Models in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 03B2D3602C8A515A0046936E /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 03B2D3682C8A515A0046936E /* App.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 03B2D3712C8A515C0046936E /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 03B2D37A2C8A515C0046936E /* Tests.mm in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin PBXTargetDependency section */ + 03B2D3772C8A515C0046936E /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 03B2D3632C8A515A0046936E /* App */; + targetProxy = 03B2D3762C8A515C0046936E /* PBXContainerItemProxy */; + }; +/* End PBXTargetDependency section */ + +/* Begin XCBuildConfiguration section */ + 03B2D3872C8A515C0046936E 
/* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG 
$(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + 03B2D3882C8A515C0046936E /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SWIFT_COMPILATION_MODE = wholemodule; + }; + name = Release; + }; + 03B2D38A2C8A515C0046936E /* 
Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + CODE_SIGN_ENTITLEMENTS = App/App.entitlements; + "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + "INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UILaunchScreen_Generation[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UILaunchScreen_Generation[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UIStatusBarStyle[sdk=iphoneos*]" = UIStatusBarStyleDefault; + "INFOPLIST_KEY_UIStatusBarStyle[sdk=iphonesimulator*]" = UIStatusBarStyleDefault; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + LD_RUNPATH_SEARCH_PATHS = "@executable_path/Frameworks"; + "LD_RUNPATH_SEARCH_PATHS[sdk=macosx*]" = "@executable_path/../Frameworks"; + MACOSX_DEPLOYMENT_TARGET = 11.0; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.Benchmark; + PRODUCT_NAME = Benchmark; + SDKROOT = auto; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + 03B2D38B2C8A515C0046936E /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + CODE_SIGN_ENTITLEMENTS = App/App.entitlements; + "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + "INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphonesimulator*]" = YES; + 
"INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UILaunchScreen_Generation[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UILaunchScreen_Generation[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UIStatusBarStyle[sdk=iphoneos*]" = UIStatusBarStyleDefault; + "INFOPLIST_KEY_UIStatusBarStyle[sdk=iphonesimulator*]" = UIStatusBarStyleDefault; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + LD_RUNPATH_SEARCH_PATHS = "@executable_path/Frameworks"; + "LD_RUNPATH_SEARCH_PATHS[sdk=macosx*]" = "@executable_path/../Frameworks"; + MACOSX_DEPLOYMENT_TARGET = 11.0; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.Benchmark; + PRODUCT_NAME = Benchmark; + SDKROOT = auto; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; + 03B2D38D2C8A515C0046936E /* Debug */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 03B019502C8A80D30044D558 /* Tests.xcconfig */; + buildSettings = { + ALWAYS_EMBED_SWIFT_STANDARD_LIBRARIES = YES; + BUNDLE_LOADER = "$(TEST_HOST)"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + GENERATE_INFOPLIST_FILE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + MACOSX_DEPLOYMENT_TARGET = 10.15; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.BenchmarkTests; + PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = auto; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SWIFT_EMIT_LOC_STRINGS = NO; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/Benchmark.app/$(BUNDLE_EXECUTABLE_FOLDER_PATH)/Benchmark"; + }; + name = Debug; + }; + 03B2D38E2C8A515C0046936E /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 03B019502C8A80D30044D558 /* Tests.xcconfig 
*/; + buildSettings = { + ALWAYS_EMBED_SWIFT_STANDARD_LIBRARIES = YES; + BUNDLE_LOADER = "$(TEST_HOST)"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + GENERATE_INFOPLIST_FILE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + MACOSX_DEPLOYMENT_TARGET = 10.15; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.BenchmarkTests; + PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = auto; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SWIFT_EMIT_LOC_STRINGS = NO; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/Benchmark.app/$(BUNDLE_EXECUTABLE_FOLDER_PATH)/Benchmark"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 03B2D35F2C8A515A0046936E /* Build configuration list for PBXProject "Benchmark" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 03B2D3872C8A515C0046936E /* Debug */, + 03B2D3882C8A515C0046936E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 03B2D3892C8A515C0046936E /* Build configuration list for PBXNativeTarget "App" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 03B2D38A2C8A515C0046936E /* Debug */, + 03B2D38B2C8A515C0046936E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 03B2D38C2C8A515C0046936E /* Build configuration list for PBXNativeTarget "Tests" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 03B2D38D2C8A515C0046936E /* Debug */, + 03B2D38E2C8A515C0046936E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 03B2D35C2C8A515A0046936E /* Project object */; +} diff --git a/extension/apple/Benchmark/Benchmark.xcodeproj/xcshareddata/xcschemes/Benchmark.xcscheme 
b/extension/apple/Benchmark/Benchmark.xcodeproj/xcshareddata/xcschemes/Benchmark.xcscheme new file mode 100644 index 0000000000..ebfe1e5fd3 --- /dev/null +++ b/extension/apple/Benchmark/Benchmark.xcodeproj/xcshareddata/xcschemes/Benchmark.xcscheme @@ -0,0 +1,107 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/extension/apple/Benchmark/Tests/Tests.mm b/extension/apple/Benchmark/Tests/Tests.mm new file mode 100644 index 0000000000..5cf958765d --- /dev/null +++ b/extension/apple/Benchmark/Tests/Tests.mm @@ -0,0 +1,105 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import + +#import + +#import +#import + +using namespace ::executorch::extension; +using namespace ::executorch::runtime; + +@interface Tests : XCTestCase +@end + +@implementation Tests + ++ (void)initialize { + if (self == [Tests class]) { + NSString *modelsDir = [[NSBundle bundleForClass:[self class]].resourcePath + stringByAppendingPathComponent:@"Models"]; + NSArray *models = + [NSFileManager.defaultManager contentsOfDirectoryAtPath:modelsDir + error:nil]; + for (NSString *model in models) { + NSString *modelName = model.stringByDeletingPathExtension; + NSString *modelPath = [modelsDir stringByAppendingPathComponent:model]; + XCTAssertGreaterThan(modelPath.length, 0); + + SEL testLoadSelector = NSSelectorFromString( + [NSString stringWithFormat:@"test_load_%@", modelName]); + IMP testLoadImplementation = imp_implementationWithBlock(^(id _self) { + auto __block module = std::make_unique(modelPath.UTF8String); + [_self + measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] + options:XCTMeasureOptions.defaultOptions + block:^{ + XCTAssertEqual(module->load_method("forward"), + Error::Ok); + }]; + }); + class_addMethod( + [self class], testLoadSelector, 
testLoadImplementation, "v@:"); + + SEL testForwardSelector = NSSelectorFromString( + [NSString stringWithFormat:@"test_forward_%@", modelName]); + IMP testForwardImplementation = imp_implementationWithBlock(^(id _self) { + auto __block module = std::make_unique(modelPath.UTF8String); + XCTAssertEqual(module->load_method("forward"), Error::Ok); + + const auto method_meta = module->method_meta("forward"); + XCTAssertEqual(method_meta.error(), Error::Ok); + + const auto num_inputs = method_meta->num_inputs(); + XCTAssertGreaterThan(num_inputs, 0); + + std::vector> buffers; + buffers.reserve(num_inputs); + std::vector tensors; + tensors.reserve(num_inputs); + std::vector __block inputs; + inputs.reserve(num_inputs); + + for (auto index = 0; index < num_inputs; ++index) { + auto input_tag = method_meta->input_tag(index); + XCTAssertEqual(input_tag.error(), Error::Ok); + + switch (*input_tag) { + case Tag::Tensor: { + const auto tensor_meta = method_meta->input_tensor_meta(index); + XCTAssertEqual(tensor_meta.error(), Error::Ok); + + const auto sizes = tensor_meta->sizes(); + buffers.emplace_back(tensor_meta->nbytes(), + 0b01010101); // Set all bytes to be non-zero. 
+ tensors.emplace_back(from_blob(buffers.rbegin()->data(), + {sizes.begin(), sizes.end()}, + tensor_meta->scalar_type())); + inputs.emplace_back(tensors.back()); + } break; + default: + XCTFail("Unsupported tag %i at input %d", *input_tag, index); + } + } + [_self + measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] + options:XCTMeasureOptions.defaultOptions + block:^{ + XCTAssertEqual(module->forward(inputs).error(), + Error::Ok); + }]; + }); + class_addMethod( + [self class], testForwardSelector, testForwardImplementation, "v@:"); + } + } +} + +@end diff --git a/extension/apple/Benchmark/Tests/Tests.xcconfig b/extension/apple/Benchmark/Tests/Tests.xcconfig new file mode 100644 index 0000000000..e8168046c3 --- /dev/null +++ b/extension/apple/Benchmark/Tests/Tests.xcconfig @@ -0,0 +1,26 @@ +OTHER_LDFLAGS[sdk=iphonesimulator*] = $(inherited) \ +-force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-simulator-release.a + +OTHER_LDFLAGS[sdk=iphoneos*] = $(inherited) \ +-force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-ios-release.a + +OTHER_LDFLAGS[sdk=macos*] = $(inherited) \ +-force_load 
$(BUILT_PRODUCTS_DIR)/libexecutorch-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-macos-release.a diff --git a/extension/apple/Benchmark/Tests/Tests.xctestplan b/extension/apple/Benchmark/Tests/Tests.xctestplan new file mode 100644 index 0000000000..025f50f194 --- /dev/null +++ b/extension/apple/Benchmark/Tests/Tests.xctestplan @@ -0,0 +1,28 @@ +{ + "configurations" : [ + { + "id" : "0430A5ED-FD8D-444E-9933-740E01CCD53C", + "name" : "Test Scheme Action", + "options" : { + + } + } + ], + "defaultOptions" : { + "targetForVariableExpansion" : { + "containerPath" : "container:Benchmark.xcodeproj", + "identifier" : "03B2D3632C8A515A0046936E", + "name" : "App" + } + }, + "testTargets" : [ + { + "target" : { + "containerPath" : "container:Benchmark.xcodeproj", + "identifier" : "03B2D3742C8A515C0046936E", + "name" : "Tests" + } + } + ], + "version" : 1 +} diff --git a/extension/aten_util/test/targets.bzl b/extension/aten_util/test/targets.bzl index b724bbce2b..db2247fd60 100644 --- a/extension/aten_util/test/targets.bzl +++ b/extension/aten_util/test/targets.bzl @@ -18,7 +18,6 @@ def define_common_targets(): "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/kernel:operator_registry", "//executorch/extension/aten_util:aten_bridge", - "//executorch/extension/runner_util:managed_tensor", "//executorch/runtime/core/exec_aten/testing_util:tensor_util", ], external_deps = [ diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 4f5bab7bc0..2c2e52c744 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -65,6 +65,7 @@ def 
__init__( dtype, use_kv_cache, example_inputs, + args: Optional[Any] = None, enable_dynamic_shape: bool = False, verbose: bool = False, metadata: Optional[dict] = None, @@ -87,6 +88,7 @@ def __init__( self.output_dir = "." self.dynamic_shapes = dynamic_shapes self._saved_pte_filename = None + self.args = args def set_output_dir(self, output_dir: str) -> "LLMEdgeManager": """ @@ -162,9 +164,20 @@ def capture_pre_autograd_graph(self) -> "LLMEdgeManager": # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up) with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): # pyre-fixme[8] - self.pre_autograd_graph_module = capture_pre_autograd_graph( - self.model, self.example_inputs, dynamic_shapes=dynamic_shape - ) + if hasattr(self.args, "qnn") and self.args.qnn: + # TODO: this is temporary and export_for_training doesn't work with qnn either. We need a + # functional graph. See issue https://github.com/pytorch/executorch/pull/4627 for more details + self.pre_autograd_graph_module = torch.export.export( + self.model, + self.example_inputs, + dynamic_shapes=dynamic_shape, + strict=True, + ).module() + else: + self.pre_autograd_graph_module = capture_pre_autograd_graph( + self.model, self.example_inputs, dynamic_shapes=dynamic_shape + ) + return self def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManager": @@ -210,10 +223,8 @@ def export_to_edge(self) -> "LLMEdgeManager": # 2. 
torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up) with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): if self.pre_autograd_graph_module is None: - # pyre-fixme[8] - self.pre_autograd_graph_module = capture_pre_autograd_graph( - self.model, self.example_inputs, dynamic_shapes=dynamic_shape - ) + # Run capture_pre_autograd_graph if it didn't run + self.capture_pre_autograd_graph() self.edge_manager = export_to_edge( self.pre_autograd_graph_module, # pyre-fixme[6] self.example_inputs, diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index 0d9f7c6cfd..e75d5bef3f 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -140,7 +140,7 @@ def get_qnn_partitioner( return QnnPartitioner( # pyre-fixme[16] generate_qnn_executorch_compiler_spec( # pyre-fixme[16] - soc_model=QcomChipset.SM8450, # default to SM8450 # pyre-fixme[16] + soc_model=QcomChipset.SM8650, # default to SM8650 # pyre-fixme[16] # pyre-fixme[16] backend_options=generate_htp_compiler_spec( use_fp16=use_fp16, diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index 27bc84fe11..a9245768b9 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -43,7 +43,9 @@ target_include_directories( add_library(extension_llm_runner STATIC ${_extension_llm_runner__srcs}) -set(runner_deps executorch extension_module extension_data_loader) +set(runner_deps executorch extension_data_loader extension_module + extension_tensor +) target_link_libraries(extension_llm_runner PUBLIC ${runner_deps}) diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index dbffac46fc..70ecafee81 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -31,7 +31,6 @@ #include #include #include -#include namespace executorch { namespace 
extension { @@ -62,6 +61,50 @@ class MultimodalRunner { std::function token_callback = {}, std::function stats_callback = {}) = 0; + /** + * Prefill an LLaVA Module with the given images input. + * @param images The image input to LLaVA. + * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. + * @return The error status of prefilling images. + */ + virtual runtime::Error prefill_images( + std::vector& images, + int64_t& start_pos) = 0; + + /** + * Prefill an LLaVA Module with the given text input. + * @param prompt The text prompt to LLaVA. + * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. + * @param bos The number of BOS (begin of sequence) token. + * @param eos The number of EOS (end of sequence) token. + * @return The generated token of the LLaVA Module after prefill prompt. + */ + virtual runtime::Result prefill_prompt( + const std::string& prompt, + int64_t& start_pos, + int8_t bos = 0, + int8_t eos = 0) = 0; + + /** + * Generate tokens from the given prompt, starting from the given position. + * @param prompt The text prompt to LLaVA. + * @param seq_len The total sequence length, including the prompt tokens and + * new tokens. + * @param start_pos The starting position in KV cache of the input in the LLM. + * @param token_callback What to do after a token is generated. + * @param stats_callback What to do with Stats. + * @return The error code. 
+ */ + virtual runtime::Error generate_from_pos( + const std::string& prompt, + int32_t seq_len = 1024, + int64_t start_pos = 0, + std::function token_callback = {}, + std::function + stats_callback = {}) = 0; + inline void stop() { text_token_generator_->stop(); } diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl index 4d715980af..f20240956c 100644 --- a/extension/llm/runner/targets.bzl +++ b/extension/llm/runner/targets.bzl @@ -26,7 +26,7 @@ def define_common_targets(): ":stats", "//executorch/extension/llm/sampler:sampler" + aten_suffix, "//executorch/extension/module:module" + aten_suffix, - "//executorch/extension/runner_util:managed_tensor" + aten_suffix, + "//executorch/extension/tensor:tensor" + aten_suffix, ], ) @@ -41,7 +41,7 @@ def define_common_targets(): ":text_decoder_runner" + aten_suffix, "//executorch/extension/llm/tokenizer:tokenizer_header", "//executorch/extension/module:module" + aten_suffix, - "//executorch/extension/runner_util:managed_tensor" + aten_suffix, + "//executorch/extension/tensor:tensor" + aten_suffix, ], ) @@ -55,7 +55,7 @@ def define_common_targets(): ":text_decoder_runner" + aten_suffix, "//executorch/extension/llm/tokenizer:tokenizer_header", "//executorch/extension/module:module" + aten_suffix, - "//executorch/extension/runner_util:managed_tensor" + aten_suffix, + "//executorch/extension/tensor:tensor" + aten_suffix, ], ) diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp index 5b77c69825..928a21244a 100644 --- a/extension/llm/runner/text_decoder_runner.cpp +++ b/extension/llm/runner/text_decoder_runner.cpp @@ -38,14 +38,11 @@ TextDecoderRunner::TextDecoderRunner( // input. It should be safe to call multiple times with the same inputs. The // outer loop (call site) is responsible for managing state. 
::executorch::runtime::Result TextDecoderRunner::step( - ManagedTensor& managed_tokens, - ManagedTensor& managed_start_pos) { - auto tokens = managed_tokens.get_aliasing_tensor(); + TensorPtr& tokens, + TensorPtr& start_pos) { // ET_LOG(Info, "Input token %" PRIu64, input_token); if (use_kv_cache_) { - auto start_pos = managed_start_pos.get_aliasing_tensor(); - ::executorch::runtime::Result> - outputs_res = module_->forward({tokens, start_pos}); + auto outputs_res = module_->forward({*tokens, *start_pos}); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_CHECK_MSG( outputs_res.get().size() == 1, @@ -57,10 +54,9 @@ ::executorch::runtime::Result TextDecoderRunner::step( // Return the logits tensor return outputs_res.get()[0].toTensor(); } else { // no kv cache - (void)managed_start_pos; // unused + (void)start_pos; // unused - ::executorch::runtime::Result> - outputs_res = module_->forward(tokens); + auto outputs_res = module_->forward(tokens); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_CHECK_MSG( outputs_res.get().size() == 1, diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index 70ee1d0136..16adeeed0a 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -12,7 +12,7 @@ #include #include -#include +#include // patternlint-disable-next-line executorch-cpp-nostdinc #include @@ -38,8 +38,8 @@ class TextDecoderRunner { * @return The output of the LLM Module. This will be a tensor of logits. */ virtual ::executorch::runtime::Result step( - ManagedTensor& input, - ManagedTensor& start_pos); + TensorPtr& input, + TensorPtr& start_pos); /** * Load the Module for text decode purpose. 
diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp index 53a737e6af..705583d638 100644 --- a/extension/llm/runner/text_prefiller.cpp +++ b/extension/llm/runner/text_prefiller.cpp @@ -25,7 +25,7 @@ TextPrefiller::TextPrefiller( ::executorch::runtime::Result TextPrefiller::prefill( std::vector& prompt_tokens, - int64_t start_pos) { + int64_t& start_pos) { ET_CHECK_MSG(!prompt_tokens.empty(), "Prompt cannot be null"); if (!text_decoder_runner_->is_method_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load()); @@ -38,54 +38,51 @@ ::executorch::runtime::Result TextPrefiller::prefill( uint64_t cur_token; if (enable_parallel_prefill_ || !use_kv_cache_) { // initialize tensor wrappers - ManagedTensor managed_tokens( + auto tokens = from_blob( prompt_tokens.data(), {1, num_prompt_tokens}, exec_aten::ScalarType::Long); - ManagedTensor managed_start_pos( - &start_pos, {1}, exec_aten::ScalarType::Long); + auto start_pos_tensor = + from_blob(&start_pos, {1}, exec_aten::ScalarType::Long); - ::executorch::runtime::Result outputs_res = - text_decoder_runner_->step(managed_tokens, managed_start_pos); + auto outputs_res = text_decoder_runner_->step(tokens, start_pos_tensor); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_LOG( Info, "Prefill token result numel(): %zu", outputs_res.get().numel()); + start_pos += num_prompt_tokens; cur_token = text_decoder_runner_->logits_to_token(outputs_res.get()); } else { // sequential prefill int64_t pos = 0; // position in the sequence - // token & pos - int64_t pos_data = 0; // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) cur_token = prompt_tokens[0]; // initialize tensor wrappers - ManagedTensor managed_tokens( - &cur_token, {1, 1}, exec_aten::ScalarType::Long); + auto tokens = from_blob(&cur_token, {1, 1}, exec_aten::ScalarType::Long); - ManagedTensor managed_start_pos( - &pos_data, {1}, exec_aten::ScalarType::Long); + auto start_pos_tensor = + from_blob(&start_pos, 
{1}, exec_aten::ScalarType::Long); // run the first token and get back logits tensor. Assuming the first token // is bos so don't callback. - exec_aten::Tensor logits_tensor = ET_UNWRAP( - text_decoder_runner_->step(managed_tokens, managed_start_pos)); + auto logits_tensor = + ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos_tensor)); - pos = 1; // start from index 1 + pos += 1; // start the loop from index 1 + start_pos += 1; while (pos < num_prompt_tokens) { // Run the model - pos_data = start_pos + pos; - // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) cur_token = prompt_tokens[pos]; - logits_tensor = ET_UNWRAP( - text_decoder_runner_->step(managed_tokens, managed_start_pos)); + logits_tensor = + ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos_tensor)); pos++; + start_pos++; } cur_token = text_decoder_runner_->logits_to_token(logits_tensor); diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h index a8ba77b860..0ea126f32d 100644 --- a/extension/llm/runner/text_prefiller.h +++ b/extension/llm/runner/text_prefiller.h @@ -36,7 +36,7 @@ class TextPrefiller { */ ::executorch::runtime::Result prefill( std::vector& prompt_tokens, - int64_t start_pos = 0); + int64_t& start_pos); private: TextDecoderRunner* text_decoder_runner_; diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h index 46d682a4e4..01887e7560 100644 --- a/extension/llm/runner/text_token_generator.h +++ b/extension/llm/runner/text_token_generator.h @@ -12,6 +12,7 @@ #include #include #include +#include namespace executorch { namespace extension { @@ -69,15 +70,18 @@ class TextTokenGenerator { } // initialize tensor wrappers - ManagedTensor tokens_managed( - token_data.data(), token_shape, exec_aten::ScalarType::Long); + auto tokens_managed = from_blob( + token_data.data(), + token_shape, + exec_aten::ScalarType::Long, + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); - ManagedTensor 
start_pos_managed(&pos, {1}, exec_aten::ScalarType::Long); + auto start_pos_managed = from_blob(&pos, {1}, exec_aten::ScalarType::Long); // Generate our tokens while (pos < seq_len - 1) { // Run the model - ::executorch::runtime::Result logits_res = + auto logits_res = text_decoder_runner_->step(tokens_managed, start_pos_managed); ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); @@ -98,7 +102,8 @@ class TextTokenGenerator { } else { // push it to the back token_data.push_back(cur_token); - tokens_managed.resize({1, static_cast(token_data.size())}); + ET_CHECK_OK_OR_RETURN_ERROR(resize_tensor_ptr( + tokens_managed, {1, static_cast(token_data.size())})); } // print the token as string, decode it with the Tokenizer object diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h index baf6af328b..2f1d084811 100644 --- a/extension/llm/runner/util.h +++ b/extension/llm/runner/util.h @@ -10,6 +10,9 @@ #include #include #include +#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__) +#include +#endif namespace executorch { namespace extension { @@ -44,6 +47,27 @@ long inline time_in_ms() { return time.tv_sec * 1000 + time.tv_nsec / 1000000; } +// ---------------------------------------------------------------------------- +// utilities: memory usage + +// Returns the current RSS in bytes. Returns 0 if not supported. +// RSS: Resident Set Size, the amount of memory currently in the RAM for this +// process. These values are approximate, and are only used for logging +// purposes. +size_t inline get_rss_bytes() { +#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__) + struct rusage r_usage; + if (getrusage(RUSAGE_SELF, &r_usage) == 0) { + return r_usage.ru_maxrss * 1024; + } +#endif // __linux__ || __ANDROID__ || __unix__ + // Unsupported platform like Windows, or getrusage() failed. + // __APPLE__ and __MACH__ are not supported because r_usage.ru_maxrss does not + // consistently return kbytes on macOS. 
On older versions of macOS, it + // returns bytes, but on newer versions it returns kbytes. Need to figure out + // when this changed. + return 0; +} } // namespace llm } // namespace extension } // namespace executorch @@ -53,6 +77,7 @@ namespace executor { namespace util { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. +using ::executorch::extension::llm::get_rss_bytes; using ::executorch::extension::llm::safe_printf; using ::executorch::extension::llm::time_in_ms; } // namespace util diff --git a/extension/llm/tokenizer/tiktoken.cpp b/extension/llm/tokenizer/tiktoken.cpp index f8ccf74fd6..f99ac2e955 100644 --- a/extension/llm/tokenizer/tiktoken.cpp +++ b/extension/llm/tokenizer/tiktoken.cpp @@ -266,7 +266,11 @@ Tiktoken::_split_with_allowed_special_token( return std::make_pair(std::nullopt, input); } +#if __cplusplus >= 202002L auto start = input.begin(); +#else + const char* start = input.data(); +#endif std::string special; while (true) { if (!re2::RE2::FindAndConsume(&input, *_special_token_regex, &special)) { @@ -276,9 +280,15 @@ Tiktoken::_split_with_allowed_special_token( if (allowed_special.count(special) == 1) { // Found an allowed special token, split the text with it. 
+#if __cplusplus >= 202002L return std::make_pair( special, re2::StringPiece(start, input.begin() - start - special.size())); +#else + return std::make_pair( + special, + re2::StringPiece(start, (input.data() - start) - special.size())); +#endif } // else try to find the next special token } diff --git a/extension/module/test/module_test.cpp b/extension/module/test/module_test.cpp index 4ef454e1c7..75cead25a7 100644 --- a/extension/module/test/module_test.cpp +++ b/extension/module/test/module_test.cpp @@ -22,7 +22,7 @@ namespace torch::executor { class ModuleTest : public ::testing::Test { protected: static void SetUpTestSuite() { - model_path_ = std::getenv("RESOURCES_PATH") + std::string("/model.pte"); + model_path_ = std::getenv("RESOURCES_PATH") + std::string("/add.pte"); } static std::string model_path_; @@ -95,7 +95,7 @@ TEST_F(ModuleTest, TestMethodMeta) { const auto meta = module.method_meta("forward"); EXPECT_TRUE(meta.ok()); EXPECT_STREQ(meta->name(), "forward"); - EXPECT_EQ(meta->num_inputs(), 1); + EXPECT_EQ(meta->num_inputs(), 2); EXPECT_EQ(*(meta->input_tag(0)), Tag::Tensor); EXPECT_EQ(meta->num_outputs(), 1); EXPECT_EQ(*(meta->output_tag(0)), Tag::Tensor); @@ -103,9 +103,8 @@ TEST_F(ModuleTest, TestMethodMeta) { const auto input_meta = meta->input_tensor_meta(0); EXPECT_TRUE(input_meta.ok()); EXPECT_EQ(input_meta->scalar_type(), ScalarType::Float); - EXPECT_EQ(input_meta->sizes().size(), 2); + EXPECT_EQ(input_meta->sizes().size(), 1); EXPECT_EQ(input_meta->sizes()[0], 1); - EXPECT_EQ(input_meta->sizes()[1], 2); const auto output_meta = meta->output_tensor_meta(0); EXPECT_TRUE(output_meta.ok()); @@ -124,19 +123,22 @@ TEST_F(ModuleTest, TestNonExistentMethodMeta) { TEST_F(ModuleTest, TestExecute) { Module module(model_path_); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", 
Tensor(&tensor)); + const auto result = + module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + EXPECT_TRUE(result.ok()); + EXPECT_TRUE(result.ok()); EXPECT_TRUE(module.is_loaded()); EXPECT_TRUE(module.is_method_loaded("forward")); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestExecutePreload) { @@ -145,17 +147,18 @@ TEST_F(ModuleTest, TestExecutePreload) { const auto error = module.load(); EXPECT_EQ(error, Error::Ok); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", Tensor(&tensor)); + const auto result = + module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestExecutePreload_method) { @@ -164,17 +167,18 @@ TEST_F(ModuleTest, TestExecutePreload_method) { const auto error = module.load_method("forward"); EXPECT_EQ(error, Error::Ok); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", Tensor(&tensor)); + const auto result = + module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestExecutePreloadProgramAndMethod) { @@ -186,17 +190,18 @@ TEST_F(ModuleTest, TestExecutePreloadProgramAndMethod) { const auto load_method_error = module.load_method("forward"); EXPECT_EQ(load_method_error, Error::Ok); - std::array input{1, 2}; - std::array 
sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", Tensor(&tensor)); + const auto result = + module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestExecuteOnNonExistent) { @@ -218,41 +223,42 @@ TEST_F(ModuleTest, TestExecuteOnCurrupted) { TEST_F(ModuleTest, TestGet) { Module module(model_path_); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.get("forward", Tensor(&tensor)); + const auto result = module.get("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestForward) { auto module = std::make_unique(model_path_); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module->forward(Tensor(&tensor)); + + const auto result = module->forward({Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); std::array input2{2, 3}; TensorImpl tensor2( ScalarType::Float, sizes.size(), sizes.data(), input2.data()); - const auto result2 = module->forward(Tensor(&tensor2)); + const auto result2 = module->forward({Tensor(&tensor2), Tensor(&tensor2)}); EXPECT_TRUE(result2.ok()); const auto data2 = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data2[0], 
2.5, 1e-5); + EXPECT_NEAR(data2[0], 4, 1e-5); } TEST_F(ModuleTest, TestForwardWithInvalidInputs) { @@ -303,23 +309,26 @@ TEST_F(ModuleTest, TestProgramSharingAndDataLoaderManagement) { EXPECT_EQ(load_error, Error::Ok); EXPECT_TRUE(module1->is_loaded()); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result1 = module1->execute("forward", Tensor(&tensor)); + auto result1 = + module1->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result1.ok()); auto module2 = std::make_unique(module1->program()); - auto result2 = module2->execute("forward", Tensor(&tensor)); + auto result2 = + module2->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result2.ok()); module1 = std::make_unique("/path/to/nonexistent/file.pte"); EXPECT_FALSE(module1->is_loaded()); - auto result3 = module2->execute("forward", Tensor(&tensor)); + auto result3 = + module2->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result3.ok()); } @@ -351,17 +360,17 @@ TEST_F(ModuleTest, TestProgramPersistenceAndReuseAfterModuleDestruction) { EXPECT_EQ(module.program(), shared_program); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result = module.execute("forward", Tensor(&tensor)); + auto result = module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { @@ -379,24 +388,24 @@ TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { EXPECT_TRUE(program != nullptr); auto thread = [](std::shared_ptr program, - const std::array& input) { + const std::array& input) { 
Module module(program); - std::array sizes{1, 2}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), (void*)input.data()); - const auto result = module.forward(Tensor(&tensor)); + const auto result = module.forward({Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], (input[0] + input[1]) / 2.0, 1e-5); + EXPECT_NEAR(data[0], (input[0] * 2), 1e-5); }; - std::thread t1(thread, program, std::array{1, 2}); - std::thread t2(thread, program, std::array{2, 3}); - std::thread t3(thread, program, std::array{3, 4}); - std::thread t4(thread, program, std::array{4, 5}); - std::thread t5(thread, program, std::array{5, 6}); + std::thread t1(thread, program, std::array{1}); + std::thread t2(thread, program, std::array{2}); + std::thread t3(thread, program, std::array{3}); + std::thread t4(thread, program, std::array{4}); + std::thread t5(thread, program, std::array{5}); t1.join(); t2.join(); diff --git a/extension/module/test/resources/README.md b/extension/module/test/resources/README.md new file mode 100644 index 0000000000..5067c870a3 --- /dev/null +++ b/extension/module/test/resources/README.md @@ -0,0 +1,4 @@ +## Resources + +### model.pte +- generated via `buck2 run fbcode//executorch/examples/portable/scripts:export -- --model_name="add"` after D62209852. 
diff --git a/extension/module/test/resources/add.pte b/extension/module/test/resources/add.pte new file mode 100644 index 0000000000000000000000000000000000000000..43252ca7d3d05e8fe847e122c9c7de976e0e0096 GIT binary patch literal 728 zcmZ`$O-_Sg5Pi1LVna-08q!40d z9v`$+<8G4NtCNznyo*d;{901sofq4 z&95FC(_^3>z=$tz@ie@O=wFQ6?sRR{e3+V%PkTlvcFUN0_=L1XT6i=0w*D?~&6+W_ zm?cw28ad;8ZTa+8zxfcf= X8imU+Ugk*@$4qIuZ+H9Wa$n&GV)QgK literal 0 HcmV?d00001 diff --git a/extension/module/test/resources/model.pte b/extension/module/test/resources/model.pte deleted file mode 100644 index 91b52416847fff9a794db423583a0c8c5a303d66..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1600 zcmbtUF>6y%6h2LpG`67x38h1a$0LYCkyaeiEbSs>a5ICDhBhX*B=CHR$qS80K{`1U zXNL|&931)!#9tu#4;(T$Iyg9V(8lk(C%1VGUBnAt?z!iF=R4<~d-K*rl*siS$iyu-6$x zFl7m-M2@Fz2}U6M0xKZM07ofLj3j#QKR!jPJqb zp3{bj>>9(kS26A**tmi#F5`Gk<{-`|+^qF;hi6cJ8ncUKU0gGsaD)4-k27I)K=^>g zK2QX*z%T5LfHQ!1Tmg0fo?+wC_98|5SnV|5io44@V?6_8?BGnalxusq56&>A^EsxI z2bbDU;A7kd;KsS~65M|ejZZO+Wvt;|^6=v{PA3|yZML2Zjnz9vY)0efwVz2_!%%$- z>=QehftP({e(LcZaGWVG;dp#o?mm=(^DD>p?W5Xh51X#mGJMpeZ?@KqEaUz&_O7v= z;(tu!He&fI_$A@U!DGiYUDifD_Q1XcEI8B0nbC1=*2C}8R5@p3;Z9S7{c-idwDEj} z^p^qlW(%lpu6r;2=tbA~B`+KV!Mf)+I$>mnQOwb52hE-d_xfGafV)~dsv-7rHr{_a zXa`X{?B9z9-N5+Qw0x>t*R0xA{{M~tm->%l=bq+mwj0q*+11*OMWY350{ef^=^fU4 GP5BEPFB_=< diff --git a/extension/runner_util/managed_tensor.h b/extension/runner_util/managed_tensor.h deleted file mode 100644 index 5c74f7550a..0000000000 --- a/extension/runner_util/managed_tensor.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -// @nolint PATTERNLINT Ok to use stdlib for this optional library -#include - -#include -#include -#include -#include - -#ifdef USE_ATEN_LIB -#include -#endif - -namespace executorch { -namespace extension { - -/** - * A tensor wrapper takes ownership of all the memory of the necessary metadata - * for exec_aten::Tensor. Note that it doesn't own the data memory. - */ -class ManagedTensor { - public: - /// The type used for elements of `sizes()`. - using SizesType = exec_aten::SizesType; - /// The type used for elements of `dim_order()`. - using DimOrderType = exec_aten::DimOrderType; - /// The type used for elements of `strides()`. - using StridesType = exec_aten::StridesType; - - ManagedTensor() = delete; - - explicit ManagedTensor( - void* data, - const std::vector& sizes, - exec_aten::ScalarType dtype) - : sizes_(sizes) { -#ifdef USE_ATEN_LIB - tensor_ = torch::from_blob(data, sizes, dtype); -#else - // Calculate strides. - strides_ = std::vector(sizes_.size()); - if (sizes_.size() > 0) { - strides_.back() = 1; - for (size_t i = strides_.size() - 1; i > 0; --i) { - strides_[i - 1] = strides_[i] * sizes_[i]; - } - } - - // Allocate TensorImpl. - tensor_impl_ = std::make_unique( - dtype, - sizes_.size(), - sizes_.data(), - data, - /*dim_order=*/nullptr, - strides_.data(), - executorch::runtime::TensorShapeDynamism::DYNAMIC_BOUND); -#endif - } - - void resize(const std::vector& new_sizes) { - auto err = executorch::runtime::resize_tensor( - this->get_aliasing_tensor(), - exec_aten::ArrayRef(new_sizes.data(), new_sizes.size())); - ET_CHECK(err == executorch::runtime::Error::Ok); - } - - /** - * Get the underlying Tensor object. This is assuming the copying is cheap. 
- */ - exec_aten::Tensor get_aliasing_tensor() { -#ifdef USE_ATEN_LIB - return tensor_; -#else - return exec_aten::Tensor(tensor_impl_.get()); -#endif - } - - private: - std::unique_ptr tensor_impl_; - std::vector sizes_; - std::vector strides_; -#ifdef USE_ATEN_LIB - exec_aten::Tensor tensor_; -#endif -}; - -} // namespace extension -} // namespace executorch - -namespace torch { -namespace executor { -// TODO(T197294990): Remove these deprecated aliases once all users have moved -// to the new `::executorch` namespaces. -using ::executorch::extension::ManagedTensor; -} // namespace executor -} // namespace torch diff --git a/extension/runner_util/targets.bzl b/extension/runner_util/targets.bzl index 43c0ed08f3..bc0fee197d 100644 --- a/extension/runner_util/targets.bzl +++ b/extension/runner_util/targets.bzl @@ -26,18 +26,3 @@ def define_common_targets(): "//executorch/runtime/executor:program" + aten_suffix, ], ) - - runtime.cxx_library( - name = "managed_tensor" + aten_suffix, - exported_headers = [ - "managed_tensor.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/runtime/core/exec_aten:lib" + aten_suffix, - "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, - ], - ) diff --git a/extension/runner_util/test/CMakeLists.txt b/extension/runner_util/test/CMakeLists.txt index 6b295611fd..aefb3b0417 100644 --- a/extension/runner_util/test/CMakeLists.txt +++ b/extension/runner_util/test/CMakeLists.txt @@ -23,7 +23,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_test_srcs inputs_test.cpp managed_tensor_test.cpp) +set(_test_srcs inputs_test.cpp) et_cxx_test( extension_runner_util_test diff --git a/extension/runner_util/test/managed_tensor_test.cpp b/extension/runner_util/test/managed_tensor_test.cpp deleted file mode 100644 index 8ac1285f2b..0000000000 --- a/extension/runner_util/test/managed_tensor_test.cpp +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include - -using namespace ::testing; -using exec_aten::DimOrderType; -using exec_aten::ScalarType; -using exec_aten::SizesType; -using exec_aten::StridesType; -using executorch::extension::ManagedTensor; -using executorch::runtime::ArrayRef; - -class ManagedTensorTest : public ::testing::Test { - protected: - void SetUp() override { - executorch::runtime::runtime_init(); - - data_ = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - sizes_ = {2, 3, 4}; - expected_strides_ = {12, 4, 1}; - managed_tensor_ = - std::make_unique(data_.data(), sizes_, ScalarType::Long); - } - - protected: - std::vector data_; - std::vector sizes_; - std::vector expected_strides_; - std::unique_ptr managed_tensor_; -}; - -TEST_F(ManagedTensorTest, Smoke) { - const auto tensor = managed_tensor_->get_aliasing_tensor(); - - EXPECT_EQ(tensor.sizes(), ArrayRef(sizes_.data(), sizes_.size())); - EXPECT_EQ(tensor.scalar_type(), ScalarType::Long); - EXPECT_EQ(tensor.const_data_ptr(), data_.data()); - for (size_t i = 0; i < expected_strides_.size(); ++i) { - EXPECT_EQ(tensor.strides()[i], expected_strides_[i]); - } -} - -TEST_F(ManagedTensorTest, ResizeWithUpdatedRank) { - // gtest death test doesn't work on iOS: - // https://github.com/google/googletest/issues/2834 -#if !GTEST_OS_IOS - EXPECT_EXIT( - 
managed_tensor_->resize(std::vector{2, 3, 4, 5}), - ::testing::KilledBySignal(SIGABRT), - ""); -#endif -} - -TEST_F(ManagedTensorTest, ResizeShrink) { - managed_tensor_->resize(std::vector{2, 2, 2}); - const auto tensor = managed_tensor_->get_aliasing_tensor(); - - std::vector expected_sizes = {2, 2, 2}; - EXPECT_EQ( - tensor.sizes(), - ArrayRef(expected_sizes.data(), expected_sizes.size())); - EXPECT_EQ(tensor.scalar_type(), ScalarType::Long); - EXPECT_EQ(tensor.const_data_ptr(), data_.data()); -} - -TEST_F(ManagedTensorTest, Resize) { - managed_tensor_->resize(std::vector{4, 3, 2}); - const auto tensor = managed_tensor_->get_aliasing_tensor(); - - std::vector expected_sizes = {4, 3, 2}; - EXPECT_EQ( - tensor.sizes(), - ArrayRef(expected_sizes.data(), expected_sizes.size())); - EXPECT_EQ(tensor.scalar_type(), ScalarType::Long); - EXPECT_EQ(tensor.const_data_ptr(), data_.data()); -} diff --git a/extension/runner_util/test/targets.bzl b/extension/runner_util/test/targets.bzl index 7c042ca9d9..f55a1ea995 100644 --- a/extension/runner_util/test/targets.bzl +++ b/extension/runner_util/test/targets.bzl @@ -30,15 +30,3 @@ def define_common_targets(is_fbcode = False): "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])", }, ) - - runtime.cxx_test( - name = "managed_tensor_test", - srcs = [ - "managed_tensor_test.cpp", - ], - deps = [ - "//executorch/extension/runner_util:managed_tensor", - "//executorch/runtime/core/exec_aten:lib", - "//executorch/runtime/core/exec_aten/util:tensor_util", - ], - ) diff --git a/kernels/README.md b/kernels/README.md index 4e9656e6e9..026778cc28 100644 --- a/kernels/README.md +++ b/kernels/README.md @@ -355,7 +355,7 @@ cmake . 
\ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_VULKAN=OFF \ -DEXECUTORCH_BUILD_XNNPACK=ON \ diff --git a/kernels/optimized/cpu/binary_ops.h b/kernels/optimized/cpu/binary_ops.h index 01f3eed401..6d941509f7 100644 --- a/kernels/optimized/cpu/binary_ops.h +++ b/kernels/optimized/cpu/binary_ops.h @@ -75,7 +75,8 @@ ElementwiseOptimizedPath inline select_optimized_path( ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); - if (a_type != b_type || a_type != out_type || a_type == ScalarType::Half) { + if (a_type != b_type || a_type != out_type || a_type == ScalarType::Half || + a_type == ScalarType::BFloat16) { return ElementwiseOptimizedPath::kNone; } if (a.sizes().equals(b.sizes()) || diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index 3b93870a61..31b0f7754f 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -80,7 +80,8 @@ Tensor& opt_mul_out( ScalarType out_type = out.scalar_type(); if (b.numel() == 1) { - if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) { + if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half && + a_type != ScalarType::BFloat16) { auto error = resize_tensor(out, a.sizes()); ET_KERNEL_CHECK_MSG( ctx, @@ -170,12 +171,12 @@ Tensor& opt_mul_out( InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { using CTYPE_IN = typename torch::executor:: promote_types::type; ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REALHB_TYPES(out_type, ctx, 
"mul.out", CTYPE_OUT, [&]() { + ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { apply_binary_elementwise_fn( [](const CTYPE_A val_a, const CTYPE_B val_b) { CTYPE_IN a_casted = static_cast(val_a); @@ -210,7 +211,7 @@ Tensor& opt_mul_scalar_out( ET_CHECK(common_type == out_type); - if (common_type == ScalarType::Half) { + if (common_type == ScalarType::Half || common_type == ScalarType::BFloat16) { common_type = ScalarType::Float; } @@ -219,7 +220,7 @@ Tensor& opt_mul_scalar_out( ET_CHECK_MSG(error == Error::Ok, "Failed to resize output tensor."); if (a_type == common_type && a_type == out_type && - a_type != ScalarType::Half) { + a_type != ScalarType::Half && a_type != ScalarType::BFloat16) { ET_SWITCH_REALB_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE, [&]() { ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "mul.Scalar_out", CTYPE_B, [&]() { CTYPE_B b_val; @@ -235,11 +236,11 @@ Tensor& opt_mul_scalar_out( }); }); } else { - ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE_A, [&]() { ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "mul.Scalar_out", CTYPE_B, [&]() { ET_SWITCH_REALB_TYPES( common_type, ctx, "mul.Scalar_out", CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES( + ET_SWITCH_REALHBBF16_TYPES( out_type, ctx, "mul.Scalar_out", CTYPE_OUT, [&]() { CTYPE_B b_val; ET_EXTRACT_SCALAR(b, b_val); diff --git a/kernels/portable/cpu/op_masked_fill.cpp b/kernels/portable/cpu/op_masked_fill.cpp index 7a72994b07..e6c0bb4442 100644 --- a/kernels/portable/cpu/op_masked_fill.cpp +++ b/kernels/portable/cpu/op_masked_fill.cpp @@ -39,6 +39,9 @@ Tensor& masked_fill_scalar_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, mask, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES_AND( Bool, in_type, ctx, "masked_fill.Scalar_out", CTYPE, [&]() { ET_SWITCH_REAL_TYPES_AND( diff --git a/kernels/portable/cpu/op_max.cpp b/kernels/portable/cpu/op_max.cpp index 
8f363ced4e..b36cde42e4 100644 --- a/kernels/portable/cpu/op_max.cpp +++ b/kernels/portable/cpu/op_max.cpp @@ -49,6 +49,24 @@ std::tuple max_out( InvalidArgument, (std::tuple({max, max_indices}))); + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, max), + InvalidArgument, + (std::tuple({max, max_indices}))); + + ET_KERNEL_CHECK( + ctx, + tensor_is_default_dim_order(max_indices), + InvalidArgument, + (std::tuple({max, max_indices}))); + + ET_KERNEL_CHECK( + ctx, + tensor_is_default_dim_order(in), + InvalidArgument, + (std::tuple({max, max_indices}))); + dim = dim < 0 ? dim + in.dim() : dim; ET_SWITCH_REAL_TYPES_AND( diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp index 1353479b29..e52a6fd072 100644 --- a/kernels/portable/cpu/op_maximum.cpp +++ b/kernels/portable/cpu/op_maximum.cpp @@ -75,6 +75,9 @@ Tensor& maximum_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp index 79e66c62b5..e930eb6c83 100644 --- a/kernels/portable/cpu/op_mean.cpp +++ b/kernels/portable/cpu/op_mean.cpp @@ -33,6 +33,11 @@ Tensor& mean_dim_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + ET_KERNEL_CHECK( ctx, resize_reduction_out(in, dim_list, keepdim, out) == Error::Ok, diff --git a/kernels/portable/cpu/op_min.cpp b/kernels/portable/cpu/op_min.cpp index 8e3b5a00b3..e4f5e5714f 100644 --- a/kernels/portable/cpu/op_min.cpp +++ b/kernels/portable/cpu/op_min.cpp @@ -49,6 +49,24 @@ std::tuple min_out( InvalidArgument, (std::tuple({min, min_indices}))); + ET_KERNEL_CHECK( + ctx, + 
tensors_have_same_dim_order(in, min), + InvalidArgument, + (std::tuple({min, min_indices}))); + + ET_KERNEL_CHECK( + ctx, + tensor_is_default_dim_order(min_indices), + InvalidArgument, + (std::tuple({min, min_indices}))); + + ET_KERNEL_CHECK( + ctx, + tensor_is_default_dim_order(in), + InvalidArgument, + (std::tuple({min, min_indices}))); + dim = dim < 0 ? dim + in.dim() : dim; ET_SWITCH_REAL_TYPES_AND( diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp index f18d1a6d36..84024beffa 100644 --- a/kernels/portable/cpu/op_minimum.cpp +++ b/kernels/portable/cpu/op_minimum.cpp @@ -75,6 +75,9 @@ Tensor& minimum_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); diff --git a/kernels/portable/cpu/op_mm.cpp b/kernels/portable/cpu/op_mm.cpp index 6903bf3cad..4a6a8f3cfd 100644 --- a/kernels/portable/cpu/op_mm.cpp +++ b/kernels/portable/cpu/op_mm.cpp @@ -29,6 +29,11 @@ mm_out(RuntimeContext& ctx, const Tensor& in, const Tensor& mat2, Tensor& out) { InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, mat2, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + ET_SWITCH_REAL_TYPES_AND(Half, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() { size_t m = in.size(0); size_t n = in.size(1); diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index c933d10d27..8fc4f9d459 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -70,7 +70,14 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { InvalidArgument, out); - ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_realhbbf16_type(out), + 
InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); @@ -79,12 +86,12 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { using CTYPE_IN = typename torch::executor:: promote_types::type; ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { + ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { MulInner< can_cast::value, CTYPE_A, @@ -113,6 +120,9 @@ Tensor& mul_scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); ScalarType a_type = a.scalar_type(); @@ -123,15 +133,15 @@ Tensor& mul_scalar_out( ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); - if (common_type == ScalarType::Half) { + if (common_type == ScalarType::Half || common_type == ScalarType::BFloat16) { common_type = ScalarType::Float; } - ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE_A, [&]() { ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "mul.Scalar_out", CTYPE_B, [&]() { ET_SWITCH_REALB_TYPES( common_type, ctx, "mul.Scalar_out", CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES( + ET_SWITCH_REALHBBF16_TYPES( out_type, ctx, "mul.Scalar_out", CTYPE_OUT, [&]() { CTYPE_B b_val; utils::extract_scalar(b, &b_val); diff --git 
a/kernels/portable/cpu/op_native_batch_norm.cpp b/kernels/portable/cpu/op_native_batch_norm.cpp index 2e613c0a63..fceb8b24d9 100644 --- a/kernels/portable/cpu/op_native_batch_norm.cpp +++ b/kernels/portable/cpu/op_native_batch_norm.cpp @@ -73,6 +73,28 @@ std::tuple _native_batch_norm_legit_no_training_out( InvalidArgument, ret_val); + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, out, mean_out, invstd_out), + InvalidArgument, + ret_val); + + if (weight.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, weight.value()), + InvalidArgument, + ret_val); + } + + if (bias.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, bias.value()), + InvalidArgument, + ret_val); + } + size_t C_dim = in.dim() >= 1 ? 1 : 0; size_t C = in.size(C_dim); size_t outer = getLeadingDims(in, C_dim); diff --git a/kernels/portable/cpu/op_native_group_norm.cpp b/kernels/portable/cpu/op_native_group_norm.cpp index f9213fdeb1..b61f5be676 100644 --- a/kernels/portable/cpu/op_native_group_norm.cpp +++ b/kernels/portable/cpu/op_native_group_norm.cpp @@ -158,6 +158,31 @@ std::tuple native_group_norm_out( InvalidArgument, ret_val); + ET_KERNEL_CHECK( + ctx, tensor_is_default_dim_order(input), InvalidArgument, ret_val); + + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, out, mean_out, rstd_out), + InvalidArgument, + ret_val); + + if (weight.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, weight.value()), + InvalidArgument, + ret_val); + } + + if (bias.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, bias.value()), + InvalidArgument, + ret_val); + } + constexpr auto name = "native_group_norm.out"; ET_SWITCH_FLOAT_TYPES(input.scalar_type(), ctx, name, CTYPE, [&]() { diff --git a/kernels/portable/cpu/op_native_layer_norm.cpp b/kernels/portable/cpu/op_native_layer_norm.cpp index f10acda10e..711c747ca2 100644 --- a/kernels/portable/cpu/op_native_layer_norm.cpp +++ 
b/kernels/portable/cpu/op_native_layer_norm.cpp @@ -117,6 +117,33 @@ std::tuple native_layer_norm_out( InvalidArgument, ret_val); + // Only support default dim order for now. + // TODO: Support other dim orders. + ET_KERNEL_CHECK( + ctx, tensor_is_default_dim_order(input), InvalidArgument, ret_val); + + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, out, mean_out, rstd_out), + InvalidArgument, + ret_val); + + if (weight.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, weight.value()), + InvalidArgument, + ret_val); + } + + if (bias.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, bias.value()), + InvalidArgument, + ret_val); + } + Tensor::SizesType mean_rstd_sizes[kTensorDimensionLimit]; size_t mean_rstd_ndim = 0; get_layer_norm_out_target_size( diff --git a/kernels/portable/cpu/op_ne.cpp b/kernels/portable/cpu/op_ne.cpp index 5601fdafbd..2c25dc7029 100644 --- a/kernels/portable/cpu/op_ne.cpp +++ b/kernels/portable/cpu/op_ne.cpp @@ -30,6 +30,9 @@ Tensor& ne_tensor_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); @@ -75,6 +78,9 @@ Tensor& ne_scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); ScalarType out_type = out.scalar_type(); diff --git a/kernels/portable/cpu/op_neg.cpp b/kernels/portable/cpu/op_neg.cpp index 026d1009c4..b88cdb03a2 100644 --- a/kernels/portable/cpu/op_neg.cpp +++ b/kernels/portable/cpu/op_neg.cpp @@ -30,6 +30,9 @@ Tensor& neg_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { ET_KERNEL_CHECK( ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, 
tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "neg.out", CTYPE, [&] { apply_unary_map_fn( [](const CTYPE val_in) { return static_cast(-val_in); }, diff --git a/kernels/portable/cpu/op_pdist_forward.cpp b/kernels/portable/cpu/op_pdist_forward.cpp index 88b5e88194..9b06b880b6 100644 --- a/kernels/portable/cpu/op_pdist_forward.cpp +++ b/kernels/portable/cpu/op_pdist_forward.cpp @@ -24,6 +24,11 @@ Tensor& _pdist_forward_out( ET_KERNEL_CHECK(ctx, check_pdist_args(in, p, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + Tensor::SizesType target_sizes[kTensorDimensionLimit]; size_t target_ndim = 0; get_pdist_out_target_size(in, target_sizes, &target_ndim); diff --git a/kernels/portable/cpu/op_permute_copy.cpp b/kernels/portable/cpu/op_permute_copy.cpp index e7df5c9657..1362b57c00 100644 --- a/kernels/portable/cpu/op_permute_copy.cpp +++ b/kernels/portable/cpu/op_permute_copy.cpp @@ -46,6 +46,9 @@ Tensor& permute_copy_out( ET_KERNEL_CHECK( ctx, check_permute_copy_args(in, dims, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; size_t expected_out_dim = 0; get_permute_copy_out_target_size( diff --git a/kernels/portable/cpu/op_pixel_shuffle.cpp b/kernels/portable/cpu/op_pixel_shuffle.cpp index 104348f3fe..e1e459b1b2 100644 --- a/kernels/portable/cpu/op_pixel_shuffle.cpp +++ b/kernels/portable/cpu/op_pixel_shuffle.cpp @@ -72,6 +72,10 @@ Tensor& pixel_shuffle_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); Tensor::SizesType expected_out_size[kTensorDimensionLimit]; size_t expected_out_dim = 0; 
get_pixel_shuffle_out_target_size( diff --git a/kernels/portable/cpu/op_to_copy.cpp b/kernels/portable/cpu/op_to_copy.cpp index 7ecd4f3b5e..c0c04e65e9 100644 --- a/kernels/portable/cpu/op_to_copy.cpp +++ b/kernels/portable/cpu/op_to_copy.cpp @@ -46,10 +46,11 @@ Tensor& to_copy_out( InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(self.scalar_type(), ctx, "to_copy", CTYPE_IN, [&] { - ET_SWITCH_REALHB_TYPES(out.scalar_type(), ctx, "to_copy", CTYPE_OUT, [&] { - _to_impl(self, out); - }); + ET_SWITCH_REALHBBF16_TYPES(self.scalar_type(), ctx, "to_copy", CTYPE_IN, [&] { + ET_SWITCH_REALHBBF16_TYPES( + out.scalar_type(), ctx, "to_copy", CTYPE_OUT, [&] { + _to_impl(self, out); + }); }); return out; diff --git a/kernels/portable/cpu/scalar_utils.h b/kernels/portable/cpu/scalar_utils.h index 3daf3e7252..3d6dfb75e4 100644 --- a/kernels/portable/cpu/scalar_utils.h +++ b/kernels/portable/cpu/scalar_utils.h @@ -94,12 +94,6 @@ struct promote_type_with_scalar_type { static_assert( !is_bits_type::value, "promote_type_with_scalar_type not valid for bits dtypes"); - static_assert( - !std::is_same< - T1, - typename ScalarTypeToCppType::type>:: - value, - "promote_type_with_scalar_type not valid for BFloat16"); using promote_type_with_scalar_type_not_respecting_half_to_float = typename std::conditional< is_complex_type::value || @@ -119,10 +113,14 @@ struct promote_type_with_scalar_type { public: using type = typename std::conditional< half_to_float && - std::is_same< - promote_type_with_scalar_type_not_respecting_half_to_float, - typename ScalarTypeToCppType::type>:: - value, + (std::is_same< + promote_type_with_scalar_type_not_respecting_half_to_float, + typename ScalarTypeToCppType< + exec_aten::ScalarType::Half>::type>::value || + std::is_same< + promote_type_with_scalar_type_not_respecting_half_to_float, + typename ScalarTypeToCppType< + exec_aten::ScalarType::BFloat16>::type>::value), typename ScalarTypeToCppType::type, 
promote_type_with_scalar_type_not_respecting_half_to_float>::type; }; diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index 32b69352ef..41a8656f96 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -72,7 +72,7 @@ class OpMulOutTest : public OperatorTest { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_mul_enumerate_out_types(); - ET_FORALL_REAL_TYPES_AND(Half, ENUMERATE_TEST_ENTRY) + ET_FORALL_REALHBF16_TYPES(ENUMERATE_TEST_ENTRY) #undef ENUMERATE_TEST_ENTRY } @@ -89,29 +89,99 @@ class OpMulOutTest : public OperatorTest { // Multiply two tensors op_mul_out( - tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}), tf.ones(sizes), out); - EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8})); + tf.make(sizes, /*data=*/{1.25, 2.5, 4.75, 8.875}), tf.ones(sizes), out); + EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{1.25, 2.5, 4.75, 8.875})); op_mul_out( tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}), tf.zeros(sizes), out); EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{0.0, 0.0, 0.0, 0.0})); op_mul_out( - tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}), - tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}), + tf.make(sizes, /*data=*/{1.25, 2.5, 4.75, 8.875}), + tf.make(sizes, /*data=*/{1.25, 2.5, 4.75, 8.875}), out); EXPECT_TENSOR_CLOSE( - out, tf.make(sizes, /*data=*/{1.21, 4.84, 19.36, 77.44})); + out, tf.make(sizes, /*data=*/{1.5625, 6.25, 22.5625, 78.765625})); } void test_mul_enumerate_a_types() { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_mul_enumerate_b_types(); - ET_FORALL_REAL_TYPES_AND(Half, ENUMERATE_TEST_ENTRY) + ET_FORALL_REALHBF16_TYPES(ENUMERATE_TEST_ENTRY) #undef ENUMERATE_TEST_ENTRY } + + template + void test_optimized_path_ignores_leading_1_dimensions() { + TensorFactory tf; + + const std::vector sizes1 = {1, 1, 2, 2}; + const std::vector sizes2 = {1, 2, 2}; + + // Destination for the mul. 
+ Tensor out = tf.zeros(sizes1); + + // Multiply two tensors + op_mul_out( + tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8}), tf.ones(sizes2), out); + EXPECT_TENSOR_CLOSE(out, tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8})); + } + + template + void test_broadcast_a2b() { + TensorFactory tf_a; + + std::vector> b_sizeses = { + {2}, + {1, 2}, + }; + for (const auto& b_sizes : b_sizeses) { + // a and b of different shapes + Tensor a = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); + Tensor b = tf_a.make(b_sizes, /*data=*/{2, 2}); + + // Destination for output of mul. + Tensor out = tf_a.zeros({2, 2}); + + // Check that it matches the expected output. + EXPECT_TENSOR_CLOSE( + op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); + } + } + + template + void test_broadcast_b2a() { + TensorFactory tf_a; + // a and b of different shapes + Tensor a = tf_a.make({2}, /*data=*/{2, 2}); + Tensor b = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); + + // Destination for output of mul. + Tensor out = tf_a.zeros({2, 2}); + + // Check that it matches the expected output. + EXPECT_TENSOR_CLOSE( + op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); + } + + template + void test_scalar_input_broadcast() { + TensorFactory tf_a; + + // a is a 1d tensor and b is a scalar + Tensor a = tf_a.make({2}, /*data=*/{2, 2}); + Tensor b = tf_a.make({}, /*data=*/{2}); + + // Destination for output of mul. + Tensor out = tf_a.make({2}, /*data=*/{2, 2}); + Tensor expected = tf_a.make({2}, /*data=*/{4, 4}); + + // Check that it matches the expected output. 
+ EXPECT_TENSOR_CLOSE(op_mul_out(a, b, out), expected); + EXPECT_TENSOR_CLOSE(op_mul_out(b, a, out), expected); + } }; class OpMulScalarOutTest : public OperatorTest { @@ -141,6 +211,14 @@ TEST_F(OpMulOutTest, DoubleTensors) { test_floating_point_mul_out(); } +TEST_F(OpMulOutTest, HalfTensors) { + test_floating_point_mul_out(); +} + +TEST_F(OpMulOutTest, BFloat16Tensors) { + test_floating_point_mul_out(); +} + TEST_F(OpMulOutTest, BoolTensors) { TensorFactory tf; @@ -166,18 +244,12 @@ TEST_F(OpMulOutTest, BoolTensors) { } TEST_F(OpMulOutTest, OptimizedPathIgnoresLeading1Dimensions) { - TensorFactory tf; +#define ENUMERATE_TEST_ENTRY(ctype, dtype) \ + test_optimized_path_ignores_leading_1_dimensions(); - const std::vector sizes1 = {1, 1, 2, 2}; - const std::vector sizes2 = {1, 2, 2}; + ET_FORALL_FLOATHBF16_TYPES(ENUMERATE_TEST_ENTRY); - // Destination for the mul. - Tensor out = tf.zeros(sizes1); - - // Multiply two tensors - op_mul_out( - tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8}), tf.ones(sizes2), out); - EXPECT_TENSOR_CLOSE(out, tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8})); +#undef ENUMERATE_TEST_ENTRY } // Mismatched shape tests. @@ -202,40 +274,16 @@ TEST_F(OpMulOutTest, MismatchedNonBroadcastableInputShapesDies) { // Broadcast tensor b's size to tensor a's size TEST_F(OpMulOutTest, BroadcastA2BTest) { - TensorFactory tf_a; - - std::vector> b_sizeses = { - {2}, - {1, 2}, - }; - for (const auto& b_sizes : b_sizeses) { - // a and b of different shapes - Tensor a = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); - Tensor b = tf_a.make(b_sizes, /*data=*/{2, 2}); - - // Destination for output of mul. - Tensor out = tf_a.zeros({2, 2}); - - // Check that it matches the expected output. 
- EXPECT_TENSOR_CLOSE( - op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); - } + test_broadcast_a2b(); + test_broadcast_a2b(); + test_broadcast_a2b(); } // Broadcast tensor a's size to tensor b's size TEST_F(OpMulOutTest, BroadcastB2ATest) { - TensorFactory tf_a; - - // a and b of different shapes - Tensor a = tf_a.make({2}, /*data=*/{2, 2}); - Tensor b = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); - - // Destination for output of mul. - Tensor out = tf_a.zeros({2, 2}); - - // Check that it matches the expected output. - EXPECT_TENSOR_CLOSE( - op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); + test_broadcast_b2a(); + test_broadcast_b2a(); + test_broadcast_b2a(); } // Broadcast tensor a and b's size to a new size c. @@ -256,19 +304,9 @@ TEST_F(OpMulOutTest, BroadcastAB2CTest) { } TEST_F(OpMulOutTest, ScalarInputBroadcastTest) { - TensorFactory tf_a; - - // a is a 1d tensor and b is a scalar - Tensor a = tf_a.make({2}, /*data=*/{2, 2}); - Tensor b = tf_a.make({}, /*data=*/{2}); - - // Destination for output of mul. - Tensor out = tf_a.make({2}, /*data=*/{2, 2}); - Tensor expected = tf_a.make({2}, /*data=*/{4, 4}); - - // Check that it matches the expected output. 
- EXPECT_TENSOR_CLOSE(op_mul_out(a, b, out), expected); - EXPECT_TENSOR_CLOSE(op_mul_out(b, a, out), expected); + test_scalar_input_broadcast(); + test_scalar_input_broadcast(); + test_scalar_input_broadcast(); } TEST_F(OpMulOutTest, MismatchedOutputShapesDies) { diff --git a/kernels/test/op_to_copy_test.cpp b/kernels/test/op_to_copy_test.cpp index 1cc892dedb..0a6529e736 100644 --- a/kernels/test/op_to_copy_test.cpp +++ b/kernels/test/op_to_copy_test.cpp @@ -36,7 +36,9 @@ typedef std::map< std::type_index, std::variant< std::vector, - std::vector>> + std::vector, + std::vector, + std::vector>> FloatingTypeToDataMap; typedef std::map< @@ -309,9 +311,9 @@ TEST_F(OpToTest, AllDtypesSupported) { ScalarType::OUTPUT_DTYPE>(test_cases); #define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ - ET_FORALL_REAL_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); + ET_FORALL_REALHBF16_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); - ET_FORALL_REAL_TYPES(TEST_ENTRY); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); #undef TEST_ENTRY #undef TEST_KERNEL @@ -323,14 +325,14 @@ TEST_F(OpToTest, BoolTests) { #define TEST_TO_BOOL(INPUT_CTYPE, INPUT_DTYPE) \ test_runner_to_bool( \ test_case_to_bool, result_to_bool); - ET_FORALL_REAL_TYPES(TEST_TO_BOOL); + ET_FORALL_REALHBF16_TYPES(TEST_TO_BOOL); std::vector test_case_from_bool = {true, true, false}; std::vector result_from_bool = {1.0, 1.0, 0}; #define TEST_FROM_BOOL(OUTPUT_CTYPE, OUTPUT_DTYPE) \ test_runner_from_bool( \ test_case_from_bool, result_from_bool); - ET_FORALL_REAL_TYPES(TEST_FROM_BOOL); + ET_FORALL_REALHBF16_TYPES(TEST_FROM_BOOL); } TEST_F(OpToTest, NanInfSupported) { @@ -349,9 +351,9 @@ TEST_F(OpToTest, NanInfSupported) { ScalarType::OUTPUT_DTYPE>(test_cases); #define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ - ET_FORALL_FLOAT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); + ET_FORALL_FLOATHBF16_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); - ET_FORALL_FLOAT_TYPES(TEST_ENTRY); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); #undef 
TEST_ENTRY #undef TEST_KERNEL @@ -381,6 +383,13 @@ TEST_F(OpToTest, HardcodeFloatConvertInt) { -0.30919688936285893988}; // clang-format on + std::vector half_data; + std::vector bf16_data; + for (auto d : double_data) { + half_data.emplace_back(d); + bf16_data.emplace_back(d); + } + std::vector int64_data = { -1, -4, 2, -2, 3, 3, -3, -4, 3, 3, 0, 2, 0, -1, 0}; std::vector int32_data = { @@ -394,6 +403,8 @@ TEST_F(OpToTest, HardcodeFloatConvertInt) { FloatingTypeToDataMap floating_point_data; floating_point_data[typeid(float)] = float_data; floating_point_data[typeid(double)] = double_data; + floating_point_data[typeid(exec_aten::Half)] = half_data; + floating_point_data[typeid(exec_aten::BFloat16)] = bf16_data; // Gathering all int data together for better traversial IntTypeToDataMap int_data; @@ -412,7 +423,7 @@ TEST_F(OpToTest, HardcodeFloatConvertInt) { #define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ ET_FORALL_INT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); - ET_FORALL_FLOAT_TYPES(TEST_ENTRY); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); } TEST_F(OpToTest, MismatchedSizesDie) { diff --git a/runtime/core/exec_aten/exec_aten.h b/runtime/core/exec_aten/exec_aten.h index 919b5420b3..808d31502a 100644 --- a/runtime/core/exec_aten/exec_aten.h +++ b/runtime/core/exec_aten/exec_aten.h @@ -17,6 +17,7 @@ #include // @manual #include // @manual #include // @manual +#include // @manual #include // @manual #include // @manual #include // @manual @@ -31,6 +32,7 @@ #else // use executor #include // @manual #include // @manual +#include // @manual #include // @manual #include // @manual #include // @manual diff --git a/runtime/core/exec_aten/testing_util/tensor_util.cpp b/runtime/core/exec_aten/testing_util/tensor_util.cpp index 03dffd208f..0301cc9a51 100644 --- a/runtime/core/exec_aten/testing_util/tensor_util.cpp +++ b/runtime/core/exec_aten/testing_util/tensor_util.cpp @@ -16,6 +16,8 @@ #include #include +using exec_aten::BFloat16; +using exec_aten::Half; using 
exec_aten::ScalarType; using exec_aten::Tensor; @@ -32,9 +34,7 @@ namespace { * T must be a floating point type. Non-floating point data should be compared * directly. */ -template < - typename T, - typename = std::enable_if_t::value>> +template bool data_is_close( const T* a, const T* b, @@ -119,6 +119,20 @@ bool tensors_are_close( a.numel(), rtol, atol); + } else if (a.scalar_type() == ScalarType::Half) { + return data_is_close( + a.const_data_ptr(), + b.const_data_ptr(), + a.numel(), + rtol, + atol); + } else if (a.scalar_type() == ScalarType::BFloat16) { + return data_is_close( + a.const_data_ptr(), + b.const_data_ptr(), + a.numel(), + rtol, + atol); } else { // Non-floating-point types can be compared bitwise. return memcmp(a.const_data_ptr(), b.const_data_ptr(), a.nbytes()) == 0; @@ -269,7 +283,7 @@ std::ostream& operator<<(std::ostream& os, const Tensor& t) { break; switch (t.scalar_type()) { - ET_FORALL_REAL_TYPES_AND2(Half, Bool, PRINT_CASE) + ET_FORALL_REAL_TYPES_AND3(Half, Bool, BFloat16, PRINT_CASE) default: ET_CHECK_MSG( false, diff --git a/runtime/core/exec_aten/util/genScalarTypeTable.py b/runtime/core/exec_aten/util/genScalarTypeTable.py index 07100472ae..c2bc84c270 100644 --- a/runtime/core/exec_aten/util/genScalarTypeTable.py +++ b/runtime/core/exec_aten/util/genScalarTypeTable.py @@ -4,20 +4,35 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-indexToType = ["U1", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "B1"] +indexToType = [ + "U1", + "I1", + "I2", + "I4", + "I8", + "F2", + "F4", + "F8", + "C2", + "C4", + "C8", + "B1", + "BF", +] promoteTypesLookup = [ - ["U1", "I2", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "U1"], - ["I2", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I1"], - ["I2", "I2", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I2"], - ["I4", "I4", "I4", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I4"], - ["I8", "I8", "I8", "I8", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I8"], - ["F2", "F2", "F2", "F2", "F2", "F2", "F4", "F8", "C2", "C4", "C8", "F2"], - ["F4", "F4", "F4", "F4", "F4", "F4", "F4", "F8", "C4", "C4", "C8", "F4"], - ["F8", "F8", "F8", "F8", "F8", "F8", "F8", "F8", "C8", "C8", "C8", "F8"], - ["C2", "C2", "C2", "C2", "C2", "C2", "C4", "C8", "C2", "C4", "C8", "C2"], - ["C4", "C4", "C4", "C4", "C4", "C4", "C4", "C8", "C4", "C4", "C8", "C4"], - ["C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8"], - ["U1", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "B1"], + ["U1", "I2", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "U1", "BF"], + ["I2", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I1", "BF"], + ["I2", "I2", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I2", "BF"], + ["I4", "I4", "I4", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I4", "BF"], + ["I8", "I8", "I8", "I8", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I8", "BF"], + ["F2", "F2", "F2", "F2", "F2", "F2", "F4", "F8", "C2", "C4", "C8", "F2", "F4"], + ["F4", "F4", "F4", "F4", "F4", "F4", "F4", "F8", "C4", "C4", "C8", "F4", "F4"], + ["F8", "F8", "F8", "F8", "F8", "F8", "F8", "F8", "C8", "C8", "C8", "F8", "F8"], + ["C2", "C2", "C2", "C2", "C2", "C2", "C4", "C8", "C2", "C4", "C8", "C2", "C4"], + ["C4", "C4", "C4", "C4", "C4", "C4", "C4", "C8", "C4", "C4", "C8", "C4", "C4"], + ["C8", "C8", "C8", "C8", "C8", 
"C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8"], + ["U1", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "B1", "BF"], + ["BF", "BF", "BF", "BF", "BF", "F4", "F4", "F8", "C4", "C4", "C8", "BF", "BF"], ] for rowIndex, row in enumerate(promoteTypesLookup): for colIndex, col in enumerate(row): diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h index c92f910431..479767b4ab 100644 --- a/runtime/core/exec_aten/util/scalar_type_util.h +++ b/runtime/core/exec_aten/util/scalar_type_util.h @@ -21,6 +21,7 @@ #pragma once +#include #include #include #include @@ -164,8 +165,21 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) ::exec_aten::ScalarType::SCALARTYPE>::type, \ SCALARTYPE) +#define ET_FORALL_FLOAT_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \ + _(float, Float) \ + _(double, Double) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::exec_aten::ScalarType::SCALARTYPE1>::type, \ + SCALARTYPE1) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::exec_aten::ScalarType::SCALARTYPE2>::type, \ + SCALARTYPE2) + #define ET_FORALL_FLOATH_TYPES(_) ET_FORALL_FLOAT_TYPES_AND(Half, _) +#define ET_FORALL_FLOATHBF16_TYPES(_) \ + ET_FORALL_FLOAT_TYPES_AND2(Half, BFloat16, _) + // Here `ANOTHER_INPUT` should be another variable to be forwarded to a given // function. Not to be confused with another scalar type as in // `ET_FORALL_FLOAT_TYPES_AND`. 
@@ -177,6 +191,12 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) +#define ET_FORALL_FLOATHBF16_TYPES_WITH2(ANOTHER_INPUT1, ANOTHER_INPUT2, _) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::exec_aten::Half, Half) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::exec_aten::BFloat16, BFloat16) + // In this context, "REAL" means integer/float C types, which is why BFloat16 // and Half are not included. #define ET_FORALL_REAL_TYPES(_) \ @@ -209,6 +229,17 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) +#define ET_FORALL_REALHBF16_TYPES_WITH2(ANOTHER_INPUT1, ANOTHER_INPUT2, _) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, uint8_t, Byte) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, int8_t, Char) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, int16_t, Short) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, int32_t, Int) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, int64_t, Long) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, exec_aten::Half, Half) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, exec_aten::BFloat16, BFloat16) + // For macros that take `SCALARTYPEn` parameters, those parameters should be // an unquoted/unqualified enumerator name like `Int` or `Float`. 
#define ET_FORALL_REAL_TYPES_AND(SCALARTYPE, _) \ @@ -223,8 +254,29 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) ::exec_aten::ScalarType::SCALARTYPE>::type, \ SCALARTYPE) +#define ET_FORALL_REAL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int32_t, Int) \ + _(int64_t, Long) \ + _(float, Float) \ + _(double, Double) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::exec_aten::ScalarType::SCALARTYPE1>::type, \ + SCALARTYPE1) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::exec_aten::ScalarType::SCALARTYPE2>::type, \ + SCALARTYPE2) + #define ET_FORALL_REALH_TYPES(_) ET_FORALL_REAL_TYPES_AND(Half, _) +#define ET_FORALL_REALHBF16_TYPES(_) \ + ET_FORALL_REAL_TYPES_AND2(Half, BFloat16, _) + +#define ET_FORALL_REALHBBF16_TYPES(_) \ + ET_FORALL_REAL_TYPES_AND3(Bool, Half, BFloat16, _) + #define ET_FORALL_REAL_TYPES_AND_WITH(SCALARTYPE, ANOTHER_INPUT, _) \ _(ANOTHER_INPUT, uint8_t, Byte) \ _(ANOTHER_INPUT, int8_t, Char) \ @@ -381,6 +433,10 @@ inline bool isRealHBType(exec_aten::ScalarType t) { return (isRealHType(t) || t == exec_aten::ScalarType::Bool); } +inline bool isRealHBBF16Type(exec_aten::ScalarType t) { + return (isRealHBType(t) || t == exec_aten::ScalarType::BFloat16); +} + inline constexpr bool isComplexType(exec_aten::ScalarType t) { return ( t == exec_aten::ScalarType::ComplexHalf || @@ -589,6 +645,7 @@ using C4 = using C8 = typename ScalarTypeToCppType::type; using B1 = typename ScalarTypeToCppType::type; +using BF = typename ScalarTypeToCppType::type; #define TABLE_ENTRY(key1, key2, value) \ template <> \ @@ -613,6 +670,7 @@ TABLE_ENTRY(U1, C2, C2); TABLE_ENTRY(U1, C4, C4); TABLE_ENTRY(U1, C8, C8); TABLE_ENTRY(U1, B1, U1); +TABLE_ENTRY(U1, BF, BF); TABLE_ENTRY(I1, U1, I2); TABLE_ENTRY(I1, I1, I1); TABLE_ENTRY(I1, I2, I2); @@ -625,6 +683,7 @@ TABLE_ENTRY(I1, C2, C2); TABLE_ENTRY(I1, C4, C4); TABLE_ENTRY(I1, C8, C8); TABLE_ENTRY(I1, B1, I1); +TABLE_ENTRY(I1, BF, BF); 
TABLE_ENTRY(I2, U1, I2); TABLE_ENTRY(I2, I1, I2); TABLE_ENTRY(I2, I2, I2); @@ -637,6 +696,7 @@ TABLE_ENTRY(I2, C2, C2); TABLE_ENTRY(I2, C4, C4); TABLE_ENTRY(I2, C8, C8); TABLE_ENTRY(I2, B1, I2); +TABLE_ENTRY(I2, BF, BF); TABLE_ENTRY(I4, U1, I4); TABLE_ENTRY(I4, I1, I4); TABLE_ENTRY(I4, I2, I4); @@ -649,6 +709,7 @@ TABLE_ENTRY(I4, C2, C2); TABLE_ENTRY(I4, C4, C4); TABLE_ENTRY(I4, C8, C8); TABLE_ENTRY(I4, B1, I4); +TABLE_ENTRY(I4, BF, BF); TABLE_ENTRY(I8, U1, I8); TABLE_ENTRY(I8, I1, I8); TABLE_ENTRY(I8, I2, I8); @@ -661,6 +722,7 @@ TABLE_ENTRY(I8, C2, C2); TABLE_ENTRY(I8, C4, C4); TABLE_ENTRY(I8, C8, C8); TABLE_ENTRY(I8, B1, I8); +TABLE_ENTRY(I8, BF, BF); TABLE_ENTRY(F2, U1, F2); TABLE_ENTRY(F2, I1, F2); TABLE_ENTRY(F2, I2, F2); @@ -673,6 +735,7 @@ TABLE_ENTRY(F2, C2, C2); TABLE_ENTRY(F2, C4, C4); TABLE_ENTRY(F2, C8, C8); TABLE_ENTRY(F2, B1, F2); +TABLE_ENTRY(F2, BF, F4); TABLE_ENTRY(F4, U1, F4); TABLE_ENTRY(F4, I1, F4); TABLE_ENTRY(F4, I2, F4); @@ -685,6 +748,7 @@ TABLE_ENTRY(F4, C2, C4); TABLE_ENTRY(F4, C4, C4); TABLE_ENTRY(F4, C8, C8); TABLE_ENTRY(F4, B1, F4); +TABLE_ENTRY(F4, BF, F4); TABLE_ENTRY(F8, U1, F8); TABLE_ENTRY(F8, I1, F8); TABLE_ENTRY(F8, I2, F8); @@ -697,6 +761,7 @@ TABLE_ENTRY(F8, C2, C8); TABLE_ENTRY(F8, C4, C8); TABLE_ENTRY(F8, C8, C8); TABLE_ENTRY(F8, B1, F8); +TABLE_ENTRY(F8, BF, F8); TABLE_ENTRY(C2, U1, C2); TABLE_ENTRY(C2, I1, C2); TABLE_ENTRY(C2, I2, C2); @@ -709,6 +774,7 @@ TABLE_ENTRY(C2, C2, C2); TABLE_ENTRY(C2, C4, C4); TABLE_ENTRY(C2, C8, C8); TABLE_ENTRY(C2, B1, C2); +TABLE_ENTRY(C2, BF, C4); TABLE_ENTRY(C4, U1, C4); TABLE_ENTRY(C4, I1, C4); TABLE_ENTRY(C4, I2, C4); @@ -721,6 +787,7 @@ TABLE_ENTRY(C4, C2, C4); TABLE_ENTRY(C4, C4, C4); TABLE_ENTRY(C4, C8, C8); TABLE_ENTRY(C4, B1, C4); +TABLE_ENTRY(C4, BF, C4); TABLE_ENTRY(C8, U1, C8); TABLE_ENTRY(C8, I1, C8); TABLE_ENTRY(C8, I2, C8); @@ -733,6 +800,7 @@ TABLE_ENTRY(C8, C2, C8); TABLE_ENTRY(C8, C4, C8); TABLE_ENTRY(C8, C8, C8); TABLE_ENTRY(C8, B1, C8); +TABLE_ENTRY(C8, BF, C8); 
TABLE_ENTRY(B1, U1, U1); TABLE_ENTRY(B1, I1, I1); TABLE_ENTRY(B1, I2, I2); @@ -745,6 +813,20 @@ TABLE_ENTRY(B1, C2, C2); TABLE_ENTRY(B1, C4, C4); TABLE_ENTRY(B1, C8, C8); TABLE_ENTRY(B1, B1, B1); +TABLE_ENTRY(B1, BF, BF); +TABLE_ENTRY(BF, U1, BF); +TABLE_ENTRY(BF, I1, BF); +TABLE_ENTRY(BF, I2, BF); +TABLE_ENTRY(BF, I4, BF); +TABLE_ENTRY(BF, I8, BF); +TABLE_ENTRY(BF, F2, F4); +TABLE_ENTRY(BF, F4, F4); +TABLE_ENTRY(BF, F8, F8); +TABLE_ENTRY(BF, C2, C4); +TABLE_ENTRY(BF, C4, C4); +TABLE_ENTRY(BF, C8, C8); +TABLE_ENTRY(BF, B1, BF); +TABLE_ENTRY(BF, BF, BF); } // namespace internal @@ -760,26 +842,20 @@ struct promote_types { (!is_bits_type::value && !is_bits_type::value), "promote_types not valid for bits dtypes"); - static_assert( - !std::is_same< - T1, - typename ScalarTypeToCppType::type>:: - value && - !std::is_same< - T2, - typename ScalarTypeToCppType< - exec_aten::ScalarType::BFloat16>::type>::value, - "promote_types not valid for BFloat16"); using promoted_type_not_respecting_half_to_float = typename internal::promote_types_lookup::type; public: using type = typename std::conditional< half_to_float && - std::is_same< - promoted_type_not_respecting_half_to_float, - typename ScalarTypeToCppType::type>:: - value, + (std::is_same< + promoted_type_not_respecting_half_to_float, + typename ScalarTypeToCppType< + exec_aten::ScalarType::Half>::type>::value || + std::is_same< + promoted_type_not_respecting_half_to_float, + typename ScalarTypeToCppType< + exec_aten::ScalarType::BFloat16>::type>::value), typename ScalarTypeToCppType::type, promoted_type_not_respecting_half_to_float>::type; }; @@ -787,7 +863,8 @@ struct promote_types { /** * Implements type promotion rules that are consistent with ATen behaviour, * which in turn is consistent with NumPy's promote_types. 
- * If half_to_float is set to true, then half will be promoted to float instead + * If half_to_float is set to true, then half and bfloat16 will be promoted to + * float instead */ inline exec_aten::ScalarType promoteTypes( exec_aten::ScalarType a, @@ -806,6 +883,7 @@ inline exec_aten::ScalarType promoteTypes( constexpr auto c4 = exec_aten::ScalarType::ComplexFloat; constexpr auto c8 = exec_aten::ScalarType::ComplexDouble; constexpr auto b1 = exec_aten::ScalarType::Bool; + constexpr auto bf = exec_aten::ScalarType::BFloat16; // For QInt types, only allow exact match if (executorch::runtime::isQIntType(a) && a == b) { @@ -825,34 +903,41 @@ inline exec_aten::ScalarType promoteTypes( ET_CHECK_MSG(false, "promoteTypes not valid for bits dtypes"); } - ET_CHECK_MSG( - a != exec_aten::ScalarType::BFloat16 && - b != exec_aten::ScalarType::BFloat16, - "promoteTypes not valid for BFloat16"); // 12 types are handled by this function, see the constexpr definitions above - const int NUM_PROMOTE_TYPES = 12; - + const int NUM_PROMOTE_TYPES = 13; + + static constexpr std::array + dtype2index = {{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + -1, -1, -1, 12, -1, -1, -1, -1, -1, -1, -1, -1, + }}; + auto ix_a = dtype2index[(int)a]; + ET_CHECK(ix_a != -1); + auto ix_b = dtype2index[(int)b]; + ET_CHECK(ix_b != -1); static constexpr exec_aten::ScalarType _promoteTypesLookup[NUM_PROMOTE_TYPES][NUM_PROMOTE_TYPES] = { - /* u1 i1 i2 i4 i8 f2 f4 f8 c2 c4 c8 b1 */ - /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1}, - /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, i1}, - /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, i2}, - /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, c2, c4, c8, i4}, - /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, c2, c4, c8, i8}, - /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, c2, c4, c8, f2}, - /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, c4, c4, c8, f4}, - /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, c8, c8, c8, f8}, - /* c2 */ {c2, c2, c2, c2, c2, c2, c4, c8, c2, c4, 
c8, c2}, - /* c4 */ {c4, c4, c4, c4, c4, c4, c4, c8, c4, c4, c8, c4}, - /* c8 */ {c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8}, - /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1}, + /* u1 i1 i2 i4 i8 f2 f4 f8 c2 c4 c8 b1 bf*/ + /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1, bf}, + /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, i1, bf}, + /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, i2, bf}, + /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, c2, c4, c8, i4, bf}, + /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, c2, c4, c8, i8, bf}, + /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, c2, c4, c8, f2, f4}, + /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, c4, c4, c8, f4, f4}, + /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, c8, c8, c8, f8, f8}, + /* c2 */ {c2, c2, c2, c2, c2, c2, c4, c8, c2, c4, c8, c2, c4}, + /* c4 */ {c4, c4, c4, c4, c4, c4, c4, c8, c4, c4, c8, c4, c4}, + /* c8 */ {c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8}, + /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1, bf}, + /* bf */ {bf, bf, bf, bf, bf, f4, f4, f8, c4, c4, c8, bf, bf}, }; - exec_aten::ScalarType promoted_type = - _promoteTypesLookup[static_cast(a)][static_cast(b)]; + exec_aten::ScalarType promoted_type = _promoteTypesLookup[ix_a][ix_b]; - if (half_to_float && promoted_type == exec_aten::ScalarType::Half) { + if (half_to_float && + (promoted_type == exec_aten::ScalarType::Half || + promoted_type == exec_aten::ScalarType::BFloat16)) { promoted_type = exec_aten::ScalarType::Float; } @@ -974,6 +1059,13 @@ inline exec_aten::ScalarType promoteTypes( ET_INTERNAL_SWITCH_CASE( \ exec_aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) +#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ + ADDITIONAL1, ADDITIONAL2, ADDITIONAL3, CTYPE_ALIAS, ...) 
\ + ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND2( \ + ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + exec_aten::ScalarType::ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__) + #define ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ exec_aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ @@ -1001,6 +1093,13 @@ inline exec_aten::ScalarType promoteTypes( ET_INTERNAL_SWITCH_CASE( \ exec_aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__) +#define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND2( \ + ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND( \ + ADDITIONAL1, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + exec_aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) + #define ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ exec_aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__) \ @@ -1112,6 +1211,22 @@ inline exec_aten::ScalarType promoteTypes( ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND2( \ ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__)) +#define ET_SWITCH_REAL_TYPES_AND3( \ + ADDITIONAL1, \ + ADDITIONAL2, \ + ADDITIONAL3, \ + TYPE, \ + CONTEXT, \ + NAME, \ + CTYPE_ALIAS, \ + ...) \ + ET_INTERNAL_SWITCH( \ + TYPE, \ + CONTEXT, \ + NAME, \ + ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ + ADDITIONAL1, ADDITIONAL2, ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__)) + #define ET_SWITCH_REALH_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ ET_SWITCH_REAL_TYPES_AND(Half, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) @@ -1122,6 +1237,10 @@ inline exec_aten::ScalarType promoteTypes( ET_SWITCH_REAL_TYPES_AND2( \ Half, Bool, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) +#define ET_SWITCH_REALHBBF16_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ + ET_SWITCH_REAL_TYPES_AND3( \ + Half, Bool, BFloat16, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) + #define ET_SWITCH_INT_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) 
\ ET_INTERNAL_SWITCH( \ TYPE, \ @@ -1154,9 +1273,22 @@ inline exec_aten::ScalarType promoteTypes( ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND( \ ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__)) +#define ET_SWITCH_FLOAT_TYPES_AND2( \ + ADDITIONAL1, ADDITIONAL2, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH( \ + TYPE, \ + CONTEXT, \ + NAME, \ + ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND2( \ + ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__)) + #define ET_SWITCH_FLOATH_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ ET_SWITCH_FLOAT_TYPES_AND(Half, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) +#define ET_SWITCH_FLOATHBF16_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ + ET_SWITCH_FLOAT_TYPES_AND2( \ + Half, BFloat16, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) + #define ET_SWITCH_QINT_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH( \ TYPE, \ diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h index cadb5ecd9a..630f0cdb4a 100644 --- a/runtime/core/exec_aten/util/tensor_util.h +++ b/runtime/core/exec_aten/util/tensor_util.h @@ -516,6 +516,15 @@ inline bool tensor_is_realhb_type(exec_aten::Tensor t) { return true; } +inline bool tensor_is_realhbbf16_type(exec_aten::Tensor t) { + ET_LOG_MSG_AND_RETURN_IF_FALSE( + executorch::runtime::isRealHBBF16Type(t.scalar_type()), + "Expected to find a real type, but tensor has type %s", + torch::executor::toString(t.scalar_type())); + + return true; +} + inline bool tensor_is_complex_type(exec_aten::Tensor t) { ET_LOG_MSG_AND_RETURN_IF_FALSE( torch::executor::isComplexType(t.scalar_type()), diff --git a/runtime/core/exec_aten/util/test/scalar_type_util_test.cpp b/runtime/core/exec_aten/util/test/scalar_type_util_test.cpp index b91c7009f4..9df01b7be9 100644 --- a/runtime/core/exec_aten/util/test/scalar_type_util_test.cpp +++ b/runtime/core/exec_aten/util/test/scalar_type_util_test.cpp @@ -139,37 +139,38 @@ TEST(ScalarTypeUtilTest, promoteTypesTest) { // Check some 
common cases - ET_CHECK( - promoteTypes(ScalarType::Float, ScalarType::Double) == - ScalarType::Double); - ET_CHECK( - promoteTypes(ScalarType::Float, ScalarType::Short) == ScalarType::Float); - - ET_CHECK( - promoteTypes(ScalarType::Float, ScalarType::Int) == ScalarType::Float); - ET_CHECK( - promoteTypes(ScalarType::Long, ScalarType::Float) == ScalarType::Float); - - ET_CHECK( - promoteTypes(ScalarType::Bool, ScalarType::Bool) == ScalarType::Bool); - - ET_CHECK(promoteTypes(ScalarType::Byte, ScalarType::Int) == ScalarType::Int); - ET_CHECK( - promoteTypes(ScalarType::Char, ScalarType::Bool) == ScalarType::Char); - ET_CHECK(promoteTypes(ScalarType::Bool, ScalarType::Int) == ScalarType::Int); + EXPECT_EQ( + promoteTypes(ScalarType::Float, ScalarType::Double), ScalarType::Double); + EXPECT_EQ( + promoteTypes(ScalarType::Float, ScalarType::Short), ScalarType::Float); + + EXPECT_EQ( + promoteTypes(ScalarType::Float, ScalarType::Int), ScalarType::Float); + EXPECT_EQ( + promoteTypes(ScalarType::Long, ScalarType::Float), ScalarType::Float); + + EXPECT_EQ(promoteTypes(ScalarType::Bool, ScalarType::Bool), ScalarType::Bool); + + EXPECT_EQ(promoteTypes(ScalarType::Byte, ScalarType::Int), ScalarType::Int); + EXPECT_EQ(promoteTypes(ScalarType::Char, ScalarType::Bool), ScalarType::Char); + EXPECT_EQ(promoteTypes(ScalarType::Bool, ScalarType::Int), ScalarType::Int); + + EXPECT_EQ( + promoteTypes(ScalarType::BFloat16, ScalarType::Half), ScalarType::Float); + EXPECT_EQ( + promoteTypes(ScalarType::BFloat16, ScalarType::Bool), + ScalarType::BFloat16); } template struct promote_types_is_valid : std::integral_constant< bool, - !std::is_same::value && - !std::is_same::value && - (std::is_same::value || - (!executorch::runtime::is_qint_type::value && - !executorch::runtime::is_qint_type::value && - !executorch::runtime::is_bits_type::value && - !executorch::runtime::is_bits_type::value))> {}; + (std::is_same::value || + (!executorch::runtime::is_qint_type::value && + 
!executorch::runtime::is_qint_type::value && + !executorch::runtime::is_bits_type::value && + !executorch::runtime::is_bits_type::value))> {}; template struct CompileTimePromoteTypesTestCase { @@ -195,7 +196,8 @@ struct CompileTimePromoteTypesTestCase { auto expected = executorch::runtime::promoteTypes( scalarType1, scalarType2, half_to_float); EXPECT_EQ(actual, expected) - << "promoting " << (int)scalarType1 << " to " << (int)scalarType2; + << "promoting " << (int)scalarType1 << " to " << (int)scalarType2 + << " (half to float: " << half_to_float << ')'; } template < diff --git a/runtime/core/portable_type/bfloat16.h b/runtime/core/portable_type/bfloat16.h index a1ceb0c56a..e665e6152e 100644 --- a/runtime/core/portable_type/bfloat16.h +++ b/runtime/core/portable_type/bfloat16.h @@ -8,11 +8,41 @@ #pragma once +#include #include +#include +#include +#include namespace torch { namespace executor { +namespace internal { +inline float f32_from_bits(uint16_t src) { + float res = 0; + uint32_t tmp = src; + tmp <<= 16; + std::memcpy(&res, &tmp, sizeof(tmp)); + return res; +} + +inline uint16_t bits_from_f32(float src) { + uint32_t res = 0; + std::memcpy(&res, &src, sizeof(res)); + return res >> 16; +} + +inline uint16_t round_to_nearest_even(float src) { + if (std::isnan(src)) { + return UINT16_C(0x7FC0); + } + uint32_t U32 = 0; + std::memcpy(&U32, &src, sizeof(U32)); + uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF); + return static_cast((U32 + rounding_bias) >> 16); +} +} // namespace internal + /** * The "brain floating-point" type, compatible with c10/util/BFloat16.h from * pytorch core. 
@@ -22,7 +52,288 @@ namespace executor { */ struct alignas(2) BFloat16 { uint16_t x; + + BFloat16() = default; + struct from_bits_t {}; + static constexpr from_bits_t from_bits() { + return from_bits_t(); + } + + constexpr BFloat16(unsigned short bits, from_bits_t) : x(bits) {} + /* implicit */ BFloat16(float value) + : x(internal::round_to_nearest_even(value)) {} + operator float() const { + return internal::f32_from_bits(x); + } }; +inline std::ostream& operator<<(std::ostream& out, const BFloat16& value) { + out << (float)value; + return out; +} + +/// Arithmetic + +inline BFloat16 operator+(const BFloat16& a, const BFloat16& b) { + return static_cast(a) + static_cast(b); +} + +inline BFloat16 operator-(const BFloat16& a, const BFloat16& b) { + return static_cast(a) - static_cast(b); +} + +inline BFloat16 operator*(const BFloat16& a, const BFloat16& b) { + return static_cast(a) * static_cast(b); +} + +inline BFloat16 operator/(const BFloat16& a, const BFloat16& b) { + return static_cast(a) / static_cast(b); +} + +inline BFloat16 operator-(const BFloat16& a) { + return -static_cast(a); +} + +inline BFloat16& operator+=(BFloat16& a, const BFloat16& b) { + a = a + b; + return a; +} + +inline BFloat16& operator-=(BFloat16& a, const BFloat16& b) { + a = a - b; + return a; +} + +inline BFloat16& operator*=(BFloat16& a, const BFloat16& b) { + a = a * b; + return a; +} + +inline BFloat16& operator/=(BFloat16& a, const BFloat16& b) { + a = a / b; + return a; +} + +inline BFloat16& operator|(BFloat16& a, const BFloat16& b) { + a.x = a.x | b.x; + return a; +} + +inline BFloat16& operator^(BFloat16& a, const BFloat16& b) { + a.x = a.x ^ b.x; + return a; +} + +inline BFloat16& operator&(BFloat16& a, const BFloat16& b) { + a.x = a.x & b.x; + return a; +} + +/// Arithmetic with floats + +inline float operator+(BFloat16 a, float b) { + return static_cast(a) + b; +} +inline float operator-(BFloat16 a, float b) { + return static_cast(a) - b; +} +inline float operator*(BFloat16 a, 
float b) { + return static_cast(a) * b; +} +inline float operator/(BFloat16 a, float b) { + return static_cast(a) / b; +} + +inline float operator+(float a, BFloat16 b) { + return a + static_cast(b); +} +inline float operator-(float a, BFloat16 b) { + return a - static_cast(b); +} +inline float operator*(float a, BFloat16 b) { + return a * static_cast(b); +} +inline float operator/(float a, BFloat16 b) { + return a / static_cast(b); +} + +inline float& operator+=(float& a, const BFloat16& b) { + return a += static_cast(b); +} +inline float& operator-=(float& a, const BFloat16& b) { + return a -= static_cast(b); +} +inline float& operator*=(float& a, const BFloat16& b) { + return a *= static_cast(b); +} +inline float& operator/=(float& a, const BFloat16& b) { + return a /= static_cast(b); +} + +/// Arithmetic with doubles + +inline double operator+(BFloat16 a, double b) { + return static_cast(a) + b; +} +inline double operator-(BFloat16 a, double b) { + return static_cast(a) - b; +} +inline double operator*(BFloat16 a, double b) { + return static_cast(a) * b; +} +inline double operator/(BFloat16 a, double b) { + return static_cast(a) / b; +} + +inline double operator+(double a, BFloat16 b) { + return a + static_cast(b); +} +inline double operator-(double a, BFloat16 b) { + return a - static_cast(b); +} +inline double operator*(double a, BFloat16 b) { + return a * static_cast(b); +} +inline double operator/(double a, BFloat16 b) { + return a / static_cast(b); +} + +/// Arithmetic with ints + +inline BFloat16 operator+(BFloat16 a, int b) { + return a + static_cast(b); +} +inline BFloat16 operator-(BFloat16 a, int b) { + return a - static_cast(b); +} +inline BFloat16 operator*(BFloat16 a, int b) { + return a * static_cast(b); +} +inline BFloat16 operator/(BFloat16 a, int b) { + return a / static_cast(b); +} + +inline BFloat16 operator+(int a, BFloat16 b) { + return static_cast(a) + b; +} +inline BFloat16 operator-(int a, BFloat16 b) { + return static_cast(a) - b; +} 
+inline BFloat16 operator*(int a, BFloat16 b) { + return static_cast(a) * b; +} +inline BFloat16 operator/(int a, BFloat16 b) { + return static_cast(a) / b; +} + +//// Arithmetic with int64_t + +inline BFloat16 operator+(BFloat16 a, int64_t b) { + return a + static_cast(b); +} +inline BFloat16 operator-(BFloat16 a, int64_t b) { + return a - static_cast(b); +} +inline BFloat16 operator*(BFloat16 a, int64_t b) { + return a * static_cast(b); +} +inline BFloat16 operator/(BFloat16 a, int64_t b) { + return a / static_cast(b); +} + +inline BFloat16 operator+(int64_t a, BFloat16 b) { + return static_cast(a) + b; +} +inline BFloat16 operator-(int64_t a, BFloat16 b) { + return static_cast(a) - b; +} +inline BFloat16 operator*(int64_t a, BFloat16 b) { + return static_cast(a) * b; +} +inline BFloat16 operator/(int64_t a, BFloat16 b) { + return static_cast(a) / b; +} + +// Overloading < and > operators, because std::max and std::min use them. + +inline bool operator>(BFloat16& lhs, BFloat16& rhs) { + return float(lhs) > float(rhs); +} + +inline bool operator<(BFloat16& lhs, BFloat16& rhs) { + return float(lhs) < float(rhs); +} + } // namespace executor } // namespace torch + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_signed = true; + static constexpr bool is_specialized = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = + numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 8; + static constexpr int digits10 = 2; + static constexpr int 
max_digits10 = 4; + static constexpr int radix = 2; + static constexpr int min_exponent = -125; + static constexpr int min_exponent10 = -37; + static constexpr int max_exponent = 128; + static constexpr int max_exponent10 = 38; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = + numeric_limits::tinyness_before; + + static constexpr torch::executor::BFloat16 min() { + return torch::executor::BFloat16( + 0x0080, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 lowest() { + return torch::executor::BFloat16( + 0xFF7F, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 max() { + return torch::executor::BFloat16( + 0x7F7F, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 epsilon() { + return torch::executor::BFloat16( + 0x3C00, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 round_error() { + return torch::executor::BFloat16( + 0x3F00, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 infinity() { + return torch::executor::BFloat16( + 0x7F80, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 quiet_NaN() { + return torch::executor::BFloat16( + 0x7FC0, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 signaling_NaN() { + return torch::executor::BFloat16( + 0x7F80, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 denorm_min() { + return torch::executor::BFloat16( + 0x0001, torch::executor::BFloat16::from_bits()); + } +}; + +} // namespace std diff --git a/runtime/core/portable_type/bfloat16_math.h b/runtime/core/portable_type/bfloat16_math.h new file mode 100644 index 0000000000..68ee77cf34 --- /dev/null +++ b/runtime/core/portable_type/bfloat16_math.h @@ -0,0 +1,290 @@ +/* + * Copyright (c) Meta Platforms, 
Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace std { + +template +struct is_reduced_floating_point + : std::integral_constant< + bool, + std::is_same::value || + std::is_same::value> {}; + +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T acos(T a) { + return std::acos(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T asin(T a) { + return std::asin(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T atan(T a) { + return std::atan(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T atanh(T a) { + return std::atanh(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T erf(T a) { + return std::erf(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T erfc(T a) { + return std::erfc(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T exp(T a) { + return std::exp(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T expm1(T a) { + return std::expm1(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline bool isfinite(T a) { + return std::isfinite(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T log(T a) { + return std::log(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T log10(T a) { + return std::log10(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T log1p(T a) { + return std::log1p(float(a)); +} +template < + typename T, + 
typename std::enable_if::value, int>::type = 0> +inline T log2(T a) { + return std::log2(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T ceil(T a) { + return std::ceil(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T cos(T a) { + return std::cos(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T floor(T a) { + return std::floor(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T nearbyint(T a) { + return std::nearbyint(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T sin(T a) { + return std::sin(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T tan(T a) { + return std::tan(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T sinh(T a) { + return std::sinh(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T cosh(T a) { + return std::cosh(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T tanh(T a) { + return std::tanh(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T trunc(T a) { + return std::trunc(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T lgamma(T a) { + return std::lgamma(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T sqrt(T a) { + return std::sqrt(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T rsqrt(T a) { + return 1.0 / std::sqrt(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T abs(T a) { + return std::abs(float(a)); +} +#if defined(_MSC_VER) && defined(__CUDACC__) 
+template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T pow(T a, double b) { + return std::pow(float(a), float(b)); +} +#else +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T pow(T a, double b) { + return std::pow(float(a), b); +} +#endif +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T pow(T a, T b) { + return std::pow(float(a), float(b)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T fmod(T a, T b) { + return std::fmod(float(a), float(b)); +} + +/* + The following function is inspired from the implementation in `musl` + Link to License: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT + ---------------------------------------------------------------------- + Copyright © 2005-2020 Rich Felker, et al. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ ---------------------------------------------------------------------- + */ +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T nextafter(T from, T to) { + // Reference: + // https://git.musl-libc.org/cgit/musl/tree/src/math/nextafter.c + using int_repr_t = uint16_t; + constexpr uint8_t bits = 16; + union { + T f; + int_repr_t i; + } ufrom = {from}, uto = {to}; + + // get a mask to get the sign bit i.e. MSB + int_repr_t sign_mask = int_repr_t{1} << (bits - 1); + + // short-circuit: if either is NaN, return NaN + if (from != from || to != to) { + return from + to; + } + + // short-circuit: if they are exactly the same. + if (ufrom.i == uto.i) { + return from; + } + + // mask the sign-bit to zero i.e. positive + // equivalent to abs(x) + int_repr_t abs_from = ufrom.i & ~sign_mask; + int_repr_t abs_to = uto.i & ~sign_mask; + if (abs_from == 0) { + // if both are zero but with different sign, + // preserve the sign of `to`. + if (abs_to == 0) { + return to; + } + // smallest subnormal with sign of `to`. 
+ ufrom.i = (uto.i & sign_mask) | int_repr_t{1}; + return ufrom.f; + } + + // if abs(from) > abs(to) or sign(from) != sign(to) + if (abs_from > abs_to || ((ufrom.i ^ uto.i) & sign_mask)) { + ufrom.i--; + } else { + ufrom.i++; + } + + return ufrom.f; +} + +} // namespace std diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index 0d65ef36b8..b8ccbe602e 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -43,6 +43,7 @@ def define_common_targets(): name = "scalar_type", exported_headers = [ "bfloat16.h", + "bfloat16_math.h", "complex.h", "half.h", "scalar_type.h", diff --git a/runtime/core/portable_type/test/CMakeLists.txt b/runtime/core/portable_type/test/CMakeLists.txt index 21eb4feae0..58a69f656e 100644 --- a/runtime/core/portable_type/test/CMakeLists.txt +++ b/runtime/core/portable_type/test/CMakeLists.txt @@ -24,7 +24,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) include(${EXECUTORCH_ROOT}/build/Test.cmake) set(_test_srcs optional_test.cpp tensor_test.cpp half_test.cpp scalar_test.cpp - tensor_impl_test.cpp + tensor_impl_test.cpp bfloat16_test.cpp ) et_cxx_test(runtime_core_portable_type_test SOURCES ${_test_srcs} EXTRA_LIBS) diff --git a/runtime/core/portable_type/test/bfloat16_test.cpp b/runtime/core/portable_type/test/bfloat16_test.cpp new file mode 100644 index 0000000000..9ea53e6cba --- /dev/null +++ b/runtime/core/portable_type/test/bfloat16_test.cpp @@ -0,0 +1,191 @@ +#include + +#include + +using torch::executor::BFloat16; + +namespace { +float float_from_bytes(uint32_t sign, uint32_t exponent, uint32_t fraction) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + uint32_t bytes; + bytes = 0; + bytes |= sign; + bytes <<= 8; + bytes |= exponent; + bytes <<= 23; + bytes |= fraction; + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + float res; + std::memcpy(&res, &bytes, sizeof(res)); + return res; +} + +TEST(BFloat16Conversion, 
FloatToBFloat16AndBack) { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + float in[100]; + for (int i = 0; i < 100; ++i) { + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) + in[i] = i + 1.25; + } + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + BFloat16 bfloats[100]; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + float out[100]; + + for (int i = 0; i < 100; ++i) { + bfloats[i].x = torch::executor::internal::bits_from_f32(in[i]); + out[i] = torch::executor::internal::f32_from_bits(bfloats[i].x); + + // The relative error should be less than 1/(2^7) since BFloat16 + // has 7 bits mantissa. + EXPECT_LE(std::fabs(out[i] - in[i]) / in[i], 1.0 / 128); + } +} + +TEST(BFloat16Conversion, FloatToBFloat16RNEAndBack) { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + float in[100]; + for (int i = 0; i < 100; ++i) { + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) + in[i] = i + 1.25; + } + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + BFloat16 bfloats[100]; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + float out[100]; + + for (int i = 0; i < 100; ++i) { + bfloats[i].x = torch::executor::internal::round_to_nearest_even(in[i]); + out[i] = torch::executor::internal::f32_from_bits(bfloats[i].x); + + // The relative error should be less than 1/(2^7) since BFloat16 + // has 7 bits mantissa. 
+ EXPECT_LE(std::fabs(out[i] - in[i]) / in[i], 1.0 / 128); + } +} + +TEST(BFloat16Conversion, NaN) { + float inNaN = float_from_bytes(0, 0xFF, 0x7FFFFF); + EXPECT_TRUE(std::isnan(inNaN)); + + BFloat16 a = BFloat16(inNaN); + float out = torch::executor::internal::f32_from_bits(a.x); + + EXPECT_TRUE(std::isnan(out)); +} + +TEST(BFloat16Conversion, Inf) { + float inInf = float_from_bytes(0, 0xFF, 0); + EXPECT_TRUE(std::isinf(inInf)); + + BFloat16 a = BFloat16(inInf); + float out = torch::executor::internal::f32_from_bits(a.x); + + EXPECT_TRUE(std::isinf(out)); +} + +TEST(BFloat16Conversion, SmallestDenormal) { + float in = std::numeric_limits::denorm_min(); // The smallest non-zero + // subnormal number + BFloat16 a = BFloat16(in); + float out = torch::executor::internal::f32_from_bits(a.x); + + EXPECT_FLOAT_EQ(in, out); +} + +TEST(BFloat16Math, Addition) { + // This test verifies that if only first 7 bits of float's mantissa are + // changed after addition, we should have no loss in precision. + + // input bits + // S | Exponent | Mantissa + // 0 | 10000000 | 10010000000000000000000 = 3.125 + float input = float_from_bytes(0, 0, 0x40480000); + + // expected bits + // S | Exponent | Mantissa + // 0 | 10000001 | 10010000000000000000000 = 6.25 + float expected = float_from_bytes(0, 0, 0x40c80000); + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + BFloat16 b; + b.x = torch::executor::internal::bits_from_f32(input); + b = b + b; + + float res = torch::executor::internal::f32_from_bits(b.x); + EXPECT_EQ(res, expected); +} + +TEST(BFloat16Math, Subtraction) { + // This test verifies that if only first 7 bits of float's mantissa are + // changed after subtraction, we should have no loss in precision. 
+ + // input bits + // S | Exponent | Mantissa + // 0 | 10000001 | 11101000000000000000000 = 7.625 + float input = float_from_bytes(0, 0, 0x40f40000); + + // expected bits + // S | Exponent | Mantissa + // 0 | 10000000 | 01010000000000000000000 = 2.625 + float expected = float_from_bytes(0, 0, 0x40280000); + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + BFloat16 b; + b.x = torch::executor::internal::bits_from_f32(input); + b = b - 5; + + float res = torch::executor::internal::f32_from_bits(b.x); + EXPECT_EQ(res, expected); +} + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +TEST(BFloat16Math, NextAfterZero) { + const BFloat16 zero{0}; + + auto check_nextafter = [](BFloat16 from, BFloat16 to, BFloat16 expected) { + BFloat16 actual = std::nextafter(from, to); + // Check for bitwise equality! + ASSERT_EQ(actual.x ^ expected.x, uint16_t{0}); + }; + check_nextafter(zero, zero, /*expected=*/zero); + check_nextafter(zero, -zero, /*expected=*/-zero); + check_nextafter(-zero, zero, /*expected=*/zero); + check_nextafter(-zero, -zero, /*expected=*/-zero); +} + +float BinaryToFloat(uint32_t bytes) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + float res; + std::memcpy(&res, &bytes, sizeof(res)); + return res; +} + +struct BFloat16TestParam { + uint32_t input; + uint16_t rne; +}; + +class BFloat16Test : public ::testing::Test, + public ::testing::WithParamInterface {}; + +TEST_P(BFloat16Test, BFloat16RNETest) { + float value = BinaryToFloat(GetParam().input); + uint16_t rounded = torch::executor::internal::round_to_nearest_even(value); + EXPECT_EQ(GetParam().rne, rounded); +} + +INSTANTIATE_TEST_SUITE_P( + BFloat16TestInstantiation, + BFloat16Test, + ::testing::Values( + BFloat16TestParam{0x3F848000, 0x3F84}, + BFloat16TestParam{0x3F848010, 0x3F85}, + BFloat16TestParam{0x3F850000, 0x3F85}, + BFloat16TestParam{0x3F858000, 0x3F86}, + BFloat16TestParam{0x3FFF8000, 0x4000})); + +} // namespace diff --git 
a/runtime/core/portable_type/test/targets.bzl b/runtime/core/portable_type/test/targets.bzl index af55f95e45..c0b4ef00c7 100644 --- a/runtime/core/portable_type/test/targets.bzl +++ b/runtime/core/portable_type/test/targets.bzl @@ -6,6 +6,14 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. """ + runtime.cxx_test( + name = "bfloat16_test", + srcs = ["bfloat16_test.cpp"], + deps = [ + "//executorch/runtime/core/portable_type:portable_type", + ], + ) + runtime.cxx_test( name = "optional_test", srcs = ["optional_test.cpp"], diff --git a/runtime/executor/test/method_test.cpp b/runtime/executor/test/method_test.cpp index 06b84d338e..0163c8ceef 100644 --- a/runtime/executor/test/method_test.cpp +++ b/runtime/executor/test/method_test.cpp @@ -59,11 +59,9 @@ class MethodTest : public ::testing::Test { load_program(std::getenv("ET_MODULE_INDEX_PATH"), "index"); load_program( std::getenv("ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH"), "cat"); + load_program(std::getenv("ET_MODULE_LINEAR_PATH"), "linear"); load_program( - std::getenv("ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH"), - "linear_constant_segment"); - load_program( - std::getenv("ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"), + std::getenv("DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"), "linear_constant_buffer"); } @@ -274,7 +272,7 @@ TEST_F(MethodTest, ConstantSegmentTest) { // Execute model with constants stored in segment. ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); Result method = - programs_["linear_constant_segment"]->load_method("forward", &mmm.get()); + programs_["linear"]->load_method("forward", &mmm.get()); ASSERT_EQ(method.error(), Error::Ok); // Can execute the method. 
diff --git a/runtime/executor/test/program_test.cpp b/runtime/executor/test/program_test.cpp index 00e8b0e234..80f91f1af6 100644 --- a/runtime/executor/test/program_test.cpp +++ b/runtime/executor/test/program_test.cpp @@ -379,11 +379,32 @@ TEST_F(ProgramTest, DEPRECATEDLoad) { EXPECT_EQ(program_res.error(), Error::Ok); } +TEST_F(ProgramTest, LoadConstantSegmentWithNoConstantSegment) { + Result program = + Program::load(add_loader_.get(), kDefaultVerification); + ASSERT_EQ(program.error(), Error::Ok); + + // Load constant segment data should fail. + const auto segment_info = DataLoader::SegmentInfo( + DataLoader::SegmentInfo::Type::Constant, + /*segment_index=*/0); + Result segment = + ProgramTestFriend::LoadSegment(&program.get(), segment_info); + EXPECT_NE(segment.error(), Error::Ok); + + const executorch_flatbuffer::Program* flatbuffer_program = + ProgramTestFriend::GetInternalProgram(&program.get()); + + // The constant buffer should be empty. + EXPECT_EQ(flatbuffer_program->constant_buffer()->size(), 0); + + // Expect 1 constant segment, placeholder for non-const tensors. + EXPECT_EQ(flatbuffer_program->segments()->size(), 1); +} + TEST_F(ProgramTest, LoadConstantSegment) { - // Load the serialized ModuleLinear data, with constants in the segment and no - // constants in the flatbuffer. - const char* linear_path = - std::getenv("ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH"); + // Load the serialized ModuleLinear data, with constants in the segment. 
+ const char* linear_path = std::getenv("ET_MODULE_LINEAR_PATH"); Result linear_loader = FileDataLoader::from(linear_path); ASSERT_EQ(linear_loader.error(), Error::Ok); @@ -424,11 +445,11 @@ TEST_F(ProgramTest, LoadConstantSegment) { EXPECT_GE(flatbuffer_program->constant_segment()->offsets()->size(), 1); } -TEST_F(ProgramTest, LoadConstantSegmentWithNoConstantSegment) { +TEST_F(ProgramTest, LoadConstantSegmentWhenConstantBufferExists) { // Load the serialized ModuleLinear data, with constants in the flatbuffer and // no constants in the segment. const char* linear_path = - std::getenv("ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"); + std::getenv("DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"); Result linear_loader = FileDataLoader::from(linear_path); ASSERT_EQ(linear_loader.error(), Error::Ok); @@ -505,8 +526,8 @@ TEST_F(ProgramTest, LoadFromMutableSegment) { const executorch_flatbuffer::Program* flatbuffer_program = ProgramTestFriend::GetInternalProgram(&program.get()); - // Expect 1 segment. 1 mutable segment and no constant segment. - EXPECT_EQ(flatbuffer_program->segments()->size(), 1); + // Expect 2 segments. 1 mutable segment and 1 constant segment. + EXPECT_EQ(flatbuffer_program->segments()->size(), 2); // Expect a mutable data segment. EXPECT_EQ(flatbuffer_program->mutable_data_segments()->size(), 1); diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl index d6e3bc3d89..72923e9868 100644 --- a/runtime/executor/test/targets.bzl +++ b/runtime/executor/test/targets.bzl @@ -97,6 +97,8 @@ def define_common_targets(is_fbcode = False): # file in fbcode. See https://fburl.com/9esapdmd if not runtime.is_oss and is_fbcode: modules_env = { + # Deprecated model that still works with ExecuTorch runtime. + "DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH": "$(location fbcode//executorch/test/models/deprecated:ModuleLinear-no-constant-segment.pte)", # The tests use this var to find the program file to load. 
This uses # an fbcode target path because the authoring/export tools # intentionally don't work in xplat (since they're host-only tools). @@ -104,8 +106,7 @@ def define_common_targets(is_fbcode = False): "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])", "ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleDynamicCatUnallocatedIO.pte])", "ET_MODULE_INDEX_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleIndex.pte])", - "ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleLinear-no-constant-segment.pte])", - "ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleLinear.pte])", + "ET_MODULE_LINEAR_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleLinear.pte])", "ET_MODULE_MULTI_ENTRY_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleMultipleEntry.pte])", "ET_MODULE_SIMPLE_TRAIN_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleSimpleTrain.pte])", } diff --git a/schema/program.fbs b/schema/program.fbs index cbdda2d360..e3c7597fcd 100644 --- a/schema/program.fbs +++ b/schema/program.fbs @@ -429,6 +429,7 @@ table Program { // Each constant is assigned an index into the table which are each individually aligned. // 0 index is reserved to be pointed to by non-constant Tensors. // If this field is non-empty, constant_segment.offsets must be empty. + // DEPRECATED: After D61996249 on 2024-09-05, no new PTE files will use this field. constant_buffer:[Buffer]; // List of delegate data. Pointed to by BackendDelegateDataReference. 
diff --git a/test/end2end/exported_module.py b/test/end2end/exported_module.py index 656b570512..6e6b97b718 100644 --- a/test/end2end/exported_module.py +++ b/test/end2end/exported_module.py @@ -67,7 +67,6 @@ def export( ignore_to_out_var_failure: bool = False, dynamic_memory_planning_mode: DynamicMemoryPlanningMode = DynamicMemoryPlanningMode.UPPER_BOUND, capture_config=None, - extract_constant_segment: bool = True, skip_type_promotion: bool = False, export_joint_graph: bool = False, ) -> "ExportedModule": @@ -206,7 +205,6 @@ def __init__(self, method): dynamic_memory_planning_mode=dynamic_memory_planning_mode, memory_planning_pass=memory_planning_pass, to_out_var_pass=ToOutVarPass(ignore_to_out_var_failure), - extract_constant_segment=extract_constant_segment, ) ) diff --git a/test/models/deprecated/ModuleLinear-no-constant-segment.pte b/test/models/deprecated/ModuleLinear-no-constant-segment.pte new file mode 100644 index 0000000000000000000000000000000000000000..42b8643fb91a6709d40a70b0ac68d0a194d9e878 GIT binary patch literal 1040 zcmaJ=y-osQ5F8NBlMr(;giugi!WBnC!4V5XMWG#)xyo2bg2qB6ArKQ|Vc{eA7(Ruu zpvqi8F`s$h zkmQi46HkoWS)`BY%>2wVj&dzRybJ2k4DvPOXga|j&%xyKuG<|9hxDKS<=jqh$f1N} zr$C9P(OqKfk(dxx(^2XN+%w&r;&ay5efdQ~I`k}GF(FR*W%@2ZikfeeXCRqbcLi&< zu1?ME5Z8Rr-e&Zry+x=;f01a)tB07HLwwz}_F;4s#eXw=N57Zd;VEObmoBQO#jD{L z^{xR2d<%DZ>W!3nj3nmxJSv2GAg!nRs0Mt5b#zv|CY z&C>F2d`H@}dpp-}P31LB;_E-RoYqVB#c)<@+s?Ed{nF}ZcPy^vP~EuGh#KuMjv7%I VG+W2*u-Rld>2)<4-w(z^`~V(NT@e5P literal 0 HcmV?d00001 diff --git a/test/models/deprecated/README.md b/test/models/deprecated/README.md new file mode 100644 index 0000000000..f1d47d0326 --- /dev/null +++ b/test/models/deprecated/README.md @@ -0,0 +1,14 @@ +## Deprecated Models + +This readme documents deprecated models that remain compatible with versions of the ExecuTorch runtime. + +ModuleLinear-no-constant-segment.pte +- This file contains constants stored in the constant_buffer, which was deprecated in D61996249 on 2024-09-05. 
Now, constants are stored in a separate segment. +- This .pte file was generated internally using hg commit hash rFBS5e49dc0319b1d2d9969bbcef92857ab76a899c34, with command: + ``` + buck2 build fbcode//executorch/test/models:exported_programs[ModuleLinear-no-constant-segment.pte] --show-output + ``` +- In OSS, the same .pte file can be generated with https://github.com/pytorch/executorch/commit/cea5abbcdded, via: + ``` + python -m test.models.export_program --modules "ModuleLinear" --outdir . + ``` diff --git a/test/models/deprecated/TARGETS b/test/models/deprecated/TARGETS new file mode 100644 index 0000000000..369fc3c406 --- /dev/null +++ b/test/models/deprecated/TARGETS @@ -0,0 +1,12 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.export_file( + name = "ModuleLinear-no-constant-segment.pte", + src = "ModuleLinear-no-constant-segment.pte", + visibility = [ + "//executorch/runtime/executor/test/...", + "//executorch/test/...", + ], +) diff --git a/test/models/export_program.py b/test/models/export_program.py index 7941af376f..d753475b82 100644 --- a/test/models/export_program.py +++ b/test/models/export_program.py @@ -190,7 +190,6 @@ def export_joint(): def export_module_to_program( module_class: Type[nn.Module], - extract_constant_segment: bool, skip_type_promotion: bool, ): """Exports the module and returns the serialized program data.""" @@ -211,7 +210,6 @@ def export_module_to_program( module = ExportedModule.export( module_class, methods, - extract_constant_segment=extract_constant_segment, skip_type_promotion=skip_type_promotion, export_joint_graph=export_joint, **export_kwargs, @@ -259,18 +257,15 @@ def main() -> None: # Skip type promotion to keep the model in fp16. # Type promotion will convert to fp32. 
skip_type_promotion = True - for extract_constant_segment in (True, False): - suffix = "" if extract_constant_segment else "-no-constant-segment" - outfile = os.path.join(args.outdir, f"{module_name}{suffix}.pte") - with open(outfile, "wb") as fp: - fp.write( - export_module_to_program( - module_class, - extract_constant_segment=extract_constant_segment, - skip_type_promotion=skip_type_promotion, - ) + outfile = os.path.join(args.outdir, f"{module_name}.pte") + with open(outfile, "wb") as fp: + fp.write( + export_module_to_program( + module_class, + skip_type_promotion=skip_type_promotion, ) - print(f"Exported {module_name} and wrote program data to {outfile}") + ) + print(f"Exported {module_name} and wrote program data to {outfile}") if __name__ == "__main__": diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh index 3693700e83..078196bfc1 100755 --- a/test/run_oss_cpp_tests.sh +++ b/test/run_oss_cpp_tests.sh @@ -56,23 +56,23 @@ export_test_model() { python3 -m test.models.export_program --modules "ModuleAdd,ModuleAddHalf,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleLinear,ModuleMultipleEntry,ModuleSimpleTrain" --outdir "cmake-out" 2> /dev/null python3 -m test.models.export_delegated_program --modules "ModuleAddMul" --backend_id "StubBackend" --outdir "cmake-out" || true + DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH="$(realpath test/models/deprecated/ModuleLinear-no-constant-segment.pte)" ET_MODULE_ADD_HALF_PATH="$(realpath cmake-out/ModuleAddHalf.pte)" ET_MODULE_ADD_PATH="$(realpath cmake-out/ModuleAdd.pte)" ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH="$(realpath cmake-out/ModuleDynamicCatUnallocatedIO.pte)" ET_MODULE_INDEX_PATH="$(realpath cmake-out/ModuleIndex.pte)" - ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH="$(realpath cmake-out/ModuleLinear-no-constant-segment.pte)" - ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH="$(realpath cmake-out/ModuleLinear.pte)" + ET_MODULE_LINEAR_PATH="$(realpath cmake-out/ModuleLinear.pte)" 
ET_MODULE_MULTI_ENTRY_PATH="$(realpath cmake-out/ModuleMultipleEntry.pte)" ET_MODULE_ADD_MUL_NOSEGMENTS_DA1024_PATH="$(realpath cmake-out/ModuleAddMul-nosegments-da1024.pte)" ET_MODULE_ADD_MUL_NOSEGMENTS_PATH="$(realpath cmake-out/ModuleAddMul-nosegments.pte)" ET_MODULE_ADD_MUL_PATH="$(realpath cmake-out/ModuleAddMul.pte)" ET_MODULE_SIMPLE_TRAIN_PATH="$(realpath cmake-out/ModuleSimpleTrain.pte)" + export DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH export ET_MODULE_ADD_HALF_PATH export ET_MODULE_ADD_PATH export ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH export ET_MODULE_INDEX_PATH - export ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH - export ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH + export ET_MODULE_LINEAR_PATH export ET_MODULE_MULTI_ENTRY_PATH export ET_MODULE_ADD_MUL_NOSEGMENTS_DA1024_PATH export ET_MODULE_ADD_MUL_NOSEGMENTS_PATH diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index 93ae82acc3..dca2a7bbbc 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -52,8 +52,7 @@ { "directory": "extension/runner_util/test", "sources": [ - "inputs_test.cpp", - "managed_tensor_test.cpp" + "inputs_test.cpp" ], "additional_libs": [ "extension_data_loader",