Summary:
# Bringing KleidiAI QB4 Kernels to ExecuTorch

KleidiAI has released QB4 kernels, which pack the activations while dynamically quantizing them to improve the performance of the GEMM kernel. We leverage these kernels through XNNPACK by wiring them up there. This integration is still waiting on a couple of dependent PRs in other repos to land.

## Dependent PR Tracking
* google/XNNPACK#7003
* https://gitlab.arm.com/kleidi/kleidiai/-/merge_requests/28

## Notes on the Update
When updating XNNPACK to the branch with the integrated Kleidi kernels, we have to make some changes to the CMake configuration because of refactoring done in XNNPACK. `prod-microkernels` and `kleidiai` are both static libraries linked into `libXNNPACK.a`. Since the llama runner (which links against `xnnpack_backend`) lives in a separate project, we need to install these new static libraries so that we can later link them into the llama runner properly. These changes can be seen in the corresponding CMake files. The new feature is currently guarded behind the `EXECUTORCH_XNNPACK_ENABLE_KLEIDI` flag.
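As a rough illustration of the install wiring, here is a minimal sketch, not the actual diff: the target names follow the prose above, and the `lib` destination is an assumption.

```
# Illustrative sketch only -- the real changes live in the corresponding
# CMake files. Install the extra static archives that libXNNPACK.a now
# depends on, so an out-of-tree consumer (the llama runner) can link them.
if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI)
  install(
    TARGETS prod-microkernels kleidiai  # names taken from the prose above
    ARCHIVE DESTINATION lib             # assumed destination directory
  )
endif()
```

Without an install rule along these lines, the downstream link of `llama_main` would fail to resolve the Kleidi microkernel symbols, since they are no longer bundled into a single archive.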
## Repro
```
git submodule sync
git submodule update --init
```

I used the following aliases to make it easier to build `llama_main` for Android:
```
alias build_et_android="cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
    -DANDROID_ABI=arm64-v8a \
    -DANDROID_PLATFORM=android-23 \
    -DCMAKE_INSTALL_PREFIX=cmake-out-android \
    -DEXECUTORCH_ENABLE_LOGGING=1 \
    -DCMAKE_BUILD_TYPE=Release \
    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
    -DEXECUTORCH_BUILD_XNNPACK=ON \
    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
    -DEXECUTORCH_XNNPACK_ENABLE_KLEIDI=ON \
    -DXNNPACK_ENABLE_ARM_BF16=OFF \
    -Bcmake-out-android . \
  && cmake --build cmake-out-android -j16 --target install --config Release"

alias build_llama_android="cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
    -DANDROID_ABI=arm64-v8a \
    -DANDROID_PLATFORM=android-23 \
    -DCMAKE_INSTALL_PREFIX=cmake-out-android \
    -DCMAKE_BUILD_TYPE=Release \
    -DPYTHON_EXECUTABLE=python \
    -DEXECUTORCH_BUILD_XNNPACK=ON \
    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
    -DEXECUTORCH_USE_TIKTOKEN=ON \
    -Bcmake-out-android/examples/models/llama2 \
    examples/models/llama2 \
  && cmake --build cmake-out-android/examples/models/llama2 -j16 --config Release"
```

I run the following:
```
build_et_android
build_llama_android
cd cmake-out-android/examples/models/llama2
adb push llama_main /data/local/tmp/
adb push <path/to/llama3.pte> /data/local/tmp
adb push <path/to/tiktokenizer> /data/local/tmp
adb shell "cd /data/local/tmp && ./llama_main --model_path <model.pte> --tokenizer_path <tokenizer.bin> --cpu_threads=4"
```

## Benchmarks
I ran llama3.1 with
* sdpa_w_kvcache
* quantized embeddings
* 4-bit blockwise quantized weights
* dynamic shapes
* parallel prefill

on a Samsung S22 with 4 threads.

### Baseline (QD8)
```
I 00:00:32.772974 executorch:stats.h:84] Prompt Tokens: 8 Generated Tokens: 119
I 00:00:32.772980 executorch:stats.h:90] Model Load Time: 15.273000 (seconds)
I 00:00:32.773014 executorch:stats.h:100] Total inference time: 17.488000 (seconds) Rate: 6.804666 (tokens/second)
I 00:00:32.773019 executorch:stats.h:108] Prompt evaluation: 2.971000 (seconds) Rate: 2.692696 (tokens/second)
I 00:00:32.773023 executorch:stats.h:119] Generated 119 tokens: 14.517000 (seconds) Rate: 8.197286 (tokens/second)
I 00:00:32.773027 executorch:stats.h:127] Time to first generated token: 2.971000 (seconds)
I 00:00:32.773030 executorch:stats.h:134] Sampling time over 127 tokens: 0.173000 (seconds)
```

### QP8
```
I 00:00:46.767429 executorch:stats.h:84] Prompt Tokens: 8 Generated Tokens: 119
I 00:00:46.767437 executorch:stats.h:90] Model Load Time: 28.297000 (seconds)
I 00:00:46.767475 executorch:stats.h:100] Total inference time: 18.436000 (seconds) Rate: 6.454762 (tokens/second)
I 00:00:46.767483 executorch:stats.h:108] Prompt evaluation: 1.770000 (seconds) Rate: 4.519774 (tokens/second)
I 00:00:46.767491 executorch:stats.h:119] Generated 119 tokens: 16.666000 (seconds) Rate: 7.140286 (tokens/second)
I 00:00:46.767522 executorch:stats.h:127] Time to first generated token: 1.770000 (seconds)
I 00:00:46.767527 executorch:stats.h:134] Sampling time over 127 tokens: 0.189000 (seconds)
```

We see a ~68% performance improvement on prefill and a ~13% regression on decode. See the dependent XNNPACK PR for more benchmarking details.
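(For reference, these figures come straight from the rates in the logs above: prompt evaluation goes from 2.692696 to 4.519774 tokens/second, and 4.519774 / 2.692696 ≈ 1.68, i.e. about +68%; generation goes from 8.197286 to 7.140286 tokens/second, and 7.140286 / 8.197286 ≈ 0.87, i.e. about −13%.)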
Pull Request resolved: #5162

Reviewed By: digantdesai

Differential Revision: D63651987

Pulled By: mcr229