From 9a06ca90e5a25972071690b2ad47c24f52556f6f Mon Sep 17 00:00:00 2001
From: Max Ren <maxren@meta.com>
Date: Fri, 6 Sep 2024 21:29:02 -0700
Subject: [PATCH] Add ENABLE_KLEIDI_FLAG to use new QP8 Kernels

---
 backends/xnnpack/CMakeLists.txt           | 13 +++++++-----
 backends/xnnpack/cmake/Dependencies.cmake | 24 ++++++++++++++++++++++-
 backends/xnnpack/runtime/XNNCompiler.cpp  |  7 +++++++
 backends/xnnpack/targets.bzl              |  2 ++
 build/executorch-config.cmake             |  3 +++
 examples/models/llama2/CMakeLists.txt     |  3 +++
 examples/models/llava/CMakeLists.txt      |  5 ++++-
 7 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt
index 6829d8cb245..8704e7c14d6 100644
--- a/backends/xnnpack/CMakeLists.txt
+++ b/backends/xnnpack/CMakeLists.txt
@@ -32,14 +32,17 @@ if(NOT PYTHON_EXECUTABLE)
   resolve_python_executable()
 endif()
 
-# NB: Enabling this will serialize execution of delegate instances.
-# This setting may have performance implications.
-option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE
-       "Enable workspace sharing across different delegate instances" ON
-)
+# NB: Enabling this will serialize execution of delegate instances.
+# This is ON by default; the setting may have performance implications.
+option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE "Enable workspace sharing across different delegate instances" ON)
+# Keeping this OFF by default due to regressions in decode and model load with kleidi kernels
+option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI "Enable KleidiAI kernels in XNNPACK" OFF)
 if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE)
   add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE)
 endif()
+if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI)
+  add_definitions(-DENABLE_XNNPACK_KLEIDI)
+endif()
 
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 set(_common_compile_options -Wno-deprecated-declarations -fPIC)
diff --git a/backends/xnnpack/cmake/Dependencies.cmake b/backends/xnnpack/cmake/Dependencies.cmake
index 3a5ed5fc01f..fef63badf23 100644
--- a/backends/xnnpack/cmake/Dependencies.cmake
+++ b/backends/xnnpack/cmake/Dependencies.cmake
@@ -36,7 +36,21 @@ set(XNNPACK_ENABLE_AVXVNNI
     OFF
     CACHE BOOL ""
 )
-set(XNNPACK_ENABLE_KLEIDIAI
+
+if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI)
+    set(XNNPACK_ENABLE_KLEIDIAI
+        ON
+        CACHE BOOL ""
+    )
+else()
+    set(XNNPACK_ENABLE_KLEIDIAI
+        OFF
+        CACHE BOOL ""
+    )
+endif()
+
+
+set(XNNPACK_BUILD_ALL_MICROKERNELS
     OFF
     CACHE BOOL ""
 )
@@ -48,6 +62,14 @@ install(TARGETS microkernels-prod
     ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
     PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 
+
+if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI)
+    install(TARGETS kleidiai
+        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+endif()
+
 # Revert PIC Flag to what it originally was
 set(CMAKE_POSITION_INDEPENDENT_CODE
     ${ORIGINAL_CMAKE_POSITION_INDEPENDENT_CODE_FLAG}
diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp
index 2145ea15199..196ed29cc4f 100644
--- a/backends/xnnpack/runtime/XNNCompiler.cpp
+++ b/backends/xnnpack/runtime/XNNCompiler.cpp
@@ -630,7 +630,14 @@ Error defineConvertNode(
       subgraph_ptr,
       remapped_ids.at(graph_node->input_id()),
       remapped_ids.at(graph_node->output_id()),
+#ifdef ENABLE_XNNPACK_KLEIDI
+      // 0x00000100 maps to XNNPACK's XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM, which
+      // is not currently exposed in the top-level xnnpack.h header. OR it in
+      // so the flags serialized on the node are not discarded.
+      graph_node->flags() | 0x00000100);
+#else
       graph_node->flags());
+#endif
 
   ET_CHECK_OR_RETURN_ERROR(
       status == xnn_status_success,
diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl
index 65499eb8364..1d959048032 100644
--- a/backends/xnnpack/targets.bzl
+++ b/backends/xnnpack/targets.bzl
@@ -49,6 +49,8 @@ def define_common_targets():
         preprocessor_flags = [
             # Uncomment to enable per operator timings
             # "-DENABLE_XNNPACK_PROFILING",
+            # Uncomment to enable using KleidiAI Kernels
+            # "-DENABLE_XNNPACK_KLEIDI",
         ] + _get_preprocessor_flags(),
         exported_deps = [
             "//executorch/runtime/backend:interface",
diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake
index 6c339390d1f..3d9e9af5ee6 100644
--- a/build/executorch-config.cmake
+++ b/build/executorch-config.cmake
@@ -50,8 +50,11 @@ set(lib_list
     extension_threadpool
     extension_training
     xnnpack_backend
+    # Start XNNPACK Lib Deps
     XNNPACK
     microkernels-prod
+    kleidiai
+    # End XNNPACK Lib Deps
     cpuinfo
     pthreadpool
     vulkan_backend
diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt
index e7e87c8f880..b1401a0bca6 100644
--- a/examples/models/llama2/CMakeLists.txt
+++ b/examples/models/llama2/CMakeLists.txt
@@ -142,6 +142,9 @@ endif()
 # XNNPACK
 if(TARGET xnnpack_backend)
   set(xnnpack_backend_libs xnnpack_backend XNNPACK microkernels-prod)
+  if(TARGET kleidiai)
+    list(APPEND xnnpack_backend_libs kleidiai)
+  endif()
   list(APPEND link_libraries ${xnnpack_backend_libs})
   target_link_options_shared_lib(xnnpack_backend)
 endif()
diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt
index c36e39a04cb..ed4cbc46344 100644
--- a/examples/models/llava/CMakeLists.txt
+++ b/examples/models/llava/CMakeLists.txt
@@ -157,7 +157,10 @@ endif()
 
 # XNNPACK
 if(TARGET xnnpack_backend)
-  set(xnnpack_backend_libs xnnpack_backend XNNPACK)
+  set(xnnpack_backend_libs xnnpack_backend XNNPACK microkernels-prod)
+  if(TARGET kleidiai)
+    list(APPEND xnnpack_backend_libs kleidiai)
+  endif()
   list(APPEND link_libraries ${xnnpack_backend_libs})
   target_link_options_shared_lib(xnnpack_backend)
 endif()