From 9a06ca90e5a25972071690b2ad47c24f52556f6f Mon Sep 17 00:00:00 2001 From: Max Ren Date: Fri, 6 Sep 2024 21:29:02 -0700 Subject: [PATCH] Add ENABLE_KLEIDI_FLAG to use new QP8 Kernels --- backends/xnnpack/CMakeLists.txt | 13 +++++++----- backends/xnnpack/cmake/Dependencies.cmake | 24 ++++++++++++++++++++++- backends/xnnpack/runtime/XNNCompiler.cpp | 7 +++++++ backends/xnnpack/targets.bzl | 2 ++ build/executorch-config.cmake | 3 +++ examples/models/llama2/CMakeLists.txt | 3 +++ examples/models/llava/CMakeLists.txt | 5 ++++- 7 files changed, 50 insertions(+), 7 deletions(-) diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 6829d8cb245..8704e7c14d6 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -32,14 +32,17 @@ if(NOT PYTHON_EXECUTABLE) resolve_python_executable() endif() -# NB: Enabling this will serialize execution of delegate instances. -# This setting may have performance implications. -option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE - "Enable workspace sharing across different delegate instances" ON -) +# NB: Enabling this will serialize execution of delegate instances. +# Keeping this ON by default to maintain existing behavior, to be revisited. +option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE "Enable workspace sharing across different delegate instances" ON) +# Keeping this OFF by default due to regressions in decode and model load with kleidi kernels +option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI "Enable use of KleidiAI kernels in XNNPACK" OFF) if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE) add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE) endif() +if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI) + add_definitions(-DENABLE_XNNPACK_KLEIDI) +endif() set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
set(_common_compile_options -Wno-deprecated-declarations -fPIC) diff --git a/backends/xnnpack/cmake/Dependencies.cmake b/backends/xnnpack/cmake/Dependencies.cmake index 3a5ed5fc01f..fef63badf23 100644 --- a/backends/xnnpack/cmake/Dependencies.cmake +++ b/backends/xnnpack/cmake/Dependencies.cmake @@ -36,7 +36,21 @@ set(XNNPACK_ENABLE_AVXVNNI OFF CACHE BOOL "" ) -set(XNNPACK_ENABLE_KLEIDIAI + +if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI) + set(XNNPACK_ENABLE_KLEIDIAI + ON + CACHE BOOL "" + ) +else() + set(XNNPACK_ENABLE_KLEIDIAI + OFF + CACHE BOOL "" + ) +endif() + + +set(XNNPACK_BUILD_ALL_MICROKERNELS OFF CACHE BOOL "" ) @@ -48,6 +62,14 @@ install(TARGETS microkernels-prod ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + +if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI) + install(TARGETS kleidiai + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +endif() + # Revert PIC Flag to what it originally was set(CMAKE_POSITION_INDEPENDENT_CODE ${ORIGINAL_CMAKE_POSITION_INDEPENDENT_CODE_FLAG} diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 2145ea15199..196ed29cc4f 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -630,7 +630,14 @@ Error defineConvertNode( subgraph_ptr, remapped_ids.at(graph_node->input_id()), remapped_ids.at(graph_node->output_id()), +#ifdef ENABLE_XNNPACK_KLEIDI + // 0x00000100 is XNNPACK's XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM, which is not + // currently exposed in the top-level xnnpack.h header, so the raw value is + // hard-coded here; graph_node->flags() is not forwarded in this branch. + 0x00000100); +#else graph_node->flags()); +#endif ET_CHECK_OR_RETURN_ERROR( status == xnn_status_success, diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index 65499eb8364..1d959048032 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -49,6 +49,8 @@ def 
define_common_targets(): preprocessor_flags = [ # Uncomment to enable per operator timings # "-DENABLE_XNNPACK_PROFILING", + # Uncomment to enable using KleidiAI Kernels + # "-DENABLE_XNNPACK_KLEIDI", ] + _get_preprocessor_flags(), exported_deps = [ "//executorch/runtime/backend:interface", diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 6c339390d1f..3d9e9af5ee6 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -50,8 +50,11 @@ set(lib_list extension_threadpool extension_training xnnpack_backend + # Start XNNPACK Lib Deps XNNPACK microkernels-prod + kleidiai + # End XNNPACK Lib Deps cpuinfo pthreadpool vulkan_backend diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt index e7e87c8f880..b1401a0bca6 100644 --- a/examples/models/llama2/CMakeLists.txt +++ b/examples/models/llama2/CMakeLists.txt @@ -142,6 +142,9 @@ endif() # XNNPACK if(TARGET xnnpack_backend) set(xnnpack_backend_libs xnnpack_backend XNNPACK microkernels-prod) + if(TARGET kleidiai) + list(APPEND xnnpack_backend_libs kleidiai) + endif() list(APPEND link_libraries ${xnnpack_backend_libs}) target_link_options_shared_lib(xnnpack_backend) endif() diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index c36e39a04cb..ed4cbc46344 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -157,7 +157,10 @@ endif() # XNNPACK if(TARGET xnnpack_backend) - set(xnnpack_backend_libs xnnpack_backend XNNPACK) + set(xnnpack_backend_libs xnnpack_backend XNNPACK microkernels-prod) + if(TARGET kleidiai) + list(APPEND xnnpack_backend_libs kleidiai) + endif() list(APPEND link_libraries ${xnnpack_backend_libs}) target_link_options_shared_lib(xnnpack_backend) endif()