Compute Library v24.11
Jenkins committed Nov 11, 2024
1 parent c61bd33 commit f44f09d
Showing 137 changed files with 4,412 additions and 1,659 deletions.
3 changes: 3 additions & 0 deletions Android.bp
@@ -1025,11 +1025,14 @@ cc_library_static {
"src/runtime/experimental/operators/CpuActivation.cpp",
"src/runtime/experimental/operators/CpuAdd.cpp",
"src/runtime/experimental/operators/CpuDepthwiseConv2d.cpp",
"src/runtime/experimental/operators/CpuDequantize.cpp",
"src/runtime/experimental/operators/CpuElementwise.cpp",
"src/runtime/experimental/operators/CpuGEMMLowp.cpp",
"src/runtime/experimental/operators/CpuGemm.cpp",
"src/runtime/experimental/operators/CpuGemmConv2d.cpp",
"src/runtime/experimental/operators/CpuGemmDirectConv2d.cpp",
"src/runtime/experimental/operators/CpuMul.cpp",
"src/runtime/experimental/operators/CpuQuantize.cpp",
"src/runtime/experimental/operators/CpuSoftmax.cpp",
"src/runtime/experimental/operators/CpuSub.cpp",
"src/runtime/experimental/operators/CpuTranspose.cpp",
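The new sources above add CpuDequantize, CpuMul and CpuQuantize to the experimental operator API. As a rough idea of how such an operator is driven, here is a minimal sketch; the header path, namespace and configure/run(ITensorPack) signatures are assumptions based on the pattern of the operators already listed (CpuActivation, CpuAdd, …), not something taken from this commit:

```cpp
// Hypothetical sketch (not from this commit): dequantize a QASYMM8 tensor to F32 with the
// stateless experimental operator, assuming it follows the configure(ITensorInfo...) /
// run(ITensorPack) pattern of the other experimental operators.
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/experimental/operators/CpuDequantize.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    dst.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

    experimental::op::CpuDequantize deq;
    deq.configure(src.info(), dst.info()); // configured from tensor metadata only

    src.allocator()->allocate();
    dst.allocator()->allocate();

    // Tensors are supplied at run time through a tensor pack (stateless execution).
    ITensorPack pack{{ACL_SRC, &src}, {ACL_DST, &dst}};
    deq.run(pack);
    return 0;
}
```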
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute)
project(
ArmCompute
VERSION 42.0.0
VERSION 43.0.0
DESCRIPTION
"The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures"
LANGUAGES C CXX ASM)
15 changes: 15 additions & 0 deletions LICENSES/Apache-2.0.txt
@@ -0,0 +1,15 @@
# SPDX-FileCopyrightText: 2008-2023 The Khronos Group Inc.
#
# SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
9 changes: 7 additions & 2 deletions LICENSE → LICENSES/MIT.txt
@@ -1,6 +1,11 @@
MIT License
# SPDX-FileCopyrightText: 2012-2017 Christian Rau
# SPDX-FileCopyrightText: 2017 Leon Merten Lohse
# SPDX-FileCopyrightText: 2017 Sean Barrett
# SPDX-FileCopyrightText: 2017-2024 Arm Limited
#
# SPDX-License-Identifier: MIT

Copyright (c) 2017-2024 Arm Limited
MIT License

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
24 changes: 12 additions & 12 deletions README.md
@@ -9,7 +9,7 @@
<img src="https://raw.githubusercontent.com/ARM-software/ComputeLibrary/gh-pages/ACL_logo.png"/><br><br>
</div>

# Compute Library ![](https://img.shields.io/badge/latest_release-24.09-green)
# Compute Library ![](https://img.shields.io/badge/latest_release-24.11-green)


The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPU architectures.<br>
@@ -37,7 +37,7 @@ Key Features:
<br>

## Documentation
[![Documentation](https://img.shields.io/badge/documentation-24.09-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/index.xhtml)
[![Documentation](https://img.shields.io/badge/documentation-24.11-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11/index.xhtml)

> Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc.
@@ -50,22 +50,22 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C

| Platform | Operating System | Release archive (Download) |
| -------------- | ---------------- | -------------------------- |
| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-bin.tar.gz) |
| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) |
| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) |
| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) |
| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-armv7a-cpu-bin.tar.gz) |
| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) |
| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-gpu-bin.tar.gz) |
| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-gpu-bin.tar.gz) |

<br>

| Architecture | Operating System | Release archive (Download) |
| ------------ | ---------------- | -------------------------- |
| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-gpu-bin.tar.gz) |
| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-android-aarch64-cpu-gpu-bin.tar.gz) |
| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) |
| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-armv7a-cpu-gpu-bin.tar.gz) |
| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-android-aarch64-cpu-gpu-bin.tar.gz) |
| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-gpu-bin.tar.gz) |

<br>

Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.09-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.09)
Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.11-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.11)

Pre-built binaries are generated with the following security / good coding practice flags:
> -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong
@@ -107,13 +107,13 @@ Pre-build binaries are generated with the following security / good coding pract

## Experimental builds

**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/how_to_build.xhtml) for more details.
**⚠ Important** Bazel and CMake builds are experimental CPU-only builds; please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11/how_to_build.xhtml) for more details.

<br>

## How to contribute

Contributions to the Compute Library are more than welcome. If you are interested on contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/contribution_guidelines.xhtml).
Contributions to the Compute Library are more than welcome. If you are interested in contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11/contribution_guidelines.xhtml).

### Developer Certificate of Origin (DCO)
Before the Compute Library accepts your contribution, you need to certify its origin and give us your permission. To manage this process we use the Developer Certificate of Origin (DCO) V1.1 (https://developercertificate.org/)
10 changes: 3 additions & 7 deletions SConscript
@@ -33,8 +33,8 @@ import codecs
import platform
import SCons

VERSION = "v24.09"
LIBRARY_VERSION_MAJOR = 42
VERSION = "v24.11"
LIBRARY_VERSION_MAJOR = 43
LIBRARY_VERSION_MINOR = 0
LIBRARY_VERSION_PATCH = 0
SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH)
@@ -627,12 +627,8 @@ custom_operators = []
custom_types = []
custom_layouts = []

use_custom_ops = env['high_priority'] or env['build_config']
use_custom_ops = env['build_config']

if env['high_priority']:
custom_operators = filelist['high_priority']
custom_types = ['all']
custom_layouts = ['all']

if env['build_config']:
custom_operators, custom_types, custom_layouts = read_build_config_json(env['build_config'])
21 changes: 6 additions & 15 deletions SConstruct
@@ -116,7 +116,6 @@ vars.AddVariables(
PathVariable("build_dir", "Specify sub-folder for the build", ".", PathVariable.PathAccept),
PathVariable("install_dir", "Specify sub-folder for the install", "", PathVariable.PathAccept),
BoolVariable("exceptions", "Enable/disable C++ exception support", True),
BoolVariable("high_priority", "Generate a library containing only the high priority operators", False),
PathVariable("linker_script", "Use an external linker script", "", PathVariable.PathAccept),
PathVariable("external_tests_dir", """Add examples, benchmarks and tests to the tests suite from an external path. In order to use this option, the external tests directory must have the following structure:
EXTERNAL_TESTS_DIR:
@@ -519,21 +518,11 @@ if not GetOption("help"):
# Thus for backward compatibility, we include this flag only for NDK < r23
env.Append(CXXFLAGS = ['-no-integrated-as'])

if env['high_priority'] and env['build_config']:
print("The high priority library cannot be built in conjunction with a user-specified build configuration")
Exit(1)

if not env['high_priority'] and not env['build_config']:
env.Append(CPPDEFINES = ['ARM_COMPUTE_GRAPH_ENABLED'])

data_types = []
data_layouts = []

# Set correct data types / layouts to build
if env['high_priority']:
data_types = ['all']
data_layouts = ['all']
elif env['build_config']:
if env['build_config']:
data_types, data_layouts = read_build_config_json(env['build_config'])
else:
data_types = env['data_type_support']
@@ -613,7 +602,9 @@ else:
env.Append(CXXFLAGS = ['-O3'])
else:
# on windows we use clang-cl which does not support the option -O3
env.Append(CXXFLAGS = ['-O2'])
if not version_at_least(compiler_ver, '17.0.0'):
# Disable optimizations in clang 17 or later because the compiler crashes with -O2
env.Append(CXXFLAGS = ['-O2'])

if env['asserts']:
env.Append(CPPDEFINES = ['ARM_COMPUTE_ASSERTS_ENABLED'])
@@ -653,7 +644,7 @@ Export('version_at_least')

SConscript('./SConscript', variant_dir=build_path, duplicate=0)

if env['examples'] and (env['build_config'] or env['high_priority']):
if env['examples'] and env['build_config']:
print("WARNING: Building examples for selected operators not supported. Use examples=0")
Return()

@@ -664,7 +655,7 @@ if env['examples'] and env['exceptions']:
SConscript('./examples/SConscript', variant_dir='%s/examples' % build_path, duplicate=0)

if env['exceptions']:
if env['build_config'] or env['high_priority']:
if env['build_config']:
print("WARNING: Building tests for selected operators not supported")
Return()
if env['os'] == 'bare_metal' and env['arch'] == 'armv7a':
8 changes: 7 additions & 1 deletion arm_compute/core/CPP/CPPTypes.h
@@ -26,6 +26,7 @@

#include "arm_compute/core/Error.h"

#include <cstdint>
#include <memory>

namespace arm_compute
@@ -180,7 +181,12 @@ class CPUInfo final
*
* @return Vector length if sme2 is enabled, otherwise returns 0.
*/
uint64_t get_sme2_vector_length() const;
uint64_t get_sme2_vector_length_in_bytes() const;
/** Return the vector length in bits for sme2
*
* @return Vector length if sme2 is enabled, otherwise returns 0.
*/
uint64_t get_sme2_vector_length_in_bits() const;

private:
struct Impl;
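The rename above splits the SME2 vector-length query into an explicit bytes variant and a new bits variant. A small illustrative helper, assuming only the two accessors declared above and a CPUInfo obtained elsewhere in the application, shows how the two relate:

```cpp
// Illustration only (not part of this commit): the two getters describe the same streaming
// vector length, so bits == 8 * bytes whenever SME2 is available, and both return 0 otherwise.
#include <cstdint>

#include "arm_compute/core/CPP/CPPTypes.h"

// `cpu` is assumed to be a CPUInfo reference obtained elsewhere (e.g. from the active scheduler).
uint64_t sme2_vector_elements_f32(const arm_compute::CPUInfo &cpu)
{
    const uint64_t svl_bytes = cpu.get_sme2_vector_length_in_bytes();
    const uint64_t svl_bits  = cpu.get_sme2_vector_length_in_bits(); // expected: 8 * svl_bytes
    (void)svl_bits;
    // Number of FP32 lanes in one SME2 vector, or 0 if SME2 is not enabled.
    return svl_bytes / sizeof(float);
}
```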
78 changes: 76 additions & 2 deletions arm_compute/core/QuantizationInfo.h
@@ -63,6 +63,31 @@ struct UniformQuantizationInfo
int32_t offset;
};

/** Requantization info when assuming per-layer quantization */
struct UniformRequantizationInfo
{
/** Default constructor */
UniformRequantizationInfo() : scale(0.f), offset(0.f)
{
}
/** Constructor
*
* @param[in] scale Quantization scale
* @param[in] offset Quantization offset
*/
UniformRequantizationInfo(float scale, float offset) : scale(scale), offset(offset)
{
}
/** Checks if the scale and offset are both zero */
bool empty() const
{
return (scale == 0) && (offset == 0);
}

float scale;
float offset;
};

/** Quantization information */
class QuantizationInfo
{
@@ -232,6 +257,13 @@ struct Qasymm8QuantizationHelper
return static_cast<QUANTIZED_TYPE>(arm_compute::utility::clamp<decltype(quantized), QUANTIZED_TYPE>(quantized));
}

static inline QUANTIZED_TYPE quantize(float value, const UniformRequantizationInfo &qinfo)
{
ARM_COMPUTE_ERROR_ON(qinfo.scale == 0);
const int quantized = support::cpp11::lround(value / qinfo.scale + qinfo.offset);
return static_cast<QUANTIZED_TYPE>(arm_compute::utility::clamp<decltype(quantized), QUANTIZED_TYPE>(quantized));
}

/** Quantize a value given a 8-bit asymmetric quantization scheme using a specific rounding policy
*
* @param[in] value Value to quantize
@@ -253,6 +285,21 @@
return static_cast<QUANTIZED_TYPE>(arm_compute::utility::clamp<decltype(quantized), QUANTIZED_TYPE>(quantized));
}

static inline QUANTIZED_TYPE
quantize(float value, const UniformRequantizationInfo &qinfo, RoundingPolicy rounding_policy)
{
if (rounding_policy == RoundingPolicy::TO_NEAREST_UP)
{
return quantize(value, qinfo);
}

ARM_COMPUTE_ERROR_ON(qinfo.scale == 0);

// We round after adding the offset, because the offset is also float
const int quantized = arm_compute::round(value / qinfo.scale + qinfo.offset, rounding_policy);
return static_cast<QUANTIZED_TYPE>(arm_compute::utility::clamp<decltype(quantized), QUANTIZED_TYPE>(quantized));
}

/** Quantize a value given a 8-bit asymmetric quantization scheme
*
* @param[in] value Value to quantize
@@ -588,7 +635,11 @@ inline float dequantize_s32(int32_t value, const QuantizationInfo &qinfo)
return dequantize_s32(value, qinfo.uniform());
}

/*
/** Compute the requantization offset and scale
*
* @deprecated because requantization using integer offsets creates rounding issues.
* Please use @ref arm_compute::compute_requantization_scale_float_offset() instead.
*
* In case of requantization of a quantized input tensor to an output tensor with another quantization
* instead of applying a dequantization and then a quantization function, we just compute a new scale and
* offset.
@@ -628,9 +679,32 @@ inline UniformQuantizationInfo compute_requantization_scale_offset(const Uniform
// In order to minimize flooring we convert the offset to a float,
// then compute the new offset in the float domain,
// finally we convert it back as int32_t
offset_to_apply -= static_cast<int32_t>(static_cast<float>(uqinfo_in.offset) * uqinfo_in.scale / uqinfo_out.scale);

#ifdef __aarch64__
constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
#else //__aarch64__
constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP;
#endif //__aarch64__

offset_to_apply -=
arm_compute::round(static_cast<float>(uqinfo_in.offset) * uqinfo_in.scale / uqinfo_out.scale, rounding_policy);
return UniformQuantizationInfo(scale_to_apply, offset_to_apply);
}

/** Similar to @ref arm_compute::compute_requantization_scale_offset()
* but returning offset as float instead of integer
*/
inline UniformRequantizationInfo compute_requantization_scale_float_offset(const UniformQuantizationInfo &uqinfo_in,
const UniformQuantizationInfo &uqinfo_out)
{
float scale_to_apply = uqinfo_out.scale;
float offset_to_apply = static_cast<float>(uqinfo_out.offset);

scale_to_apply /= uqinfo_in.scale;
offset_to_apply -= static_cast<float>(uqinfo_in.offset) * uqinfo_in.scale / uqinfo_out.scale;

return UniformRequantizationInfo(scale_to_apply, offset_to_apply);
}

} // namespace arm_compute
#endif // ACL_ARM_COMPUTE_CORE_QUANTIZATIONINFO_H
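The float-offset requantization added above boils down to two formulas — `scale = out.scale / in.scale` and `offset = out.offset - in.offset * in.scale / out.scale` — with rounding deferred to the quantize step. The following standalone sketch (plain C++ with local stand-in structs rather than the ACL headers) walks one value through that path and checks it against a dequantize-then-quantize reference:

```cpp
// Standalone sketch of the arithmetic above; the struct and function names are local
// stand-ins, not the library's. Requantizes a QASYMM8 value from one (scale, offset)
// pair to another using the float-offset path, rounding only once at the end.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

struct UniformQuant   { float scale; int32_t offset; }; // mirrors UniformQuantizationInfo
struct UniformRequant { float scale; float   offset; }; // mirrors UniformRequantizationInfo

UniformRequant requant_scale_float_offset(UniformQuant in, UniformQuant out)
{
    // scale  = out.scale / in.scale
    // offset = out.offset - in.offset * in.scale / out.scale   (kept as float, no rounding yet)
    return {out.scale / in.scale,
            static_cast<float>(out.offset) - static_cast<float>(in.offset) * in.scale / out.scale};
}

uint8_t quantize_qasymm8(float value, UniformRequant q)
{
    // Rounding happens once, after the float offset has been added.
    const int quantized = static_cast<int>(std::lround(value / q.scale + q.offset));
    return static_cast<uint8_t>(std::clamp(quantized, 0, 255));
}

int main()
{
    const UniformQuant in{0.5f, 10};   // input tensor quantization
    const UniformQuant out{0.25f, 3};  // output tensor quantization
    const UniformRequant rq = requant_scale_float_offset(in, out); // scale 0.5, offset -17

    const uint8_t q_in  = 42;                                      // represents (42 - 10) * 0.5 = 16.0
    const uint8_t q_out = quantize_qasymm8(static_cast<float>(q_in), rq);

    // Reference: dequantize, then quantize with the output parameters.
    const float   real = (q_in - in.offset) * in.scale;
    const uint8_t ref  = quantize_qasymm8(real, {out.scale, static_cast<float>(out.offset)});

    std::printf("requantized: %u, reference: %u\n", q_out, ref); // both print 67
    return 0;
}
```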
3 changes: 3 additions & 0 deletions arm_compute/core/TensorInfo.h
@@ -327,6 +327,9 @@ class TensorInfo final : public ITensorInfo

private:
/** Calculates strides, offset and total size resulting from the specified padding around the XY plane.
*
* @note When interpreting the required_strides in the return value, only the entries up to the corresponding dimension of the tensor are
* valid. For example, a 1D tensor should only refer to the first entry of required_strides, a 2D tensor to the first two entries, and so on.
*
* @param[in] padding Padding around the XY plane in elements.
*/