Add gatherf_byte_inds for gathers using byte indices from memory (#511)
Adds a new function to wrap gathers using byte indices from 
memory, avoiding the byte-to-int conversion for ISAs that don't
have native gathers. 

Adds a new build option ASTCENC_X86_GATHERS (default ON) that allows
builds to disable the use of native gathers on x86, as they are much
slower than the scalar fallbacks on some microarchitectures
(AMD Zen, pre-Skylake Intel).

Co-authored-by: Fabian Giesen <fabian.giesen@epicgames.com>
Co-authored-by: Pete Harris <peter.harris@arm.com>
3 people authored Nov 4, 2024
1 parent 521179c commit 546f9dd
Showing 13 changed files with 144 additions and 36 deletions.
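
As an editorial illustration (not part of the commit), the sketch below models the contract of the new `gatherf_byte_inds` wrapper in plain scalar C++: gather floats using byte indices read directly from memory, with no byte-to-int widening step. The real per-ISA specializations appear in the vecmathlib diffs below; the name `gather4_byte_inds` here is a hypothetical stand-in.

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

// Editorial sketch only: a scalar model of the wrapper this commit adds.
// The real gatherf_byte_inds<T>() is specialized per ISA in the diffs below;
// this stand-in just shows the contract: gather floats from 'base' using
// byte indices read straight from memory, with no byte-to-int vector
// widening step.
static std::array<float, 4> gather4_byte_inds(const float* base, const std::uint8_t* indices)
{
    return { base[indices[0]], base[indices[1]],
             base[indices[2]], base[indices[3]] };
}

int main()
{
    const float table[8] { 0.0f, 1.5f, 3.0f, 4.5f, 6.0f, 7.5f, 9.0f, 10.5f };
    const std::uint8_t idxs[4] { 7, 0, 3, 3 };

    const std::array<float, 4> v = gather4_byte_inds(table, idxs);
    std::printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);
    return 0;
}
```
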
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -51,6 +51,7 @@ option(ASTCENC_UBSAN "Enable astcenc builds with undefined behavior sanitizer")
option(ASTCENC_UNITTEST "Enable astcenc builds with unit tests")
option(ASTCENC_INVARIANCE "Enable astcenc floating point invariance" ON)
option(ASTCENC_CLI "Enable build of astcenc command line tools" ON)
option(ASTCENC_X86_GATHERS "Enable use of native x86 gathers" ON)

# Preflight for some macOS-specific build options
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
@@ -127,6 +128,7 @@ message(STATUS "x86-64 backend options")
printopt("AVX2 backend " ${ASTCENC_ISA_AVX2})
printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41})
printopt("SSE2 backend " ${ASTCENC_ISA_SSE2})
printopt("Use native gathers " ${ASTCENC_X86_GATHERS})
message(STATUS "Agnostic backend options")
printopt("NONE backend " ${ASTCENC_ISA_NONE})
printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE})
11 changes: 11 additions & 0 deletions Docs/Building.md
@@ -203,6 +203,17 @@ To enable this binary variant add `-DASTCENC_ISA_NONE=ON` to the CMake command
line when configuring. It is NOT recommended to use this for production; it is
significantly slower than the vectorized SIMD builds.

### No x86 gather instruction builds

On many x86 microarchitectures the native AVX gather instructions are slower
than simply performing manual scalar loads and combining the results. Gathers
are enabled by default, but can be disabled by setting the CMake option
`-DASTCENC_X86_GATHERS=OFF` on the command line when configuring.

Note that we have seen mixed results when compiling the scalar fallback path,
so we would recommend testing which option works best for the compiler and
microarchitecture pairing that you are targeting.

### Test builds

We support building unit tests. These use the `googletest` framework, which is
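
As an editorial illustration of the new documentation above (not part of the commit), a configure invocation that disables native gathers might look like the following; the `build` directory name and Release configuration are assumptions.

```sh
# Hypothetical configure step: AVX2 backend with native x86 gathers disabled.
cmake -B build -DCMAKE_BUILD_TYPE=Release \
    -DASTCENC_ISA_AVX2=ON -DASTCENC_X86_GATHERS=OFF
cmake --build build
```
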
18 changes: 9 additions & 9 deletions Source/astcenc_averages_and_directions.cpp
@@ -778,12 +778,12 @@ void compute_error_squared_rgba(
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vmask mask = lane_ids < vint(texel_count);
vint texel_idxs(texel_indexes + i);
const uint8_t* texel_idxs = texel_indexes + i;

vfloat data_r = gatherf(blk.data_r, texel_idxs);
vfloat data_g = gatherf(blk.data_g, texel_idxs);
vfloat data_b = gatherf(blk.data_b, texel_idxs);
vfloat data_a = gatherf(blk.data_a, texel_idxs);
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, texel_idxs);

vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
@@ -892,11 +892,11 @@ void compute_error_squared_rgb(
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vmask mask = lane_ids < vint(texel_count);
vint texel_idxs(texel_indexes + i);
const uint8_t* texel_idxs = texel_indexes + i;

vfloat data_r = gatherf(blk.data_r, texel_idxs);
vfloat data_g = gatherf(blk.data_g, texel_idxs);
vfloat data_b = gatherf(blk.data_b, texel_idxs);
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);

vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
38 changes: 19 additions & 19 deletions Source/astcenc_ideal_endpoints_and_weights.cpp
@@ -41,16 +41,16 @@ static vfloat bilinear_infill_vla(
unsigned int index
) {
// Load the bilinear filter texel weight indexes in the decimated grid
vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
vint weight_idx2 = vint(di.texel_weights_tr[2] + index);
vint weight_idx3 = vint(di.texel_weights_tr[3] + index);
const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
const uint8_t* weight_idx2 = di.texel_weights_tr[2] + index;
const uint8_t* weight_idx3 = di.texel_weights_tr[3] + index;

// Load the bilinear filter weights from the decimated grid
vfloat weight_val0 = gatherf(weights, weight_idx0);
vfloat weight_val1 = gatherf(weights, weight_idx1);
vfloat weight_val2 = gatherf(weights, weight_idx2);
vfloat weight_val3 = gatherf(weights, weight_idx3);
vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
vfloat weight_val2 = gatherf_byte_inds<vfloat>(weights, weight_idx2);
vfloat weight_val3 = gatherf_byte_inds<vfloat>(weights, weight_idx3);

// Load the weight contribution factors for each decimated weight
vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
@@ -81,12 +81,12 @@ static vfloat bilinear_infill_vla_2(
unsigned int index
) {
// Load the bilinear filter texel weight indexes in the decimated grid
vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;

// Load the bilinear filter weights from the decimated grid
vfloat weight_val0 = gatherf(weights, weight_idx0);
vfloat weight_val1 = gatherf(weights, weight_idx1);
vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);

// Load the weight contribution factors for each decimated weight
vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
@@ -894,18 +894,18 @@ void compute_ideal_weights_for_decimation(

for (unsigned int j = 0; j < max_texel_count; j++)
{
vint texel(di.weight_texels_tr[j] + i);
const uint8_t* texel = di.weight_texels_tr[j] + i;
vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);

if (!constant_wes)
{
weight_error_scale = gatherf(ei.weight_error_scale, texel);
weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
}

vfloat contrib_weight = weight * weight_error_scale;

weight_weight += contrib_weight;
initial_weight += gatherf(ei.weights, texel) * contrib_weight;
initial_weight += gatherf_byte_inds<vfloat>(ei.weights, texel) * contrib_weight;
}

storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
@@ -952,17 +952,17 @@ void compute_ideal_weights_for_decimation(

for (unsigned int j = 0; j < max_texel_count; j++)
{
vint texel(di.weight_texels_tr[j] + i);
const uint8_t* texel = di.weight_texels_tr[j] + i;
vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);

if (!constant_wes)
{
weight_error_scale = gatherf(ei.weight_error_scale, texel);
weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
}

vfloat scale = weight_error_scale * contrib_weight;
vfloat old_weight = gatherf(infilled_weights, texel);
vfloat ideal_weight = gatherf(ei.weights, texel);
vfloat old_weight = gatherf_byte_inds<vfloat>(infilled_weights, texel);
vfloat ideal_weight = gatherf_byte_inds<vfloat>(ei.weights, texel);

error_change0 += contrib_weight * scale;
error_change1 += (old_weight - ideal_weight) * scale;
2 changes: 2 additions & 0 deletions Source/astcenc_mathlib.h
@@ -58,8 +58,10 @@
#ifndef ASTCENC_AVX
#if defined(__AVX2__)
#define ASTCENC_AVX 2
#define ASTCENC_X86_GATHERS 1
#elif defined(__AVX__)
#define ASTCENC_AVX 1
#define ASTCENC_X86_GATHERS 1
#else
#define ASTCENC_AVX 0
#endif
10 changes: 5 additions & 5 deletions Source/astcenc_pick_best_endpoint_format.cpp
@@ -123,21 +123,21 @@ static void compute_error_squared_rgb_single_partition(
vint lane_ids = vint::lane_id();
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint tix(texel_indexes + i);
const uint8_t* tix = texel_indexes + i;

vmask mask = lane_ids < vint(texel_count);
lane_ids += vint(ASTCENC_SIMD_WIDTH);

// Compute the error that arises from just ditching alpha
vfloat data_a = gatherf(blk.data_a, tix);
vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, tix);
vfloat alpha_diff = data_a - default_a;
alpha_diff = alpha_diff * alpha_diff;

haccumulate(a_drop_errv, alpha_diff, mask);

vfloat data_r = gatherf(blk.data_r, tix);
vfloat data_g = gatherf(blk.data_g, tix);
vfloat data_b = gatherf(blk.data_b, tix);
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, tix);
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, tix);
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, tix);

// Compute uncorrelated error
vfloat param = data_r * uncor_bs0
2 changes: 2 additions & 0 deletions Source/astcenc_vecmathlib.h
@@ -77,6 +77,8 @@
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#endif

template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);

#if ASTCENC_AVX >= 2
// If we have AVX2 expose 8-wide VLA.
#include "astcenc_vecmathlib_sse_4.h"
27 changes: 27 additions & 0 deletions Source/astcenc_vecmathlib_avx2_8.h
@@ -903,6 +903,33 @@ ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
}

/**
* @brief Load a vector of gathered results from an array using byte indices from memory
*/
template<>
ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
{
#if ASTCENC_X86_GATHERS == 0
// Perform manual gather using scalar loads in two separate dependency chains,
// then merge late. MSVC translates this 1:1, which is OK. Clang turns it
// into a bunch of memory-operand inserts on 128-bit halves then merges late,
// which performs significantly worse in tests.
__m256 m0 = _mm256_broadcast_ss(base + indices[0]);
__m256 m1 = _mm256_broadcast_ss(base + indices[1]);
m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[2]), 1 << 2);
m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[3]), 1 << 3);
m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[4]), 1 << 4);
m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[5]), 1 << 5);
m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[6]), 1 << 6);
m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[7]), 1 << 7);

return vfloat8(_mm256_blend_ps(m0, m1, 0xaa));
#else
vint8 inds(indices);
return gatherf(base, inds);
#endif
}

/**
* @brief Store a vector to an unaligned memory address.
*/
19 changes: 19 additions & 0 deletions Source/astcenc_vecmathlib_neon_4.h
@@ -828,6 +828,25 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
#endif
}

/**
* @brief Load a vector of gathered results from an array using byte indices from memory
*/
template<>
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
{
#if ASTCENC_SVE == 0
alignas(16) float vals[4];
vals[0] = base[indices[0]];
vals[1] = base[indices[1]];
vals[2] = base[indices[2]];
vals[3] = base[indices[3]];
return vfloat4(vals);
#else
svint32_t offsets = svld1ub_s32(svptrue_pat_b32(SV_VL4), indices);
svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
return vfloat4(svget_neonq_f32(data));
#endif
}

/**
* @brief Store a vector to an unaligned memory address.
*/
12 changes: 12 additions & 0 deletions Source/astcenc_vecmathlib_none_4.h
@@ -943,6 +943,18 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
base[indices.m[3]]);
}

/**
* @brief Load a vector of gathered results from an array using byte indices from memory
*/
template<>
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
{
return vfloat4(base[indices[0]],
base[indices[1]],
base[indices[2]],
base[indices[3]]);
}

/**
* @brief Store a vector to an unaligned memory address.
*/
19 changes: 18 additions & 1 deletion Source/astcenc_vecmathlib_sse_4.h
@@ -900,7 +900,7 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
*/
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
#if ASTCENC_AVX >= 2
#if ASTCENC_AVX >= 2 && ASTCENC_X86_GATHERS != 0
return vfloat4(_mm_i32gather_ps(base, indices.m, 4));
#else
alignas(16) int idx[4];
@@ -909,6 +909,23 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
#endif
}

/**
* @brief Load a vector of gathered results from an array using byte indices from memory
*/
template<>
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
{
// Experimentally, in this particular use case (byte indices in memory),
// using 4 separate scalar loads is appreciably faster than using gathers
// even if they're available, on every x86 uArch tried, so always do the
// separate loads even when ASTCENC_X86_GATHERS is enabled.
//
// Tested on:
// - Intel Skylake-X, Coffee Lake, Crestmont, Redwood Cove
// - AMD Zen 2, Zen 4
return vfloat4(base[indices[0]], base[indices[1]], base[indices[2]], base[indices[3]]);
}

/**
* @brief Store a vector to an unaligned memory address.
*/
10 changes: 10 additions & 0 deletions Source/astcenc_vecmathlib_sve_8.h
@@ -841,6 +841,16 @@ ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, indices.m));
}

/**
* @brief Load a vector of gathered results from an array using byte indices from memory
*/
template<>
ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
{
svint32_t offsets = svld1ub_s32(svptrue_b32(), indices);
return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, offsets));
}

/**
* @brief Store a vector to an unaligned memory address.
*/
Expand Down
10 changes: 8 additions & 2 deletions Source/cmake_core.cmake
@@ -359,7 +359,8 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
ASTCENC_SSE=20
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
ASTCENC_F16C=0
ASTCENC_X86_GATHERS=0)

# Force SSE2 on AppleClang (normally SSE4.1 is the default)
target_compile_options(${ASTCENC_TARGET_NAME}
@@ -377,7 +378,8 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
ASTCENC_SSE=41
ASTCENC_AVX=0
ASTCENC_POPCNT=1
ASTCENC_F16C=0)
ASTCENC_F16C=0
ASTCENC_X86_GATHERS=0)

if (${ASTCENC_VENEER_TYPE} GREATER 0)
# Force SSE2 on AppleClang (normally SSE4.1 is the default)
@@ -395,12 +397,16 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
endif()

elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
# Gathers are quite slow on many x86 microarchitectures, to the point where
# it can be significantly faster to just avoid them and use scalar loads.

target_compile_definitions(${ASTCENC_TARGET_NAME}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=41
ASTCENC_AVX=2
ASTCENC_X86_GATHERS=$<BOOL:${ASTCENC_X86_GATHERS}>
ASTCENC_POPCNT=1
ASTCENC_F16C=1)

