From b57658280e970f79e28870b1a1bf1ad5e1e620e1 Mon Sep 17 00:00:00 2001
From: James Osborn
Date: Thu, 31 Aug 2023 22:02:38 -0500
Subject: [PATCH] code cleanup

---
 include/kernels/block_transpose.cuh                |  5 +-
 include/kernels/coarse_op_kernel.cuh               |  4 --
 include/kernels/color_spinor_pack.cuh              | 16 ++---
 include/kernels/gauge_stout.cuh                    |  4 +-
 include/kernels/gauge_utils.cuh                    |  1 -
 include/kernels/gauge_wilson_flow.cuh              |  4 +-
 include/kernels/hisq_paths_force.cuh               |  1 -
 include/targets/cuda/shared_memory_helper.h        | 21 ++++--
 include/targets/cuda/thread_array.h                |  1 +
 include/targets/generic/helpers.h                  | 18 +++++
 .../targets/generic/shared_memory_cache_helper.h   | 69 ++++++-------------
 include/targets/generic/thread_array.h             | 15 ++--
 include/targets/generic/thread_local_cache.h       | 39 +++++++----
 include/targets/hip/shared_memory_helper.h         | 21 ++++--
 14 files changed, 110 insertions(+), 109 deletions(-)

diff --git a/include/kernels/block_transpose.cuh b/include/kernels/block_transpose.cuh
index 2b41f5c520..3c54345a6c 100644
--- a/include/kernels/block_transpose.cuh
+++ b/include/kernels/block_transpose.cuh
@@ -47,7 +47,7 @@ namespace quda
     constexpr BlockTransposeKernel(const Arg &arg) : arg(arg) { }
     static constexpr const char *filename() { return KERNEL_FILE; }
 
-    struct Dims {
+    struct CacheDims {
       static constexpr dim3 dims(dim3 block)
       {
         block.x += 1;
         block.z = 1;
@@ -68,8 +68,7 @@ namespace quda
       int parity = parity_color / Arg::nColor;
       using color_spinor_t = ColorSpinor<typename Arg::real, 1, Arg::nSpin>;
 
-      //SharedMemoryCache<color_spinor_t> cache({target::block_dim().x + 1, target::block_dim().y, 1});
-      SharedMemoryCache<color_spinor_t, Dims> cache;
+      SharedMemoryCache<color_spinor_t, CacheDims> cache;
       int x_offset = target::block_dim().x * target::block_idx().x;
       int v_offset = target::block_dim().y * target::block_idx().y;

diff --git a/include/kernels/coarse_op_kernel.cuh b/include/kernels/coarse_op_kernel.cuh
index be411eb68c..04c05fda4e 100644
--- a/include/kernels/coarse_op_kernel.cuh
+++ b/include/kernels/coarse_op_kernel.cuh
@@ -1398,8 +1398,6 @@ namespace quda {
       using real = typename Arg::Float;
       using TileType = typename Arg::vuvTileType;
       const int dim_index = arg.dim_index % arg.Y_atomic.geometry;
-      //__shared__ complex X[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin];
-      //__shared__ complex Y[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin];
       Cache cache;
       auto &X = cache.data()[0];
       auto &Y = cache.data()[1];
@@ -1424,7 +1422,6 @@ namespace quda {
         }
       }
 
-      //__syncthreads();
       cache.sync();
 
 #pragma unroll
@@ -1454,7 +1451,6 @@ namespace quda {
         }
       }
 
-      //__syncthreads();
       cache.sync();
 
       if (tx < Arg::coarseSpin*Arg::coarseSpin && (parity == 0 || arg.parity_flip == 1) ) {
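A minimal sketch of the compile-time dims pattern these kernels now use; the PaddedDims name and float4 payload are illustrative, not part of the patch, and only the save()/sync() calls that appear in the kernels below are assumed:

    struct PaddedDims {
      static constexpr dim3 dims(dim3 block)
      {
        block.x += 1; // pad x so row accesses land on distinct banks
        block.z = 1;  // the cache is logically 2-d
        return block;
      }
    };

    __device__ void example(const float4 &v)
    {
      // cache extent derives from PaddedDims::dims(target::block_dim()) at compile time
      SharedMemoryCache<float4, PaddedDims> cache;
      cache.save(v); // store at this thread's coordinates
      cache.sync();
    }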
diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh
index 2cc489181a..a67d86ded2 100644
--- a/include/kernels/color_spinor_pack.cuh
+++ b/include/kernels/color_spinor_pack.cuh
@@ -172,12 +172,12 @@ namespace quda {
   };
 
   template <> struct site_max<true> {
-    template <typename Arg>
-    struct DimsPadX {
+    template <typename Arg> struct CacheDims {
       static constexpr int Ms = spins_per_thread(Arg::nSpin);
       static constexpr int Mc = colors_per_thread(Arg::nColor);
       static constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc);
       static constexpr dim3 dims(dim3 block) {
+        // pad the shared block size to avoid bank conflicts for native ordering
         if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size();
         block.y = color_spin_threads; // state the y block since we know it at compile time
         return block;
@@ -187,16 +187,8 @@ namespace quda {
     template <typename Arg> __device__ inline auto operator()(typename Arg::real thread_max, Arg &)
     {
       using real = typename Arg::real;
-      //constexpr int Ms = spins_per_thread(Arg::nSpin);
-      //constexpr int Mc = colors_per_thread(Arg::nColor);
-      //constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc);
-      constexpr int color_spin_threads = DimsPadX<Arg>::color_spin_threads;
-      //auto block = target::block_dim();
-      // pad the shared block size to avoid bank conflicts for native ordering
-      //if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size();
-      //block.y = color_spin_threads; // state the y block since we know it at compile time
-      //SharedMemoryCache<real> cache(block);
-      SharedMemoryCache<real, DimsPadX<Arg>> cache;
+      constexpr int color_spin_threads = CacheDims<Arg>::color_spin_threads;
+      SharedMemoryCache<real, CacheDims<Arg>> cache;
       cache.save(thread_max);
       cache.sync();
       real this_site_max = static_cast<real>(0);

diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh
index 56bd00f425..4577e66fcd 100644
--- a/include/kernels/gauge_stout.cuh
+++ b/include/kernels/gauge_stout.cuh
@@ -135,8 +135,8 @@ namespace quda
     }
 
     Link U, Q;
-    ThreadLocalCache<Link> Stap{};
-    ThreadLocalCache<Link, 0, decltype(Stap)> Rect{}; // offset by Stap type to ensure non-overlapping allocations
+    ThreadLocalCache<Link> Stap;
+    ThreadLocalCache<Link, 0, decltype(Stap)> Rect; // offset by Stap type to ensure non-overlapping allocations
 
     // This function gets stap = S_{mu,nu} i.e., the staple of length 3,
     // and the 1x2 and 2x1 rectangles of length 5. From the following paper:

diff --git a/include/kernels/gauge_utils.cuh b/include/kernels/gauge_utils.cuh
index 6e91e1ac31..48c7e6c1cc 100644
--- a/include/kernels/gauge_utils.cuh
+++ b/include/kernels/gauge_utils.cuh
@@ -2,7 +2,6 @@
 #include <gauge_field_order.h>
 #include <quda_matrix.h>
 #include <index_helper.cuh>
-#include <thread_local_cache.h>
 
 namespace quda
 {

diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh
index 050295f271..ae28956112 100644
--- a/include/kernels/gauge_wilson_flow.cuh
+++ b/include/kernels/gauge_wilson_flow.cuh
@@ -72,8 +72,8 @@ namespace quda
       // This function gets stap = S_{mu,nu} i.e., the staple of length 3,
       // and the 1x2 and 2x1 rectangles of length 5. From the following paper:
       // https://arxiv.org/abs/0801.1165
-      ThreadLocalCache<Link> Stap{};
-      ThreadLocalCache<Link, 0, decltype(Stap)> Rect{}; // offset by Stap type to ensure non-overlapping allocations
+      ThreadLocalCache<Link> Stap;
+      ThreadLocalCache<Link, 0, decltype(Stap)> Rect; // offset by Stap type to ensure non-overlapping allocations
       computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim);
       Z = arg.coeff1x1 * static_cast<Link>(Stap) + arg.coeff2x1 * static_cast<Link>(Rect);
       break;

diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh
index 84173a9a5a..35ddde688d 100644
--- a/include/kernels/hisq_paths_force.cuh
+++ b/include/kernels/hisq_paths_force.cuh
@@ -538,7 +538,6 @@ namespace quda {
      * The "extra" low point corresponds to the Lepage contribution to the
      * force_mu term.
      *
-     *
      *            sig
      *      F           E
      *      |           |
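The Stap/Rect declarations above rely on allocation chaining: passing decltype(Stap) as the offset type O places Rect after Stap in the dynamic shared-memory block. A sketch of the idea, assuming Link is the matrix type these kernels already operate on:

    __device__ void staple_pair()
    {
      ThreadLocalCache<Link> Stap;                    // begins at byte offset 0
      ThreadLocalCache<Link, 0, decltype(Stap)> Rect; // begins at decltype(Stap)::shared_mem_size(block)
      // Both caches are live at once in non-overlapping regions, with no
      // manual offset arithmetic in the kernel body.
    }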
diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h
index b7dacbb9cf..2da596f6ed 100644
--- a/include/targets/cuda/shared_memory_helper.h
+++ b/include/targets/cuda/shared_memory_helper.h
@@ -14,9 +14,9 @@ namespace quda
   /**
      @brief Class which is used to allocate and access shared memory.
      The shared memory is treated as an array of type T, with the
-     number of elements given by the call to the static member
-     S::size(target::block_dim()).  The offset from the beginning of
-     the total shared memory block is given by the static member
+     number of elements given by a call to the static member
+     S::size(target::block_dim()).  The byte offset from the beginning
+     of the total shared memory block is given by the static member
      O::shared_mem_size(target::block_dim()), or 0 if O is void.
   */
   template <typename T, typename S, typename O = void> class SharedMemory
@@ -26,7 +26,6 @@ namespace quda
 
   private:
     T *data;
-    const unsigned int size; // number of elements of type T
 
     /**
        @brief This is a dummy instantiation for the host compiler
@@ -57,6 +56,9 @@ namespace quda
     }
 
   public:
+    /**
+       @brief Byte offset for this shared memory object.
+    */
     static constexpr unsigned int get_offset(dim3 block)
     {
       unsigned int o = 0;
@@ -64,6 +66,9 @@ namespace quda
       return o;
     }
 
+    /**
+       @brief Shared memory size in bytes.
+    */
     static constexpr unsigned int shared_mem_size(dim3 block)
     {
       return get_offset(block) + S::size(block)*sizeof(T);
@@ -72,10 +77,12 @@ namespace quda
     /**
        @brief Constructor for SharedMemory object.
     */
-    constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))),
-      size(S::size(target::block_dim())) {}
+    constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {}
 
-    constexpr auto smem() const { return *this; }
+    /**
+       @brief Return this SharedMemory object.
+    */
+    constexpr auto sharedMem() const { return *this; }
 
     /**
        @brief Subscripting operator returning a reference to element.

diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h
index f5e965576d..c88a178e2c 100644
--- a/include/targets/cuda/thread_array.h
+++ b/include/targets/cuda/thread_array.h
@@ -7,6 +7,7 @@
 #else
 
 #include <array.h>
+
 namespace quda
 {
   template <typename T, int n> struct thread_array : array<T, n> {};

diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h
index a08cefa84b..f8faf41b44 100644
--- a/include/targets/generic/helpers.h
+++ b/include/targets/generic/helpers.h
@@ -3,21 +3,33 @@
 namespace quda
 {
 
+  /**
+     @brief Element type used for coalesced storage.
+  */
   template <typename T>
   using atom_t = std::conditional_t<sizeof(T) % 16 == 0, int4, std::conditional_t<sizeof(T) % 8 == 0, int2, int>>;
 
+  /**
+     @brief Used to declare an object of fixed size.
+  */
   template <unsigned int N> struct SizeStatic {
     static constexpr unsigned int size(dim3) { return N; }
   };
 
+  /**
+     @brief Used to declare an object of fixed size per thread, N.
+  */
   template <unsigned int N> struct SizePerThread {
     static constexpr unsigned int size(dim3 block) { return N * block.x * block.y * block.z; }
   };
 
+  /**
+     @brief Used to declare an object of fixed size per thread, N, with thread dimensions determined by D.
+  */
   template <typename D, unsigned int N = 1> struct SizeDims {
     static constexpr unsigned int size(dim3 block)
     {
       dim3 dims = D::dims(block);
       return dims.x * dims.y * dims.z * N;
     }
   };
 
+  /**
+     @brief Used to declare an object with dimensions given by the block size.
+  */
   struct DimsBlock {
     static constexpr dim3 dims(dim3 block) { return block; }
   };
 
+  /**
+     @brief Used to declare an object with fixed dimensions.
+  */
   template <unsigned int x, unsigned int y, unsigned int z> struct DimsStatic {
     static constexpr dim3 dims(dim3)
     {
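To make the sizing conventions concrete, a worked example with a hypothetical 64x2x1 block; the values follow directly from the size() definitions above:

    constexpr dim3 block{64, 2, 1};
    static_assert(SizeStatic<128>::size(block) == 128);             // fixed total, independent of block
    static_assert(SizePerThread<2>::size(block) == 2 * 64 * 2 * 1); // two elements per thread
    static_assert(SizeDims<DimsStatic<32, 4, 1>, 2>::size(block) == 32 * 4 * 1 * 2); // dims from D, N per site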
diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h
index 149a7016f3..1866375f76 100644
--- a/include/targets/generic/shared_memory_cache_helper.h
+++ b/include/targets/generic/shared_memory_cache_helper.h
@@ -11,46 +11,38 @@
    sharing data between threads in a thread block.
  */
 
-/**
-   @file shared_memory_cache_helper.h
-   @brief Convenience overloads to allow SharedMemoryCache objects to
-   appear in simple expressions.  The actual implementation of
-   SharedMemoryCache is target specific, and located in e.g.,
-   include/targets/cuda/shared_memory_cache_helper.h, etc.
- */
-
 namespace quda
 {
 
   /**
      @brief Class which wraps around a shared memory cache for type T,
      where each thread in the thread block stores a unique value in
-     the cache which any other thread can access.
+     the cache which any other thread can access.  The data is stored
+     in a coalesced order with element size atom_t<T>.
 
-     This accessor supports both explicit run-time block size and
-     compile-time sizing.
+     The dimensions of the cache are determined by a call to
+     D::dims(target::block_dim()), and D defaults to having dimensions
+     equal to the block dimensions.
 
-     * For run-time block size, the constructor should be initialied
-       with the desired block size.
-
-     * For compile-time block size, no arguments should be passed to
-       the constructor, and then the second and third template
-       parameters correspond to the y and z dimensions of the block,
-       respectively.  The x dimension of the block will be set
-       according the maximum number of threads possible, given these
-       dimensions.
+     A byte offset into the shared memory region can be specified with
+     the type O, and is given by
+     O::shared_mem_size(target::block_dim()) if O is not void.
   */
   template <typename T, typename D = DimsBlock, typename O = void>
   class SharedMemoryCache : SharedMemory<atom_t<T>, SizeDims<D, sizeof(T) / sizeof(atom_t<T>)>, O>
   {
+    using Smem = SharedMemory<atom_t<T>, SizeDims<D, sizeof(T) / sizeof(atom_t<T>)>, O>;
+
   public:
     using value_type = T;
     using dims_type = D;
-    using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one
-    using Smem = SharedMemory<atom_t<T>, SizeDims<D, sizeof(T) / sizeof(atom_t<T>)>, O>;
+    using offset_type = O;
     using Smem::shared_mem_size;
 
   private:
+    const dim3 block;
+    const int stride;
+    using Smem::sharedMem;
     using atom_t = atom_t<T>;
     static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types");
 
     // The number of elements of type atom_t that we break T into for optimal shared-memory access
     static constexpr int n_element = sizeof(T) / sizeof(atom_t);
 
@@ -60,21 +52,13 @@
     // used to avoid instantiation of load functions if unused, in case T is not a valid return type (e.g. C array)
     template <typename dummy = void> using maybeT = std::conditional_t<std::is_same_v<dummy,void>,T,void>;
 
-    const dim3 block;
-    const int stride;
-
-    //constexpr Smem smem() const { return *dynamic_cast<const Smem *>(this); }
-    using Smem::smem;
-    //constexpr Smem smem() const { return Smem::smem(); }
-    //constexpr Smem smem() const { return *Smem::smemp(); }
-
     __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const
     {
       atom_t tmp[n_element];
       memcpy(tmp, (void *)&a, sizeof(T));
       int j = (z * block.y + y) * block.x + x;
 #pragma unroll
-      for (int i = 0; i < n_element; i++) smem()[i * stride + j] = tmp[i];
+      for (int i = 0; i < n_element; i++) sharedMem()[i * stride + j] = tmp[i];
     }
 
     template <typename dummy = void>
@@ -83,7 +67,7 @@
       atom_t tmp[n_element];
       int j = (z * block.y + y) * block.x + x;
 #pragma unroll
-      for (int i = 0; i < n_element; i++) tmp[i] = smem()[i * stride + j];
+      for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[i * stride + j];
       T a;
       memcpy((void *)&a, tmp, sizeof(T));
       return a;
     }
@@ -105,28 +89,20 @@
 
   public:
     /**
-       @brief constructor for SharedMemory cache.  If no arguments are
-       pass, then the dimensions are set according to the templates
-       block_size_y and block_size_z, together with the derived
-       block_size_x.  Otherwise use the block sizes passed into the
-       constructor.
-
-       @param[in] block Block dimensions for the 3-d shared memory object
-       @param[in] thread_offset "Perceived" offset from dynamic shared
-       memory base pointer (used when we have multiple caches in
-       scope).  Need to include block size to actual offset.
+       @brief Constructor for SharedMemoryCache.
     */
     constexpr SharedMemoryCache() : block(D::dims(target::block_dim())), stride(block.x * block.y * block.z)
     {
-      static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizeDims<D>::size(dim3{8,8,8})*sizeof(T));
+      // sanity check
+      static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizeDims<D>::size(dim3{32,16,8})*sizeof(T));
     }
 
     /**
        @brief Grab the raw base address to shared memory.
     */
     __device__ __host__ inline auto data() const
     {
-      return reinterpret_cast<T *>(&smem()[0]);
+      return reinterpret_cast<T *>(&sharedMem()[0]);
     }
 
     /**
@@ -302,11 +278,10 @@
 
   /**
      @brief Uniform helper for exposing type T, whether we are dealing
-     with an instance of T or SharedMemoryCache
+     with an instance of T or SharedMemoryCache<T>
  */
   template <typename T>
-  struct get_type<
-    T, std::enable_if_t<std::is_same_v<T, SharedMemoryCache<typename T::value_type, typename T::dims_type, typename T::offset_type>>>> {
+  struct get_type<T, std::enable_if_t<std::is_same_v<T, SharedMemoryCache<typename T::value_type, typename T::dims_type, typename T::offset_type>>>> {
     using type = typename T::value_type;
   };
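A device-side usage sketch of the refactored cache (the neighbor exchange is illustrative; save/load/sync are the members defined above, with load assumed addressable by block coordinates):

    template <typename real>
    __device__ real exchange_x(real v)
    {
      SharedMemoryCache<real> cache; // D defaults to DimsBlock, O defaults to void
      cache.save(v);                 // each thread publishes its value
      cache.sync();                  // block-wide barrier
      int x = (target::thread_idx().x + 1) % target::block_dim().x;
      return cache.load(x, target::thread_idx().y, target::thread_idx().z);
    }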
diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h
index 583468c036..5325e5ab8f 100644
--- a/include/targets/generic/thread_array.h
+++ b/include/targets/generic/thread_array.h
@@ -8,29 +8,28 @@ namespace quda
 {
 
   /**
-     @brief Class that provides indexable per-thread storage.  On CUDA
-     this maps to using assigning each thread a unique window of
-     shared memory using the SharedMemoryCache object.
+     @brief Class that provides indexable per-thread storage for n
+     elements of type T.  This version uses shared memory for storage.
+     The offset into the shared memory region is determined from the
+     type O.
   */
   template <typename T, int n, typename O = void> class thread_array : SharedMemory<array<T, n>, SizePerThread<1>, O>
   {
     using Smem = SharedMemory<array<T, n>, SizePerThread<1>, O>;
+    using Smem::sharedMem;
 
     array<T, n> &array_;
 
-    //constexpr Smem smem() const { return *dynamic_cast<const Smem *>(this); }
-    using Smem::smem;
-
   public:
     __device__ __host__ constexpr thread_array() :
-      array_(smem()[target::thread_idx_linear<3>()])
+      array_(sharedMem()[target::thread_idx_linear<3>()])
     {
       array_ = array<T, n>(); // call default constructor
     }
 
     template <typename... Ts>
     __device__ __host__ constexpr thread_array(T first, const Ts... other) :
-      array_(smem()[target::thread_idx_linear<3>()])
+      array_(sharedMem()[target::thread_idx_linear<3>()])
     {
       array_ = array<T, n> {first, other...};
     }
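A brief usage sketch (the element count of 4 is arbitrary):

    __device__ int scratch_sum()
    {
      thread_array<int, 4> a; // this thread's array<int, 4> slot in shared memory
      for (int i = 0; i < 4; i++) a[i] = i * i;
      return a[0] + a[3]; // private per-thread data, so no synchronization is needed
    }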
diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h
index 545a365e25..277b16903c 100644
--- a/include/targets/generic/thread_local_cache.h
+++ b/include/targets/generic/thread_local_cache.h
@@ -15,39 +15,38 @@ namespace quda
 {
 
   /**
-     @brief Class for threads to store a unique value, or array of values, which can use
-     shared memory for optimization purposes.
+     @brief Class for threads to store a unique value (for N_ == 0),
+     or array of values (for N_ > 0), which can use shared memory for
+     optimization purposes.
  */
   template <typename T, int N_ = 0, typename O = void>
   class ThreadLocalCache : SharedMemory<atom_t<T>, SizePerThread<std::max(1, N_) * (sizeof(T) / sizeof(atom_t<T>))>, O>
   {
+    using Smem = SharedMemory<atom_t<T>, SizePerThread<std::max(1, N_) * (sizeof(T) / sizeof(atom_t<T>))>, O>;
+
   public:
     using value_type = T;
     static constexpr int N = N_; // size of array, 0 means to behave like T instead of array
     using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one
     static constexpr int len = std::max(1,N); // actual number of elements to store
-    using Smem = SharedMemory<atom_t<T>, SizePerThread<std::max(1, N_) * (sizeof(T) / sizeof(atom_t<T>))>, O>;
     using Smem::shared_mem_size;
 
   private:
+    const int stride;
+    using Smem::sharedMem;
     using atom_t = atom_t<T>;
     static_assert(sizeof(T) % 4 == 0, "Thread local cache does not support sub-word size types");
 
     // The number of elements of type atom_t that we break T into for optimal shared-memory access
     static constexpr int n_element = sizeof(T) / sizeof(atom_t);
 
-    const int stride;
-
-    //constexpr Smem smem() const { return *dynamic_cast<const Smem *>(this); }
-    using Smem::smem;
-
     __device__ __host__ inline void save_detail(const T &a, const int k) const
     {
       atom_t tmp[n_element];
       memcpy(tmp, (void *)&a, sizeof(T));
       int j = target::thread_idx_linear<3>();
 #pragma unroll
-      for (int i = 0; i < n_element; i++) smem()[(k*n_element + i) * stride + j] = tmp[i];
+      for (int i = 0; i < n_element; i++) sharedMem()[(k*n_element + i) * stride + j] = tmp[i];
     }
 
     __device__ __host__ inline T load_detail(const int k) const
@@ -55,7 +54,7 @@ namespace quda
       atom_t tmp[n_element];
       int j = target::thread_idx_linear<3>();
 #pragma unroll
-      for (int i = 0; i < n_element; i++) tmp[i] = smem()[(k*n_element + i) * stride + j];
+      for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[(k*n_element + i) * stride + j];
       T a;
       memcpy((void *)&a, tmp, sizeof(T));
       return a;
@@ -66,7 +65,8 @@ namespace quda
       @brief Constructor for ThreadLocalCache.
     */
     constexpr ThreadLocalCache() : stride(target::block_size<3>())
     {
-      static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizePerThread<len>::size(dim3{8,8,8})*sizeof(T));
+      // sanity check
+      static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizePerThread<len>::size(dim3{32,16,8})*sizeof(T));
     }
 
@@ -83,7 +83,10 @@ namespace quda
       @param[in] a The value to store in the thread local cache
       @param[in] k The index to use
     */
-    __device__ __host__ inline void save(const T &a, const int k) const { save_detail(a, k); }
+    __device__ __host__ inline void save(const T &a, const int k) const {
+      static_assert(N > 0);
+      save_detail(a, k);
+    }
 
     /**
       @brief Load a value from the thread local cache.  Used when N==0 so cache acts like single object.
@@ -99,7 +102,10 @@ namespace quda
       @param[in] k The index to use
       @return The value stored in the thread local cache at that index
     */
-    __device__ __host__ inline T load(const int k) const { return load_detail(k); }
+    __device__ __host__ inline T load(const int k) const {
+      static_assert(N > 0);
+      return load_detail(k);
+    }
 
     /**
       @brief Cast operator to allow cache objects to be used where T is expected (when N==0).
     */
@@ -123,7 +129,10 @@ namespace quda
       @param[in] i The index to use
       @return The value stored in the thread local cache at that index
     */
-    __device__ __host__ T operator[](int i) { return load(i); }
+    __device__ __host__ T operator[](int i) {
+      static_assert(N > 0);
+      return load(i);
+    }
   };
 
   template <typename T, int N, typename O>
   __device__ __host__ inline T operator+(const ThreadLocalCache<T, N, O> &a, const T &b)
@@ -165,7 +174,7 @@ namespace quda
 
   /**
      @brief Uniform helper for exposing type T, whether we are dealing
-     with an instance of T or ThreadLocalCache
+     with an instance of T or ThreadLocalCache<T, N, O>
  */
   template <typename T> struct get_type<T, std::enable_if_t<std::is_same_v<T, ThreadLocalCache<typename T::value_type, T::N, typename T::offset_type>>>> {
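A usage sketch for the N > 0 case that the new static_asserts guard (Link-valued entries as in the staple kernels above; the product is illustrative):

    __device__ Link two_links(const Link &u, const Link &v)
    {
      ThreadLocalCache<Link, 2> c; // two entries per thread, atom-interleaved at stride
      c.save(u, 0);                // indexed save/load require N > 0
      c.save(v, 1);
      return c[0] * c[1];          // operator[] loads each entry by value
    }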
@@ -123,7 +129,10 @@ namespace quda @param[in] i The index to use @return The value stored in the thread local cache at that index */ - __device__ __host__ T operator[](int i) { return load(i); } + __device__ __host__ T operator[](int i) { + static_assert(N > 0); + return load(i); + } }; template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) @@ -165,7 +174,7 @@ namespace quda /** @brief Uniform helper for exposing type T, whether we are dealing - with an instance of T or ThreadLocalCache + with an instance of T or ThreadLocalCache */ template struct get_type>>> { diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h index b7dacbb9cf..2da596f6ed 100644 --- a/include/targets/hip/shared_memory_helper.h +++ b/include/targets/hip/shared_memory_helper.h @@ -14,9 +14,9 @@ namespace quda /** @brief Class which is used to allocate and access shared memory. The shared memory is treated as an array of type T, with the - number of elements given by the call to the static member - S::size(target::block_dim()). The offset from the beginning of - the total shared memory block is given by the static member + number of elements given by a call to the static member + S::size(target::block_dim()). The byte offset from the beginning + of the total shared memory block is given by the static member O::shared_mem_size(target::block_dim()), or 0 if O is void. */ template class SharedMemory @@ -26,7 +26,6 @@ namespace quda private: T *data; - const unsigned int size; // number of elements of type T /** @brief This is a dummy instantiation for the host compiler @@ -57,6 +56,9 @@ namespace quda } public: + /** + @brief Byte offset for this shared memory object. + */ static constexpr unsigned int get_offset(dim3 block) { unsigned int o = 0; @@ -64,6 +66,9 @@ namespace quda return o; } + /** + @brief Shared memory size in bytes. + */ static constexpr unsigned int shared_mem_size(dim3 block) { return get_offset(block) + S::size(block)*sizeof(T); @@ -72,10 +77,12 @@ namespace quda /** @brief Constructor for SharedMemory object. */ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), - size(S::size(target::block_dim())) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {} - constexpr auto smem() const { return *this; } + /** + @brief Return this SharedMemory object. + */ + constexpr auto sharedMem() const { return *this; } /** @brief Subscripting operator returning a reference to element.