From b57658280e970f79e28870b1a1bf1ad5e1e620e1 Mon Sep 17 00:00:00 2001
From: James Osborn
Date: Thu, 31 Aug 2023 22:02:38 -0500
Subject: [PATCH] code cleanup

---
 include/kernels/block_transpose.cuh                |  5 +-
 include/kernels/coarse_op_kernel.cuh               |  4 --
 include/kernels/color_spinor_pack.cuh              | 16 ++---
 include/kernels/gauge_stout.cuh                    |  4 +-
 include/kernels/gauge_utils.cuh                    |  1 -
 include/kernels/gauge_wilson_flow.cuh              |  4 +-
 include/kernels/hisq_paths_force.cuh               |  1 -
 include/targets/cuda/shared_memory_helper.h        | 21 ++++--
 include/targets/cuda/thread_array.h                |  1 +
 include/targets/generic/helpers.h                  | 18 +++++
 .../targets/generic/shared_memory_cache_helper.h   | 69 ++++++-------------
 include/targets/generic/thread_array.h             | 15 ++--
 include/targets/generic/thread_local_cache.h       | 39 +++++++----
 include/targets/hip/shared_memory_helper.h         | 21 ++++--
 14 files changed, 110 insertions(+), 109 deletions(-)

diff --git a/include/kernels/block_transpose.cuh b/include/kernels/block_transpose.cuh
index 2b41f5c520..3c54345a6c 100644
--- a/include/kernels/block_transpose.cuh
+++ b/include/kernels/block_transpose.cuh
@@ -47,7 +47,7 @@ namespace quda
     constexpr BlockTransposeKernel(const Arg &arg) : arg(arg) { }
     static constexpr const char *filename() { return KERNEL_FILE; }
 
-    struct Dims {
+    struct CacheDims {
       static constexpr dim3 dims(dim3 block)
       {
         block.x += 1;
         block.z = 1;
@@ -68,8 +68,7 @@ namespace quda
       int parity = parity_color / Arg::nColor;
       using color_spinor_t = ColorSpinor<typename Arg::real, 1, Arg::nSpin>;
 
-      //SharedMemoryCache<color_spinor_t> cache({target::block_dim().x + 1, target::block_dim().y, 1});
-      SharedMemoryCache<color_spinor_t, Dims> cache;
+      SharedMemoryCache<color_spinor_t, CacheDims> cache;
       int x_offset = target::block_dim().x * target::block_idx().x;
       int v_offset = target::block_dim().y * target::block_idx().y;

diff --git a/include/kernels/coarse_op_kernel.cuh b/include/kernels/coarse_op_kernel.cuh
index be411eb68c..04c05fda4e 100644
--- a/include/kernels/coarse_op_kernel.cuh
+++ b/include/kernels/coarse_op_kernel.cuh
@@ -1398,8 +1398,6 @@ namespace quda {
       using real = typename Arg::Float;
       using TileType = typename Arg::vuvTileType;
       const int dim_index = arg.dim_index % arg.Y_atomic.geometry;
-      //__shared__ complex X[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin];
-      //__shared__ complex Y[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin];
       Cache cache;
       auto &X = cache.data()[0];
       auto &Y = cache.data()[1];
@@ -1424,7 +1422,6 @@ namespace quda {
         }
       }
 
-      //__syncthreads();
       cache.sync();
 
 #pragma unroll
@@ -1454,7 +1451,6 @@ namespace quda {
         }
       }
 
-      //__syncthreads();
       cache.sync();
 
       if (tx < Arg::coarseSpin*Arg::coarseSpin && (parity == 0 || arg.parity_flip == 1) ) {
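A minimal sketch of the compile-time dims pattern these kernels now use; the PaddedDims name and float4 payload are illustrative, not part of the patch, and only the save()/sync() calls that appear in the kernels below are assumed:

    struct PaddedDims {
      static constexpr dim3 dims(dim3 block)
      {
        block.x += 1; // pad x so row accesses land on distinct banks
        block.z = 1;  // the cache is logically 2-d
        return block;
      }
    };

    __device__ void example(const float4 &v)
    {
      // cache extent derives from PaddedDims::dims(target::block_dim()) at compile time
      SharedMemoryCache<float4, PaddedDims> cache;
      cache.save(v); // store at this thread's coordinates
      cache.sync();
    }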
diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh
index 2cc489181a..a67d86ded2 100644
--- a/include/kernels/color_spinor_pack.cuh
+++ b/include/kernels/color_spinor_pack.cuh
@@ -172,12 +172,12 @@ namespace quda {
   };
 
   template <> struct site_max<true> {
-    template <typename Arg>
-    struct DimsPadX {
+    template <typename Arg> struct CacheDims {
       static constexpr int Ms = spins_per_thread(Arg::nSpin);
       static constexpr int Mc = colors_per_thread(Arg::nColor);
       static constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc);
       static constexpr dim3 dims(dim3 block) {
+        // pad the shared block size to avoid bank conflicts for native ordering
         if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size();
         block.y = color_spin_threads; // state the y block since we know it at compile time
         return block;
@@ -187,16 +187,8 @@ namespace quda {
     template <typename Arg> __device__ inline auto operator()(typename Arg::real thread_max, Arg &)
     {
       using real = typename Arg::real;
-      //constexpr int Ms = spins_per_thread(Arg::nSpin);
-      //constexpr int Mc = colors_per_thread(Arg::nColor);
-      //constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc);
-      constexpr int color_spin_threads = DimsPadX<Arg>::color_spin_threads;
-      //auto block = target::block_dim();
-      // pad the shared block size to avoid bank conflicts for native ordering
-      //if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size();
-      //block.y = color_spin_threads; // state the y block since we know it at compile time
-      //SharedMemoryCache<real> cache(block);
-      SharedMemoryCache<real, DimsPadX<Arg>> cache;
+      constexpr int color_spin_threads = CacheDims<Arg>::color_spin_threads;
+      SharedMemoryCache<real, CacheDims<Arg>> cache;
       cache.save(thread_max);
       cache.sync();
       real this_site_max = static_cast<real>(0);

diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh
index 56bd00f425..4577e66fcd 100644
--- a/include/kernels/gauge_stout.cuh
+++ b/include/kernels/gauge_stout.cuh
@@ -135,8 +135,8 @@ namespace quda
     }
 
     Link U, Q;
-    ThreadLocalCache<Link> Stap{};
-    ThreadLocalCache<Link, 0, decltype(Stap)> Rect{}; // offset by Stap type to ensure non-overlapping allocations
+    ThreadLocalCache<Link> Stap;
+    ThreadLocalCache<Link, 0, decltype(Stap)> Rect; // offset by Stap type to ensure non-overlapping allocations
 
     // This function gets stap = S_{mu,nu} i.e., the staple of length 3,
     // and the 1x2 and 2x1 rectangles of length 5. From the following paper:

diff --git a/include/kernels/gauge_utils.cuh b/include/kernels/gauge_utils.cuh
index 6e91e1ac31..48c7e6c1cc 100644
--- a/include/kernels/gauge_utils.cuh
+++ b/include/kernels/gauge_utils.cuh
@@ -2,7 +2,6 @@
 #include <gauge_field_order.h>
 #include <quda_matrix.h>
 #include <index_helper.cuh>
-#include <thread_local_cache.h>
 
 namespace quda
 {

diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh
index 050295f271..ae28956112 100644
--- a/include/kernels/gauge_wilson_flow.cuh
+++ b/include/kernels/gauge_wilson_flow.cuh
@@ -72,8 +72,8 @@ namespace quda
       // This function gets stap = S_{mu,nu} i.e., the staple of length 3,
       // and the 1x2 and 2x1 rectangles of length 5. From the following paper:
       // https://arxiv.org/abs/0801.1165
-      ThreadLocalCache<Link> Stap{};
-      ThreadLocalCache<Link, 0, decltype(Stap)> Rect{}; // offset by Stap type to ensure non-overlapping allocations
+      ThreadLocalCache<Link> Stap;
+      ThreadLocalCache<Link, 0, decltype(Stap)> Rect; // offset by Stap type to ensure non-overlapping allocations
       computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim);
       Z = arg.coeff1x1 * static_cast<Link>(Stap) + arg.coeff2x1 * static_cast<Link>(Rect);
       break;

diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh
index 84173a9a5a..35ddde688d 100644
--- a/include/kernels/hisq_paths_force.cuh
+++ b/include/kernels/hisq_paths_force.cuh
@@ -538,7 +538,6 @@ namespace quda {
      * The "extra" low point corresponds to the Lepage contribution to the
      * force_mu term.
      *
-     *
      *            sig
      *      F           E
      *      |           |
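The Stap/Rect declarations above rely on allocation chaining: passing decltype(Stap) as the offset type O places Rect after Stap in the dynamic shared-memory block. A sketch of the idea, assuming Link is the matrix type these kernels already operate on:

    __device__ void staple_pair()
    {
      ThreadLocalCache<Link> Stap;                    // begins at byte offset 0
      ThreadLocalCache<Link, 0, decltype(Stap)> Rect; // begins at decltype(Stap)::shared_mem_size(block)
      // Both caches are live at once in non-overlapping regions, with no
      // manual offset arithmetic in the kernel body.
    }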
diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h
index b7dacbb9cf..2da596f6ed 100644
--- a/include/targets/cuda/shared_memory_helper.h
+++ b/include/targets/cuda/shared_memory_helper.h
@@ -14,9 +14,9 @@ namespace quda
   /**
      @brief Class which is used to allocate and access shared memory.
      The shared memory is treated as an array of type T, with the
-     number of elements given by the call to the static member
-     S::size(target::block_dim()).  The offset from the beginning of
-     the total shared memory block is given by the static member
+     number of elements given by a call to the static member
+     S::size(target::block_dim()).  The byte offset from the beginning
+     of the total shared memory block is given by the static member
      O::shared_mem_size(target::block_dim()), or 0 if O is void.
   */
   template <typename T, typename S, typename O = void> class SharedMemory
@@ -26,7 +26,6 @@ namespace quda
 
   private:
     T *data;
-    const unsigned int size; // number of elements of type T
 
     /**
        @brief This is a dummy instantiation for the host compiler
@@ -57,6 +56,9 @@ namespace quda
     }
 
   public:
+    /**
+       @brief Byte offset for this shared memory object.
+    */
     static constexpr unsigned int get_offset(dim3 block)
     {
       unsigned int o = 0;
@@ -64,6 +66,9 @@ namespace quda
       return o;
     }
 
+    /**
+       @brief Shared memory size in bytes.
+    */
     static constexpr unsigned int shared_mem_size(dim3 block)
     {
       return get_offset(block) + S::size(block)*sizeof(T);
@@ -72,10 +77,12 @@ namespace quda
     /**
        @brief Constructor for SharedMemory object.
     */
-    constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))),
-      size(S::size(target::block_dim())) {}
+    constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {}
 
-    constexpr auto smem() const { return *this; }
+    /**
+       @brief Return this SharedMemory object.
+    */
+    constexpr auto sharedMem() const { return *this; }
 
     /**
        @brief Subscripting operator returning a reference to element.

diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h
index f5e965576d..c88a178e2c 100644
--- a/include/targets/cuda/thread_array.h
+++ b/include/targets/cuda/thread_array.h
@@ -7,6 +7,7 @@
 #else
 
 #include <array.h>
+
 namespace quda
 {
   template <typename T, int n> struct thread_array : array<T, n> {};

diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h
index a08cefa84b..f8faf41b44 100644
--- a/include/targets/generic/helpers.h
+++ b/include/targets/generic/helpers.h
@@ -3,21 +3,33 @@
 namespace quda
 {
 
+  /**
+     @brief Element type used for coalesced storage.
+  */
   template <typename T>
   using atom_t = std::conditional_t<sizeof(T) % 16 == 0, int4, std::conditional_t<sizeof(T) % 8 == 0, int2, int>>;
 
+  /**
+     @brief Used to declare an object of fixed size.
+  */
   template <unsigned int N> struct SizeStatic {
     static constexpr unsigned int size(dim3) { return N; }
   };
 
+  /**
+     @brief Used to declare an object of fixed size per thread, N.
+  */
   template <unsigned int N> struct SizePerThread {
     static constexpr unsigned int size(dim3 block) { return N * block.x * block.y * block.z; }
   };
 
+  /**
+     @brief Used to declare an object of fixed size per thread, N, with thread dimensions determined by D.
+  */
   template <typename D, unsigned int N = 1> struct SizeDims {
     static constexpr unsigned int size(dim3 block)
     {
       dim3 dims = D::dims(block);
       return dims.x * dims.y * dims.z * N;
     }
   };
 
+  /**
+     @brief Used to declare an object with dimensions given by the block size.
+  */
   struct DimsBlock {
     static constexpr dim3 dims(dim3 block) { return block; }
   };
 
+  /**
+     @brief Used to declare an object with fixed dimensions.
+  */
   template <unsigned int x, unsigned int y, unsigned int z> struct DimsStatic {
     static constexpr dim3 dims(dim3)
     {
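To make the sizing conventions concrete, a worked example with a hypothetical 64x2x1 block; the values follow directly from the size() definitions above:

    constexpr dim3 block{64, 2, 1};
    static_assert(SizeStatic<128>::size(block) == 128);             // fixed total, independent of block
    static_assert(SizePerThread<2>::size(block) == 2 * 64 * 2 * 1); // two elements per thread
    static_assert(SizeDims<DimsStatic<32, 4, 1>, 2>::size(block) == 32 * 4 * 1 * 2); // dims from D, N per site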
diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h
index 149a7016f3..1866375f76 100644
--- a/include/targets/generic/shared_memory_cache_helper.h
+++ b/include/targets/generic/shared_memory_cache_helper.h
@@ -11,46 +11,38 @@
    sharing data between threads in a thread block.
  */
 
-/**
-   @file shared_memory_cache_helper.h
-   @brief Convenience overloads to allow SharedMemoryCache objects to
-   appear in simple expressions.  The actual implementation of
-   SharedMemoryCache is target specific, and located in e.g.,
-   include/targets/cuda/shared_memory_cache_helper.h, etc.
- */
-
 namespace quda
 {
 
   /**
      @brief Class which wraps around a shared memory cache for type T,
      where each thread in the thread block stores a unique value in
-     the cache which any other thread can access.
+     the cache which any other thread can access.  The data is stored
+     in a coalesced order with element size atom_t<T>.
 
-     This accessor supports both explicit run-time block size and
-     compile-time sizing.
+     The dimensions of the cache are determined by a call to
+     D::dims(target::block_dim()), and D defaults to having dimensions
+     equal to the block dimensions.
 
-     * For run-time block size, the constructor should be initialied
-       with the desired block size.
-
-     * For compile-time block size, no arguments should be passed to
-       the constructor, and then the second and third template
-       parameters correspond to the y and z dimensions of the block,
-       respectively.  The x dimension of the block will be set
-       according the maximum number of threads possible, given these
-       dimensions.
+     A byte offset into the shared memory region can be specified with
+     the type O, and is given by
+     O::shared_mem_size(target::block_dim()) if O is not void.
   */
   template <typename T, typename D = DimsBlock, typename O = void>
   class SharedMemoryCache : SharedMemory<atom_t<T>, SizeDims<D, sizeof(T) / sizeof(atom_t<T>)>, O>
   {
+    using Smem = SharedMemory<atom_t<T>, SizeDims<D, sizeof(T) / sizeof(atom_t<T>)>, O>;
+
   public:
     using value_type = T;
     using dims_type = D;
-    using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one
-    using Smem = SharedMemory<atom_t<T>, SizeDims<D, sizeof(T) / sizeof(atom_t<T>)>, O>;
+    using offset_type = O;
     using Smem::shared_mem_size;
 
   private:
+    const dim3 block;
+    const int stride;
+    using Smem::sharedMem;
     using atom_t = atom_t<T>;
     static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types");
 
     // The number of elements of type atom_t that we break T into for optimal shared-memory access
     static constexpr int n_element = sizeof(T) / sizeof(atom_t);
 
@@ -60,21 +52,13 @@
     // used to avoid instantiation of load functions if unused, in case T is not a valid return type (e.g. C array)
     template <typename dummy = void> using maybeT = std::conditional_t<std::is_same_v<dummy,void>,T,void>;
 
-    const dim3 block;
-    const int stride;
-
-    //constexpr Smem smem() const { return *dynamic_cast<const Smem *>(this); }
-    using Smem::smem;
-    //constexpr Smem smem() const { return Smem::smem(); }
-    //constexpr Smem smem() const { return *Smem::smemp(); }
-
     __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const
     {
       atom_t tmp[n_element];
       memcpy(tmp, (void *)&a, sizeof(T));
       int j = (z * block.y + y) * block.x + x;
 #pragma unroll
-      for (int i = 0; i < n_element; i++) smem()[i * stride + j] = tmp[i];
+      for (int i = 0; i < n_element; i++) sharedMem()[i * stride + j] = tmp[i];
     }
 
     template <typename dummy = void>
@@ -83,7 +67,7 @@
       atom_t tmp[n_element];
       int j = (z * block.y + y) * block.x + x;
 #pragma unroll
-      for (int i = 0; i < n_element; i++) tmp[i] = smem()[i * stride + j];
+      for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[i * stride + j];
       T a;
       memcpy((void *)&a, tmp, sizeof(T));
       return a;
     }
@@ -105,28 +89,20 @@
 
   public:
     /**
-       @brief constructor for SharedMemory cache.  If no arguments are
-       pass, then the dimensions are set according to the templates
-       block_size_y and block_size_z, together with the derived
-       block_size_x.  Otherwise use the block sizes passed into the
-       constructor.
-
-       @param[in] block Block dimensions for the 3-d shared memory object
-       @param[in] thread_offset "Perceived" offset from dynamic shared
-       memory base pointer (used when we have multiple caches in
-       scope).  Need to include block size to actual offset.
+       @brief Constructor for SharedMemoryCache.
     */
     constexpr SharedMemoryCache() : block(D::dims(target::block_dim())), stride(block.x * block.y * block.z)
     {
-      static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizeDims<D>::size(dim3{8,8,8})*sizeof(T));
+      // sanity check
+      static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizeDims<D>::size(dim3{32,16,8})*sizeof(T));
     }
 
     /**
        @brief Grab the raw base address to shared memory.
     */
     __device__ __host__ inline auto data() const
     {
-      return reinterpret_cast<T *>(&smem()[0]);
+      return reinterpret_cast<T *>(&sharedMem()[0]);
     }
 
     /**
@@ -302,11 +278,10 @@
 
   /**
      @brief Uniform helper for exposing type T, whether we are dealing
-     with an instance of T or SharedMemoryCache
+     with an instance of T or SharedMemoryCache<T>
  */
   template <typename T>
-  struct get_type<
-    T, std::enable_if_t<std::is_same_v<T, SharedMemoryCache<typename T::value_type, typename T::dims_type, typename T::offset_type>>>> {
+  struct get_type<T, std::enable_if_t<std::is_same_v<T, SharedMemoryCache<typename T::value_type, typename T::dims_type, typename T::offset_type>>>> {
     using type = typename T::value_type;
   };
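A device-side usage sketch of the refactored cache (the neighbor exchange is illustrative; save/load/sync are the members defined above, with load assumed addressable by block coordinates):

    template <typename real>
    __device__ real exchange_x(real v)
    {
      SharedMemoryCache<real> cache; // D defaults to DimsBlock, O defaults to void
      cache.save(v);                 // each thread publishes its value
      cache.sync();                  // block-wide barrier
      int x = (target::thread_idx().x + 1) % target::block_dim().x;
      return cache.load(x, target::thread_idx().y, target::thread_idx().z);
    }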
diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h
index 583468c036..5325e5ab8f 100644
--- a/include/targets/generic/thread_array.h
+++ b/include/targets/generic/thread_array.h
@@ -8,29 +8,28 @@ namespace quda
 {
 
   /**
-     @brief Class that provides indexable per-thread storage.  On CUDA
-     this maps to using assigning each thread a unique window of
-     shared memory using the SharedMemoryCache object.
+     @brief Class that provides indexable per-thread storage for n
+     elements of type T.  This version uses shared memory for storage.
+     The offset into the shared memory region is determined from the
+     type O.
   */
   template <typename T, int n, typename O = void> class thread_array : SharedMemory<array<T, n>, SizePerThread<1>, O>
   {
     using Smem = SharedMemory<array<T, n>, SizePerThread<1>, O>;
+    using Smem::sharedMem;
 
     array<T, n> &array_;
 
-    //constexpr Smem smem() const { return *dynamic_cast<const Smem *>(this); }
-    using Smem::smem;
-
   public:
     __device__ __host__ constexpr thread_array() :
-      array_(smem()[target::thread_idx_linear<3>()])
+      array_(sharedMem()[target::thread_idx_linear<3>()])
     {
       array_ = array<T, n>(); // call default constructor
     }
 
     template <typename... Ts>
     __device__ __host__ constexpr thread_array(T first, const Ts... other) :
-      array_(smem()[target::thread_idx_linear<3>()])
+      array_(sharedMem()[target::thread_idx_linear<3>()])
     {
       array_ = array<T, n> {first, other...};
     }
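A brief usage sketch (the element count of 4 is arbitrary):

    __device__ int scratch_sum()
    {
      thread_array<int, 4> a; // this thread's array<int, 4> slot in shared memory
      for (int i = 0; i < 4; i++) a[i] = i * i;
      return a[0] + a[3]; // private per-thread data, so no synchronization is needed
    }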
diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h
index 545a365e25..277b16903c 100644
--- a/include/targets/generic/thread_local_cache.h
+++ b/include/targets/generic/thread_local_cache.h
@@ -15,39 +15,38 @@ namespace quda
 {
 
   /**
-     @brief Class for threads to store a unique value, or array of values, which can use
-     shared memory for optimization purposes.
+     @brief Class for threads to store a unique value (for N_ == 0),
+     or array of values (for N_ > 0), which can use shared memory for
+     optimization purposes.
  */
   template <typename T, int N_ = 0, typename O = void>
   class ThreadLocalCache : SharedMemory<atom_t<T>, SizePerThread<std::max(1, N_) * (sizeof(T) / sizeof(atom_t<T>))>, O>
   {
+    using Smem = SharedMemory<atom_t<T>, SizePerThread<std::max(1, N_) * (sizeof(T) / sizeof(atom_t<T>))>, O>;
+
   public:
     using value_type = T;
     static constexpr int N = N_; // size of array, 0 means to behave like T instead of array
     using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one
     static constexpr int len = std::max(1,N); // actual number of elements to store
-    using Smem = SharedMemory<atom_t<T>, SizePerThread<std::max(1, N_) * (sizeof(T) / sizeof(atom_t<T>))>, O>;
     using Smem::shared_mem_size;
 
   private:
+    const int stride;
+    using Smem::sharedMem;
     using atom_t = atom_t<T>;
     static_assert(sizeof(T) % 4 == 0, "Thread local cache does not support sub-word size types");
 
     // The number of elements of type atom_t that we break T into for optimal shared-memory access
     static constexpr int n_element = sizeof(T) / sizeof(atom_t);
 
-    const int stride;
-
-    //constexpr Smem smem() const { return *dynamic_cast<const Smem *>(this); }
-    using Smem::smem;
-
     __device__ __host__ inline void save_detail(const T &a, const int k) const
     {
       atom_t tmp[n_element];
       memcpy(tmp, (void *)&a, sizeof(T));
       int j = target::thread_idx_linear<3>();
 #pragma unroll
-      for (int i = 0; i < n_element; i++) smem()[(k*n_element + i) * stride + j] = tmp[i];
+      for (int i = 0; i < n_element; i++) sharedMem()[(k*n_element + i) * stride + j] = tmp[i];
     }
 
     __device__ __host__ inline T load_detail(const int k) const
@@ -55,7 +54,7 @@ namespace quda
       atom_t tmp[n_element];
       int j = target::thread_idx_linear<3>();
 #pragma unroll
-      for (int i = 0; i < n_element; i++) tmp[i] = smem()[(k*n_element + i) * stride + j];
+      for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[(k*n_element + i) * stride + j];
       T a;
       memcpy((void *)&a, tmp, sizeof(T));
       return a;
@@ -66,7 +65,8 @@ namespace quda
       @brief Constructor for ThreadLocalCache.
     */
     constexpr ThreadLocalCache() : stride(target::block_size<3>())
     {
-      static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizePerThread<len>::size(dim3{8,8,8})*sizeof(T));
+      // sanity check
+      static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizePerThread<len>::size(dim3{32,16,8})*sizeof(T));
     }
 
@@ -83,7 +83,10 @@ namespace quda
       @param[in] a The value to store in the thread local cache
       @param[in] k The index to use
     */
-    __device__ __host__ inline void save(const T &a, const int k) const { save_detail(a, k); }
+    __device__ __host__ inline void save(const T &a, const int k) const {
+      static_assert(N > 0);
+      save_detail(a, k);
+    }
 
     /**
       @brief Load a value from the thread local cache.  Used when N==0 so cache acts like single object.
@@ -99,7 +102,10 @@ namespace quda
       @param[in] k The index to use
       @return The value stored in the thread local cache at that index
     */
-    __device__ __host__ inline T load(const int k) const { return load_detail(k); }
+    __device__ __host__ inline T load(const int k) const {
+      static_assert(N > 0);
+      return load_detail(k);
+    }
 
     /**
       @brief Cast operator to allow cache objects to be used where T is expected (when N==0).
     */
@@ -123,7 +129,10 @@ namespace quda
       @param[in] i The index to use
       @return The value stored in the thread local cache at that index
     */
-    __device__ __host__ T operator[](int i) { return load(i); }
+    __device__ __host__ T operator[](int i) {
+      static_assert(N > 0);
+      return load(i);
+    }
   };
 
   template <typename T, int N, typename O>
   __device__ __host__ inline T operator+(const ThreadLocalCache<T, N, O> &a, const T &b)
@@ -165,7 +174,7 @@ namespace quda
 
   /**
      @brief Uniform helper for exposing type T, whether we are dealing
-     with an instance of T or ThreadLocalCache
+     with an instance of T or ThreadLocalCache<T, N, O>
  */
   template <typename T> struct get_type<T, std::enable_if_t<std::is_same_v<T, ThreadLocalCache<typename T::value_type, T::N, typename T::offset_type>>>> {
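A usage sketch for the N > 0 case that the new static_asserts guard (Link-valued entries as in the staple kernels above; the product is illustrative):

    __device__ Link two_links(const Link &u, const Link &v)
    {
      ThreadLocalCache<Link, 2> c; // two entries per thread, atom-interleaved at stride
      c.save(u, 0);                // indexed save/load require N > 0
      c.save(v, 1);
      return c[0] * c[1];          // operator[] loads each entry by value
    }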
@@ -123,7 +129,10 @@ namespace quda @param[in] i The index to use @return The value stored in the thread local cache at that index */ - __device__ __host__ T operator[](int i) { return load(i); } + __device__ __host__ T operator[](int i) { + static_assert(N > 0); + return load(i); + } }; template __device__ __host__ inline T operator+(const ThreadLocalCache &a, const T &b) @@ -165,7 +174,7 @@ namespace quda /** @brief Uniform helper for exposing type T, whether we are dealing - with an instance of T or ThreadLocalCache + with an instance of T or ThreadLocalCache */ template struct get_type>>> { diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h index b7dacbb9cf..2da596f6ed 100644 --- a/include/targets/hip/shared_memory_helper.h +++ b/include/targets/hip/shared_memory_helper.h @@ -14,9 +14,9 @@ namespace quda /** @brief Class which is used to allocate and access shared memory. The shared memory is treated as an array of type T, with the - number of elements given by the call to the static member - S::size(target::block_dim()). The offset from the beginning of - the total shared memory block is given by the static member + number of elements given by a call to the static member + S::size(target::block_dim()). The byte offset from the beginning + of the total shared memory block is given by the static member O::shared_mem_size(target::block_dim()), or 0 if O is void. */ template class SharedMemory @@ -26,7 +26,6 @@ namespace quda private: T *data; - const unsigned int size; // number of elements of type T /** @brief This is a dummy instantiation for the host compiler @@ -57,6 +56,9 @@ namespace quda } public: + /** + @brief Byte offset for this shared memory object. + */ static constexpr unsigned int get_offset(dim3 block) { unsigned int o = 0; @@ -64,6 +66,9 @@ namespace quda return o; } + /** + @brief Shared memory size in bytes. + */ static constexpr unsigned int shared_mem_size(dim3 block) { return get_offset(block) + S::size(block)*sizeof(T); @@ -72,10 +77,12 @@ namespace quda /** @brief Constructor for SharedMemory object. */ - constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))), - size(S::size(target::block_dim())) {} + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {} - constexpr auto smem() const { return *this; } + /** + @brief Return this SharedMemory object. + */ + constexpr auto sharedMem() const { return *this; } /** @brief Subscripting operator returning a reference to element.