code cleanup

lattice · Sep 1, 2023 · b576582 · b576582
1 parent bab726e
commit b576582
Show file tree

Hide file tree

Showing 14 changed files with 110 additions and 109 deletions.
diff --git a/include/kernels/block_transpose.cuh b/include/kernels/block_transpose.cuh
@@ -47,7 +47,7 @@ namespace quda
     constexpr BlockTransposeKernel(const Arg &arg) : arg(arg) { }
     static constexpr const char *filename() { return KERNEL_FILE; }
 
-    struct Dims {
+    struct CacheDims {
       static constexpr dim3 dims(dim3 block) {
 	block.x += 1;
 	block.z = 1;
@@ -68,8 +68,7 @@ namespace quda
       int parity = parity_color / Arg::nColor;
       using color_spinor_t = ColorSpinor<typename Arg::real, 1, Arg::nSpin>;
 
-      //SharedMemoryCache<color_spinor_t> cache({target::block_dim().x + 1, target::block_dim().y, 1});
-      SharedMemoryCache<color_spinor_t, Dims> cache;
+      SharedMemoryCache<color_spinor_t, CacheDims> cache;
 
       int x_offset = target::block_dim().x * target::block_idx().x;
       int v_offset = target::block_dim().y * target::block_idx().y;

diff --git a/include/kernels/coarse_op_kernel.cuh b/include/kernels/coarse_op_kernel.cuh
@@ -1398,8 +1398,6 @@ namespace quda {
       using real = typename Arg::Float;
       using TileType = typename Arg::vuvTileType;
       const int dim_index = arg.dim_index % arg.Y_atomic.geometry;
-      //__shared__ complex<storeType> X[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin];
-      //__shared__ complex<storeType> Y[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin];
       Cache<Arg> cache;
       auto &X = cache.data()[0];
       auto &Y = cache.data()[1];
@@ -1424,7 +1422,6 @@ namespace quda {
         }
       }
 
-      //__syncthreads();
       cache.sync();
 
 #pragma unroll
@@ -1454,7 +1451,6 @@ namespace quda {
         }
       }
 
-      //__syncthreads();
       cache.sync();
 
       if (tx < Arg::coarseSpin*Arg::coarseSpin && (parity == 0 || arg.parity_flip == 1) ) {

diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh
@@ -172,12 +172,12 @@ namespace quda {
   };
 
   template <> struct site_max<true> {
-    template <typename Arg>
-    struct DimsPadX {
+    template <typename Arg> struct CacheDims {
       static constexpr int Ms = spins_per_thread<true>(Arg::nSpin);
       static constexpr int Mc = colors_per_thread<true>(Arg::nColor);
       static constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc);
       static constexpr dim3 dims(dim3 block) {
+	// pad the shared block size to avoid bank conflicts for native ordering
 	if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size();
 	block.y = color_spin_threads; // state the y block since we know it at compile time
 	return block;
@@ -187,16 +187,8 @@ namespace quda {
     template <typename Arg> __device__ inline auto operator()(typename Arg::real thread_max, Arg &)
     {
       using real = typename Arg::real;
-      //constexpr int Ms = spins_per_thread<true>(Arg::nSpin);
-      //constexpr int Mc = colors_per_thread<true>(Arg::nColor);
-      //constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc);
-      constexpr int color_spin_threads = DimsPadX<Arg>::color_spin_threads;
-      //auto block = target::block_dim();
-      // pad the shared block size to avoid bank conflicts for native ordering
-      //if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size();
-      //block.y = color_spin_threads; // state the y block since we know it at compile time
-      //SharedMemoryCache<real> cache(block);
-      SharedMemoryCache<real, DimsPadX<Arg>> cache;
+      constexpr int color_spin_threads = CacheDims<Arg>::color_spin_threads;
+      SharedMemoryCache<real, CacheDims<Arg>> cache;
       cache.save(thread_max);
       cache.sync();
       real this_site_max = static_cast<real>(0);

diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh
@@ -135,8 +135,8 @@ namespace quda
       }
 
       Link U, Q;
-      ThreadLocalCache<Link> Stap{};
-      ThreadLocalCache<Link,0,decltype(Stap)> Rect{}; // offset by Stap type to ensure non-overlapping allocations
+      ThreadLocalCache<Link> Stap;
+      ThreadLocalCache<Link,0,decltype(Stap)> Rect; // offset by Stap type to ensure non-overlapping allocations
 
       // This function gets stap = S_{mu,nu} i.e., the staple of length 3,
       // and the 1x2 and 2x1 rectangles of length 5. From the following paper:

diff --git a/include/kernels/gauge_utils.cuh b/include/kernels/gauge_utils.cuh
@@ -2,7 +2,6 @@
 #include <index_helper.cuh>
 #include <quda_matrix.h>
 #include <thread_array.h>
-#include <thread_local_cache.h>
 
 namespace quda
 {

diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh
@@ -72,8 +72,8 @@ namespace quda
       // This function gets stap = S_{mu,nu} i.e., the staple of length 3,
       // and the 1x2 and 2x1 rectangles of length 5. From the following paper:
       // https://arxiv.org/abs/0801.1165
-      ThreadLocalCache<Link> Stap{};
-      ThreadLocalCache<Link,0,decltype(Stap)> Rect{}; // offset by Stap type to ensure non-overlapping allocations
+      ThreadLocalCache<Link> Stap;
+      ThreadLocalCache<Link,0,decltype(Stap)> Rect; // offset by Stap type to ensure non-overlapping allocations
       computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim);
       Z = arg.coeff1x1 * static_cast<const Link &>(Stap) + arg.coeff2x1 * static_cast<const Link &>(Rect);
       break;

diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh
@@ -538,7 +538,6 @@ namespace quda {
          * The "extra" low point corresponds to the Lepage contribution to the
          * force_mu term.
          *
-         *
          *            sig
          *         F        E
          *          |      |

diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h
@@ -14,9 +14,9 @@ namespace quda
   /**
      @brief Class which is used to allocate and access shared memory.
      The shared memory is treated as an array of type T, with the
-     number of elements given by the call to the static member
-     S::size(target::block_dim()).  The offset from the beginning of
-     the total shared memory block is given by the static member
+     number of elements given by a call to the static member
+     S::size(target::block_dim()).  The byte offset from the beginning
+     of the total shared memory block is given by the static member
      O::shared_mem_size(target::block_dim()), or 0 if O is void.
    */
   template <typename T, typename S, typename O = void> class SharedMemory
@@ -26,7 +26,6 @@ namespace quda
 
   private:
     T *data;
-    const unsigned int size;  // number of elements of type T
 
     /**
        @brief This is a dummy instantiation for the host compiler
@@ -57,13 +56,19 @@ namespace quda
     }
 
   public:
+    /**
+       @brief Byte offset for this shared memory object.
+    */
     static constexpr unsigned int get_offset(dim3 block)
     {
       unsigned int o = 0;
       if constexpr (!std::is_same_v<O, void>) { o = O::shared_mem_size(block); }
       return o;
     }
 
+    /**
+       @brief Shared memory size in bytes.
+    */
     static constexpr unsigned int shared_mem_size(dim3 block)
     {
       return get_offset(block) + S::size(block)*sizeof(T);
@@ -72,10 +77,12 @@ namespace quda
     /**
        @brief Constructor for SharedMemory object.
     */
-    constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))),
-			       size(S::size(target::block_dim())) {}
+    constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {}
 
-    constexpr auto smem() const { return *this; }
+    /**
+       @brief Return this SharedMemory object.
+    */
+    constexpr auto sharedMem() const { return *this; }
 
     /**
        @brief Subscripting operator returning a reference to element.

diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h
@@ -7,6 +7,7 @@
 #else
 
 #include <array.h>
+
 namespace quda
 {
   template <typename T, int n> struct thread_array : array<T, n> {};

diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h
@@ -3,34 +3,52 @@
 namespace quda
 {
 
+  /**
+     @brief Element type used for coalesced storage.
+   */
   template <typename T>
   using atom_t = std::conditional_t<sizeof(T) % 16 == 0, int4, std::conditional_t<sizeof(T) % 8 == 0, int2, int>>;
 
+  /**
+     @brief Used to declare an object of fixed size.
+   */
   template <int N> struct SizeStatic {
     static constexpr unsigned int size(dim3) {
       return N;
     }
   };
 
+  /**
+     @brief Used to declare an object of fixed size per thread, N.
+   */
   template <int N> struct SizePerThread {
     static constexpr unsigned int size(dim3 block) {
       return N * block.x * block.y * block.z;
     }
   };
 
+  /**
+     @brief Used to declare an object of fixed size per thread, N, with thread dimensions derermined by D.
+   */
   template <typename D, int N = 1> struct SizeDims {
     static constexpr unsigned int size(dim3 block) {
       dim3 dims = D::dims(block);
       return dims.x * dims.y * dims.z * N;
     }
   };
 
+  /**
+     @brief Used to declare an object with dimensions given by the block size.
+   */
   struct DimsBlock {
     static constexpr dim3 dims(dim3 block) {
       return block;
     }
   };
 
+  /**
+     @brief Used to declare an object with fixed dimensions.
+   */
   template <int x, int y, int z>
   struct DimsStatic {
     static constexpr dim3 dims(dim3) {

diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h
@@ -11,46 +11,38 @@
    sharing data between threads in a thread block.
  */
 
-/**
-   @file shared_memory_cache_helper.h
-   @brief Convenience overloads to allow SharedMemoryCache objects to
-   appear in simple expressions.  The actual implementation of
-   SharedMemoryCache is target specific, and located in e.g.,
-   include/targets/cuda/shared_memory_cache_helper.h, etc.
- */
-
 namespace quda
 {
 
   /**
      @brief Class which wraps around a shared memory cache for type T,
      where each thread in the thread block stores a unique value in
-     the cache which any other thread can access.
+     the cache which any other thread can access.  The data is stored
+     in a coalesced order with element size atom_t<T>.
 
-     This accessor supports both explicit run-time block size and
-     compile-time sizing.
+     The dimensions of the cache is determined by a call to
+     D::dims(target::block_dim()), and D defaults to having dimensions
+     equal to the block dimensions.
 
-     * For run-time block size, the constructor should be initialied
-       with the desired block size.
-
-     * For compile-time block size, no arguments should be passed to
-       the constructor, and then the second and third template
-       parameters correspond to the y and z dimensions of the block,
-       respectively.  The x dimension of the block will be set
-       according the maximum number of threads possible, given these
-       dimensions.
+     A byte offset into the shared memory region can be specified with
+     the type O, and is given by
+     O::shared_mem_size(target::block_dim()) if O is not void.
    */
   template <typename T, typename D = DimsBlock, typename O = void>
   class SharedMemoryCache : SharedMemory<atom_t<T>, SizeDims<D,sizeof(T)/sizeof(atom_t<T>)>, O>
   {
+    using Smem = SharedMemory<atom_t<T>, SizeDims<D,sizeof(T)/sizeof(atom_t<T>)>, O>;
+
   public:
     using value_type = T;
     using dims_type = D;
-    using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one
-    using Smem = SharedMemory<atom_t<T>, SizeDims<D,sizeof(T)/sizeof(atom_t<T>)>, O>;
+    using offset_type = O;
     using Smem::shared_mem_size;
 
   private:
+    const dim3 block;
+    const int stride;
+    using Smem::sharedMem;
     using atom_t = atom_t<T>;
     static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types");
 
@@ -60,21 +52,13 @@ namespace quda
     // used to avoid instantiation of load functions if unused, in case T is not a valid return type (e.g. C array)
     template <typename dummy = void> using maybeT = std::conditional_t<std::is_same_v<dummy,void>,T,void>;
 
-    const dim3 block;
-    const int stride;
-
-    //constexpr Smem smem() const { return *dynamic_cast<const Smem*>(this); }
-    using Smem::smem;
-    //constexpr Smem smem() const { return Smem::smem(); }
-    //constexpr Smem smem() const { return *Smem::smemp(); }
-
     __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const
     {
       atom_t tmp[n_element];
       memcpy(tmp, (void *)&a, sizeof(T));
       int j = (z * block.y + y) * block.x + x;
 #pragma unroll
-      for (int i = 0; i < n_element; i++) smem()[i * stride + j] = tmp[i];
+      for (int i = 0; i < n_element; i++) sharedMem()[i * stride + j] = tmp[i];
     }
 
     template <typename dummy = void>
@@ -83,7 +67,7 @@ namespace quda
       atom_t tmp[n_element];
       int j = (z * block.y + y) * block.x + x;
 #pragma unroll
-      for (int i = 0; i < n_element; i++) tmp[i] = smem()[i * stride + j];
+      for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[i * stride + j];
       T a;
       memcpy((void *)&a, tmp, sizeof(T));
       return a;
@@ -105,28 +89,20 @@ namespace quda
 
   public:
     /**
-       @brief constructor for SharedMemory cache.  If no arguments are
-       pass, then the dimensions are set according to the templates
-       block_size_y and block_size_z, together with the derived
-       block_size_x.  Otherwise use the block sizes passed into the
-       constructor.
-
-       @param[in] block Block dimensions for the 3-d shared memory object
-       @param[in] thread_offset "Perceived" offset from dynamic shared
-       memory base pointer (used when we have multiple caches in
-       scope).  Need to include block size to actual offset.
+       @brief Constructor for SharedMemoryCache.
     */
     constexpr SharedMemoryCache() :
       block(D::dims(target::block_dim())), stride(block.x * block.y * block.z)
     {
-      static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizeDims<D>::size(dim3{8,8,8})*sizeof(T));
+      // sanity check
+      static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizeDims<D>::size(dim3{32,16,8})*sizeof(T));
     }
 
     /**
        @brief Grab the raw base address to shared memory.
     */
     __device__ __host__ inline auto data() const {
-      return reinterpret_cast<T *>(&smem()[0]);
+      return reinterpret_cast<T *>(&sharedMem()[0]);
     }
 
     /**
@@ -302,11 +278,10 @@ namespace quda
 
   /**
      @brief Uniform helper for exposing type T, whether we are dealing
-     with an instance of T or SharedMemoryCache<T>
+     with an instance of T or SharedMemoryCache<T,D,O>
    */
   template <class T>
-  struct get_type<
-    T, std::enable_if_t<std::is_same_v<T, SharedMemoryCache<typename T::value_type, typename T::dims_type, typename T::offset_type>>>> {
+  struct get_type<T, std::enable_if_t<std::is_same_v<T, SharedMemoryCache<typename T::value_type, typename T::dims_type, typename T::offset_type>>>> {
     using type = typename T::value_type;
   };