Skip to content

Commit

Permalink
code cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
jcosborn committed Sep 1, 2023
1 parent bab726e commit b576582
Show file tree
Hide file tree
Showing 14 changed files with 110 additions and 109 deletions.
5 changes: 2 additions & 3 deletions include/kernels/block_transpose.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ namespace quda
constexpr BlockTransposeKernel(const Arg &arg) : arg(arg) { }
static constexpr const char *filename() { return KERNEL_FILE; }

struct Dims {
struct CacheDims {
static constexpr dim3 dims(dim3 block) {
block.x += 1;
block.z = 1;
Expand All @@ -68,8 +68,7 @@ namespace quda
int parity = parity_color / Arg::nColor;
using color_spinor_t = ColorSpinor<typename Arg::real, 1, Arg::nSpin>;

//SharedMemoryCache<color_spinor_t> cache({target::block_dim().x + 1, target::block_dim().y, 1});
SharedMemoryCache<color_spinor_t, Dims> cache;
SharedMemoryCache<color_spinor_t, CacheDims> cache;

int x_offset = target::block_dim().x * target::block_idx().x;
int v_offset = target::block_dim().y * target::block_idx().y;
Expand Down
4 changes: 0 additions & 4 deletions include/kernels/coarse_op_kernel.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1398,8 +1398,6 @@ namespace quda {
using real = typename Arg::Float;
using TileType = typename Arg::vuvTileType;
const int dim_index = arg.dim_index % arg.Y_atomic.geometry;
//__shared__ complex<storeType> X[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin];
//__shared__ complex<storeType> Y[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin];
Cache<Arg> cache;
auto &X = cache.data()[0];
auto &Y = cache.data()[1];
Expand All @@ -1424,7 +1422,6 @@ namespace quda {
}
}

//__syncthreads();
cache.sync();

#pragma unroll
Expand Down Expand Up @@ -1454,7 +1451,6 @@ namespace quda {
}
}

//__syncthreads();
cache.sync();

if (tx < Arg::coarseSpin*Arg::coarseSpin && (parity == 0 || arg.parity_flip == 1) ) {
Expand Down
16 changes: 4 additions & 12 deletions include/kernels/color_spinor_pack.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -172,12 +172,12 @@ namespace quda {
};

template <> struct site_max<true> {
template <typename Arg>
struct DimsPadX {
template <typename Arg> struct CacheDims {
static constexpr int Ms = spins_per_thread<true>(Arg::nSpin);
static constexpr int Mc = colors_per_thread<true>(Arg::nColor);
static constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc);
static constexpr dim3 dims(dim3 block) {
// pad the shared block size to avoid bank conflicts for native ordering
if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size();
block.y = color_spin_threads; // state the y block since we know it at compile time
return block;
Expand All @@ -187,16 +187,8 @@ namespace quda {
template <typename Arg> __device__ inline auto operator()(typename Arg::real thread_max, Arg &)
{
using real = typename Arg::real;
//constexpr int Ms = spins_per_thread<true>(Arg::nSpin);
//constexpr int Mc = colors_per_thread<true>(Arg::nColor);
//constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc);
constexpr int color_spin_threads = DimsPadX<Arg>::color_spin_threads;
//auto block = target::block_dim();
// pad the shared block size to avoid bank conflicts for native ordering
//if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size();
//block.y = color_spin_threads; // state the y block since we know it at compile time
//SharedMemoryCache<real> cache(block);
SharedMemoryCache<real, DimsPadX<Arg>> cache;
constexpr int color_spin_threads = CacheDims<Arg>::color_spin_threads;
SharedMemoryCache<real, CacheDims<Arg>> cache;
cache.save(thread_max);
cache.sync();
real this_site_max = static_cast<real>(0);
Expand Down
4 changes: 2 additions & 2 deletions include/kernels/gauge_stout.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,8 @@ namespace quda
}

Link U, Q;
ThreadLocalCache<Link> Stap{};
ThreadLocalCache<Link,0,decltype(Stap)> Rect{}; // offset by Stap type to ensure non-overlapping allocations
ThreadLocalCache<Link> Stap;
ThreadLocalCache<Link,0,decltype(Stap)> Rect; // offset by Stap type to ensure non-overlapping allocations

// This function gets stap = S_{mu,nu} i.e., the staple of length 3,
// and the 1x2 and 2x1 rectangles of length 5. From the following paper:
Expand Down
1 change: 0 additions & 1 deletion include/kernels/gauge_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#include <index_helper.cuh>
#include <quda_matrix.h>
#include <thread_array.h>
#include <thread_local_cache.h>

namespace quda
{
Expand Down
4 changes: 2 additions & 2 deletions include/kernels/gauge_wilson_flow.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ namespace quda
// This function gets stap = S_{mu,nu} i.e., the staple of length 3,
// and the 1x2 and 2x1 rectangles of length 5. From the following paper:
// https://arxiv.org/abs/0801.1165
ThreadLocalCache<Link> Stap{};
ThreadLocalCache<Link,0,decltype(Stap)> Rect{}; // offset by Stap type to ensure non-overlapping allocations
ThreadLocalCache<Link> Stap;
ThreadLocalCache<Link,0,decltype(Stap)> Rect; // offset by Stap type to ensure non-overlapping allocations
computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim);
Z = arg.coeff1x1 * static_cast<const Link &>(Stap) + arg.coeff2x1 * static_cast<const Link &>(Rect);
break;
Expand Down
1 change: 0 additions & 1 deletion include/kernels/hisq_paths_force.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,6 @@ namespace quda {
* The "extra" low point corresponds to the Lepage contribution to the
* force_mu term.
*
*
* sig
* F E
* | |
Expand Down
21 changes: 14 additions & 7 deletions include/targets/cuda/shared_memory_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ namespace quda
/**
@brief Class which is used to allocate and access shared memory.
The shared memory is treated as an array of type T, with the
number of elements given by the call to the static member
S::size(target::block_dim()). The offset from the beginning of
the total shared memory block is given by the static member
number of elements given by a call to the static member
S::size(target::block_dim()). The byte offset from the beginning
of the total shared memory block is given by the static member
O::shared_mem_size(target::block_dim()), or 0 if O is void.
*/
template <typename T, typename S, typename O = void> class SharedMemory
Expand All @@ -26,7 +26,6 @@ namespace quda

private:
T *data;
const unsigned int size; // number of elements of type T

/**
@brief This is a dummy instantiation for the host compiler
Expand Down Expand Up @@ -57,13 +56,19 @@ namespace quda
}

public:
/**
@brief Byte offset for this shared memory object.
*/
static constexpr unsigned int get_offset(dim3 block)
{
unsigned int o = 0;
if constexpr (!std::is_same_v<O, void>) { o = O::shared_mem_size(block); }
return o;
}

/**
@brief Shared memory size in bytes.
*/
static constexpr unsigned int shared_mem_size(dim3 block)
{
return get_offset(block) + S::size(block)*sizeof(T);
Expand All @@ -72,10 +77,12 @@ namespace quda
/**
@brief Constructor for SharedMemory object.
*/
constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))),
size(S::size(target::block_dim())) {}
constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) {}

constexpr auto smem() const { return *this; }
/**
@brief Return this SharedMemory object.
*/
constexpr auto sharedMem() const { return *this; }

/**
@brief Subscripting operator returning a reference to element.
Expand Down
1 change: 1 addition & 0 deletions include/targets/cuda/thread_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#else

#include <array.h>

namespace quda
{
template <typename T, int n> struct thread_array : array<T, n> {};
Expand Down
18 changes: 18 additions & 0 deletions include/targets/generic/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,52 @@
namespace quda
{

/**
@brief Element type used for coalesced storage.
*/
template <typename T>
using atom_t = std::conditional_t<sizeof(T) % 16 == 0, int4, std::conditional_t<sizeof(T) % 8 == 0, int2, int>>;

/**
@brief Used to declare an object of fixed size.
*/
template <int N> struct SizeStatic {
static constexpr unsigned int size(dim3) {
return N;
}
};

/**
@brief Used to declare an object of fixed size per thread, N.
*/
template <int N> struct SizePerThread {
static constexpr unsigned int size(dim3 block) {
return N * block.x * block.y * block.z;
}
};

/**
@brief Used to declare an object of fixed size per thread, N, with thread dimensions derermined by D.
*/
template <typename D, int N = 1> struct SizeDims {
static constexpr unsigned int size(dim3 block) {
dim3 dims = D::dims(block);
return dims.x * dims.y * dims.z * N;
}
};

/**
@brief Used to declare an object with dimensions given by the block size.
*/
struct DimsBlock {
static constexpr dim3 dims(dim3 block) {
return block;
}
};

/**
@brief Used to declare an object with fixed dimensions.
*/
template <int x, int y, int z>
struct DimsStatic {
static constexpr dim3 dims(dim3) {
Expand Down
69 changes: 22 additions & 47 deletions include/targets/generic/shared_memory_cache_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,46 +11,38 @@
sharing data between threads in a thread block.
*/

/**
@file shared_memory_cache_helper.h
@brief Convenience overloads to allow SharedMemoryCache objects to
appear in simple expressions. The actual implementation of
SharedMemoryCache is target specific, and located in e.g.,
include/targets/cuda/shared_memory_cache_helper.h, etc.
*/

namespace quda
{

/**
@brief Class which wraps around a shared memory cache for type T,
where each thread in the thread block stores a unique value in
the cache which any other thread can access.
the cache which any other thread can access. The data is stored
in a coalesced order with element size atom_t<T>.
This accessor supports both explicit run-time block size and
compile-time sizing.
The dimensions of the cache is determined by a call to
D::dims(target::block_dim()), and D defaults to having dimensions
equal to the block dimensions.
* For run-time block size, the constructor should be initialied
with the desired block size.
* For compile-time block size, no arguments should be passed to
the constructor, and then the second and third template
parameters correspond to the y and z dimensions of the block,
respectively. The x dimension of the block will be set
according the maximum number of threads possible, given these
dimensions.
A byte offset into the shared memory region can be specified with
the type O, and is given by
O::shared_mem_size(target::block_dim()) if O is not void.
*/
template <typename T, typename D = DimsBlock, typename O = void>
class SharedMemoryCache : SharedMemory<atom_t<T>, SizeDims<D,sizeof(T)/sizeof(atom_t<T>)>, O>
{
using Smem = SharedMemory<atom_t<T>, SizeDims<D,sizeof(T)/sizeof(atom_t<T>)>, O>;

public:
using value_type = T;
using dims_type = D;
using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one
using Smem = SharedMemory<atom_t<T>, SizeDims<D,sizeof(T)/sizeof(atom_t<T>)>, O>;
using offset_type = O;
using Smem::shared_mem_size;

private:
const dim3 block;
const int stride;
using Smem::sharedMem;
using atom_t = atom_t<T>;
static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types");

Expand All @@ -60,21 +52,13 @@ namespace quda
// used to avoid instantiation of load functions if unused, in case T is not a valid return type (e.g. C array)
template <typename dummy = void> using maybeT = std::conditional_t<std::is_same_v<dummy,void>,T,void>;

const dim3 block;
const int stride;

//constexpr Smem smem() const { return *dynamic_cast<const Smem*>(this); }
using Smem::smem;
//constexpr Smem smem() const { return Smem::smem(); }
//constexpr Smem smem() const { return *Smem::smemp(); }

__device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const
{
atom_t tmp[n_element];
memcpy(tmp, (void *)&a, sizeof(T));
int j = (z * block.y + y) * block.x + x;
#pragma unroll
for (int i = 0; i < n_element; i++) smem()[i * stride + j] = tmp[i];
for (int i = 0; i < n_element; i++) sharedMem()[i * stride + j] = tmp[i];
}

template <typename dummy = void>
Expand All @@ -83,7 +67,7 @@ namespace quda
atom_t tmp[n_element];
int j = (z * block.y + y) * block.x + x;
#pragma unroll
for (int i = 0; i < n_element; i++) tmp[i] = smem()[i * stride + j];
for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[i * stride + j];
T a;
memcpy((void *)&a, tmp, sizeof(T));
return a;
Expand All @@ -105,28 +89,20 @@ namespace quda

public:
/**
@brief constructor for SharedMemory cache. If no arguments are
pass, then the dimensions are set according to the templates
block_size_y and block_size_z, together with the derived
block_size_x. Otherwise use the block sizes passed into the
constructor.
@param[in] block Block dimensions for the 3-d shared memory object
@param[in] thread_offset "Perceived" offset from dynamic shared
memory base pointer (used when we have multiple caches in
scope). Need to include block size to actual offset.
@brief Constructor for SharedMemoryCache.
*/
constexpr SharedMemoryCache() :
block(D::dims(target::block_dim())), stride(block.x * block.y * block.z)
{
static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizeDims<D>::size(dim3{8,8,8})*sizeof(T));
// sanity check
static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizeDims<D>::size(dim3{32,16,8})*sizeof(T));
}

/**
@brief Grab the raw base address to shared memory.
*/
__device__ __host__ inline auto data() const {
return reinterpret_cast<T *>(&smem()[0]);
return reinterpret_cast<T *>(&sharedMem()[0]);
}

/**
Expand Down Expand Up @@ -302,11 +278,10 @@ namespace quda

/**
@brief Uniform helper for exposing type T, whether we are dealing
with an instance of T or SharedMemoryCache<T>
with an instance of T or SharedMemoryCache<T,D,O>
*/
template <class T>
struct get_type<
T, std::enable_if_t<std::is_same_v<T, SharedMemoryCache<typename T::value_type, typename T::dims_type, typename T::offset_type>>>> {
struct get_type<T, std::enable_if_t<std::is_same_v<T, SharedMemoryCache<typename T::value_type, typename T::dims_type, typename T::offset_type>>>> {
using type = typename T::value_type;
};

Expand Down
Loading

0 comments on commit b576582

Please sign in to comment.