[batch] split cg compilation (cuda)
MarcelKoch committed Jul 10, 2024
1 parent e5bb9aa commit e59ab55
Showing 4 changed files with 293 additions and 137 deletions.
2 changes: 2 additions & 0 deletions cuda/CMakeLists.txt
@@ -4,6 +4,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE)
add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE)
add_instantiation_files(. solver/batch_cg_launch.instantiate.cu BATCH_CG_INSTANTIATE)
# we don't split up the dense kernels into distinct compilations
list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
target_sources(ginkgo_cuda
@@ -41,6 +42,7 @@ target_sources(ginkgo_cuda
solver/batch_bicgstab_kernels.cu
${BATCH_BICGSTAB_INSTANTIATE}
solver/batch_cg_kernels.cu
${BATCH_CG_INSTANTIATE}
solver/lower_trs_kernels.cu
solver/upper_trs_kernels.cu
${GKO_UNIFIED_COMMON_SOURCES}
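The added add_instantiation_files line mirrors the batch BiCGSTAB entry directly above it: solver/batch_cg_launch.instantiate.cu is handed to the instantiation-splitting machinery from template_instantiation.cmake, and the generated sources come back in ${BATCH_CG_INSTANTIATE}, which is then listed in target_sources next to solver/batch_cg_kernels.cu. The presumable effect is that the expensive explicit instantiations of the batch CG launch kernels compile as several small translation units instead of inside one monolithic batch_cg_kernels.cu. The sketch below is a generic, self-contained CUDA analogue of that split; the file and function names are placeholders, not Ginkgo's actual sources.

// solver_launch.cuh (hypothetical): callers see only a declaration, so the
// kernel body is not recompiled by every including translation unit.
template <typename ValueType>
void launch_axpy(const ValueType* x, ValueType* y, ValueType alpha, int n);

// solver_launch.instantiate.cu (hypothetical): definition plus explicit
// instantiations. A build helper can clone this file several times, keeping
// a different subset of the instantiation lines in each clone, so the copies
// stay small and compile in parallel.
template <typename ValueType>
__global__ void axpy_kernel(const ValueType* x, ValueType* y, ValueType alpha,
                            int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] += alpha * x[i];
    }
}

template <typename ValueType>
void launch_axpy(const ValueType* x, ValueType* y, ValueType alpha, int n)
{
    constexpr int block_size = 256;
    const int num_blocks = (n + block_size - 1) / block_size;
    axpy_kernel<ValueType><<<num_blocks, block_size>>>(x, y, alpha, n);
}

// One explicit instantiation per value type; splitting these lines across
// generated files is what divides the compile work.
template void launch_axpy<float>(const float*, float*, float, int);
template void launch_axpy<double>(const double*, double*, double, int);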
179 changes: 42 additions & 137 deletions cuda/solver/batch_cg_kernels.cu
@@ -4,204 +4,110 @@

#include "core/solver/batch_cg_kernels.hpp"

#include <thrust/functional.h>
#include <thrust/transform.h>

#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/math.hpp>

#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/base/types.hpp"
#include "common/cuda_hip/components/cooperative_groups.hpp"
#include "common/cuda_hip/components/reduction.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "common/cuda_hip/components/warp_blas.hpp"
#include "core/base/batch_struct.hpp"
#include "core/matrix/batch_struct.hpp"
#include "core/solver/batch_dispatch.hpp"
#include "cuda/base/batch_struct.hpp"
#include "cuda/matrix/batch_struct.hpp"
#include "cuda/solver/batch_cg_launch.cuh"


namespace gko {
namespace kernels {
namespace cuda {


// NOTE: this default block size is not used for the main solver kernel.
constexpr int default_block_size = 256;
constexpr int sm_oversubscription = 4;


/**
* @brief The batch Cg solver namespace.
*
* @ingroup batch_cg
*/
namespace batch_cg {


#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"


template <typename StopType, typename PrecType, typename LogType,
typename BatchMatrixType, typename ValueType>
int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
const int num_rows)
{
int num_warps = std::max(num_rows / 4, 2);
constexpr int warp_sz = static_cast<int>(config::warp_size);
const int min_block_size = 2 * warp_sz;
const int device_max_threads =
(std::max(num_rows, min_block_size) / warp_sz) * warp_sz;
cudaFuncAttributes funcattr;
cudaFuncGetAttributes(&funcattr,
apply_kernel<StopType, 5, true, PrecType, LogType,
BatchMatrixType, ValueType>);
const int num_regs_used = funcattr.numRegs;
int max_regs_blk = 0;
cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
exec->get_device_id());
const int max_threads_regs =
((max_regs_blk / static_cast<int>(num_regs_used)) / warp_sz) * warp_sz;
int max_threads = std::min(max_threads_regs, device_max_threads);
max_threads = max_threads <= 1024 ? max_threads : 1024;
return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
}


template <typename StopType, typename PrecType, typename LogType,
typename BatchMatrixType, typename ValueType>
int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
{
int shmem_per_sm = 0;
cudaDeviceGetAttribute(&shmem_per_sm,
cudaDevAttrMaxSharedMemoryPerMultiprocessor,
exec->get_device_id());
GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
apply_kernel<StopType, 5, true, PrecType, LogType, BatchMatrixType,
ValueType>,
cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
cudaFuncAttributes funcattr;
cudaFuncGetAttributes(&funcattr,
apply_kernel<StopType, 5, true, PrecType, LogType,
BatchMatrixType, ValueType>);
return funcattr.maxDynamicSharedSizeBytes;
}


template <typename T>
using settings = gko::kernels::batch_cg::settings<T>;


template <typename CuValueType>
template <typename ValueType>
class kernel_caller {
public:
using value_type = CuValueType;
using cuda_value_type = cuda_type<ValueType>;

kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
const settings<remove_complex<value_type>> settings)
const settings<remove_complex<ValueType>> settings)
: exec_{std::move(exec)}, settings_{settings}
{}

template <typename StopType, const int n_shared,
const bool prec_shared_bool, typename PrecType, typename LogType,
typename BatchMatrixType>
void launch_apply_kernel(
const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
PrecType& prec, const BatchMatrixType& mat,
const value_type* const __restrict__ b_values,
value_type* const __restrict__ x_values,
value_type* const __restrict__ workspace_data, const int& block_size,
const size_t& shared_size) const
{
apply_kernel<StopType, n_shared, prec_shared_bool>
<<<mat.num_batch_items, block_size, shared_size,
exec_->get_stream()>>>(sconf, settings_.max_iterations,
settings_.residual_tol, logger, prec, mat,
b_values, x_values, workspace_data);
}
template <typename BatchMatrixType, typename PrecType, typename StopType,
typename LogType>
void call_kernel(
LogType logger, const BatchMatrixType& mat, PrecType prec,
const gko::batch::multi_vector::uniform_batch<const value_type>& b,
const gko::batch::multi_vector::uniform_batch<value_type>& x) const
const gko::batch::multi_vector::uniform_batch<const cuda_value_type>& b,
const gko::batch::multi_vector::uniform_batch<cuda_value_type>& x) const
{
using real_type = gko::remove_complex<value_type>;
using real_type = gko::remove_complex<cuda_value_type>;
const size_type num_batch_items = mat.num_batch_items;
constexpr int align_multiple = 8;
const int padded_num_rows =
ceildiv(mat.num_rows, align_multiple) * align_multiple;
const int shmem_per_blk =
get_max_dynamic_shared_memory<StopType, PrecType, LogType,
BatchMatrixType, value_type>(exec_);
BatchMatrixType, cuda_value_type>(
exec_);
const int block_size =
get_num_threads_per_block<StopType, PrecType, LogType,
BatchMatrixType, value_type>(
BatchMatrixType, cuda_value_type>(
exec_, mat.num_rows);
GKO_ASSERT(block_size >= 2 * config::warp_size);

const size_t prec_size = PrecType::dynamic_work_size(
padded_num_rows, mat.get_single_item_num_nnz());
const auto sconf =
gko::kernels::batch_cg::compute_shared_storage<PrecType,
value_type>(
cuda_value_type>(
shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
b.num_rhs);
const size_t shared_size =
sconf.n_shared * padded_num_rows * sizeof(value_type) +
sconf.n_shared * padded_num_rows * sizeof(cuda_value_type) +
(sconf.prec_shared ? prec_size : 0);
auto workspace = gko::array<value_type>(
exec_,
sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
auto workspace = gko::array<cuda_value_type>(
exec_, sconf.gmem_stride_bytes * num_batch_items /
sizeof(cuda_value_type));
GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cuda_value_type) == 0);

value_type* const workspace_data = workspace.get_data();
cuda_value_type* const workspace_data = workspace.get_data();

// Template parameters launch_apply_kernel<StopType, n_shared,
// prec_shared>
// Template parameters launch_apply_kernel<ValueType, n_shared,
// prec_shared, StopType>
if (sconf.prec_shared) {
launch_apply_kernel<StopType, 5, true>(
sconf, logger, prec, mat, b.values, x.values, workspace_data,
block_size, shared_size);
launch_apply_kernel<cuda_value_type, 5, true, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
} else {
switch (sconf.n_shared) {
case 0:
launch_apply_kernel<StopType, 0, false>(
sconf, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
launch_apply_kernel<cuda_value_type, 0, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 1:
launch_apply_kernel<StopType, 1, false>(
sconf, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
launch_apply_kernel<cuda_value_type, 1, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 2:
launch_apply_kernel<StopType, 2, false>(
sconf, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
launch_apply_kernel<cuda_value_type, 2, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 3:
launch_apply_kernel<StopType, 3, false>(
sconf, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
launch_apply_kernel<cuda_value_type, 3, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 4:
launch_apply_kernel<StopType, 4, false>(
sconf, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
launch_apply_kernel<cuda_value_type, 4, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
case 5:
launch_apply_kernel<StopType, 5, false>(
sconf, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
launch_apply_kernel<cuda_value_type, 5, false, StopType>(
exec_, sconf, settings_, logger, prec, mat, b.values,
x.values, workspace_data, block_size, shared_size);
break;
default:
GKO_NOT_IMPLEMENTED;
@@ -211,7 +117,7 @@ public:

private:
std::shared_ptr<const DefaultExecutor> exec_;
const settings<remove_complex<value_type>> settings_;
const settings<remove_complex<ValueType>> settings_;
};


Expand All @@ -224,9 +130,8 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
batch::MultiVector<ValueType>* const x,
batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
{
using cu_value_type = cuda_type<ValueType>;
auto dispatcher = batch::solver::create_dispatcher<ValueType>(
kernel_caller<cu_value_type>(exec, settings), settings, mat, precon);
kernel_caller<ValueType>(exec, settings), settings, mat, precon);
dispatcher.apply(b, x, logdata);
}

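With this commit, kernel_caller no longer defines launch_apply_kernel as an in-class helper; it now forwards exec_, sconf, and settings_ to a free function declared in cuda/solver/batch_cg_launch.cuh (next file). The definition itself is not part of the hunks shown on this page; it presumably lives in the new solver/batch_cg_launch.instantiate.cu, the fourth changed file, which did not render here. Reconstructed from the removed inline body and the new declaration, it would look roughly like this sketch:

// Hedged reconstruction, not the committed file: the body mirrors the old
// in-class helper, with the executor and settings that used to come from
// kernel_caller members now passed in as parameters. It would sit inside
// namespace gko::kernels::cuda::batch_cg, next to the device-side
// apply_kernel pulled in from
// "common/cuda_hip/solver/batch_cg_kernels.hpp.inc".
#include "cuda/solver/batch_cg_launch.cuh"

template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
          typename PrecType, typename LogType, typename BatchMatrixType>
void launch_apply_kernel(
    std::shared_ptr<const DefaultExecutor> exec,
    const gko::kernels::batch_cg::storage_config& sconf,
    const settings<remove_complex<ValueType>>& settings, LogType& logger,
    PrecType& prec, const BatchMatrixType& mat,
    const cuda_type<ValueType>* const __restrict__ b_values,
    cuda_type<ValueType>* const __restrict__ x_values,
    cuda_type<ValueType>* const __restrict__ workspace_data,
    const int& block_size, const size_t& shared_size)
{
    apply_kernel<StopType, n_shared, prec_shared>
        <<<mat.num_batch_items, block_size, shared_size, exec->get_stream()>>>(
            sconf, settings.max_iterations, settings.residual_tol, logger,
            prec, mat, b_values, x_values, workspace_data);
}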
105 changes: 105 additions & 0 deletions cuda/solver/batch_cg_launch.cuh
@@ -0,0 +1,105 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/types.hpp"
#include "core/base/batch_struct.hpp"
#include "core/matrix/batch_struct.hpp"
#include "core/solver/batch_cg_kernels.hpp"
#include "core/solver/batch_dispatch.hpp"
#include "cuda/base/batch_struct.hpp"
#include "cuda/matrix/batch_struct.hpp"


namespace gko {
namespace kernels {
namespace cuda {
namespace batch_cg {


template <typename T>
using settings = gko::kernels::batch_cg::settings<T>;


template <typename StopType, typename PrecType, typename LogType,
typename BatchMatrixType, typename ValueType>
int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
const int num_rows);

#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_(_vtype, mat_t, log_t, \
pre_t, stop_t) \
int get_num_threads_per_block< \
stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>, \
log_t<gko::remove_complex<cuda_type<_vtype>>>, \
mat_t<const cuda_type<_vtype>>, cuda_type<_vtype>>( \
std::shared_ptr<const DefaultExecutor> exec, const int num_rows)

#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_, \
_vtype)


template <typename StopType, typename PrecType, typename LogType,
typename BatchMatrixType, typename ValueType>
int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec);

#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_( \
_vtype, mat_t, log_t, pre_t, stop_t) \
int get_max_dynamic_shared_memory< \
stop_t<cuda_type<_vtype>>, pre_t<cuda_type<_vtype>>, \
log_t<gko::remove_complex<_vtype>>, mat_t<const cuda_type<_vtype>>, \
cuda_type<_vtype>>(std::shared_ptr<const DefaultExecutor> exec)

#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_, \
_vtype)


template <typename ValueType, int n_shared, bool prec_shared, typename StopType,
typename PrecType, typename LogType, typename BatchMatrixType>
void launch_apply_kernel(
std::shared_ptr<const DefaultExecutor> exec,
const gko::kernels::batch_cg::storage_config& sconf,
const settings<remove_complex<ValueType>>& settings, LogType& logger,
PrecType& prec, const BatchMatrixType& mat,
const cuda_type<ValueType>* const __restrict__ b_values,
cuda_type<ValueType>* const __restrict__ x_values,
cuda_type<ValueType>* const __restrict__ workspace_data,
const int& block_size, const size_t& shared_size);

#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \
log_t, pre_t, stop_t) \
void launch_apply_kernel<cuda_type<_vtype>, _n_shared, _prec_shared, \
stop_t<cuda_type<_vtype>>>( \
std::shared_ptr<const DefaultExecutor> exec, \
const gko::kernels::batch_cg::storage_config& sconf, \
const settings<remove_complex<_vtype>>& settings, \
log_t<cuda_type<gko::remove_complex<cuda_type<_vtype>>>>& logger, \
pre_t<cuda_type<_vtype>>& prec, \
const mat_t<const cuda_type<_vtype>>& mat, \
const cuda_type<_vtype>* const __restrict__ b_values, \
cuda_type<_vtype>* const __restrict__ x_values, \
cuda_type<_vtype>* const __restrict__ workspace_data, \
const int& block_size, const size_t& shared_size)

#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false)
#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false)
#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false)
#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false)
#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false)
#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false)
#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \
GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true)


} // namespace batch_cg
} // namespace cuda
} // namespace kernels
} // namespace gko
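The header above only carries declarations: the function templates themselves plus the GKO_DECLARE_BATCH_CG_* macros, each of which spells out one fully specialized signature. A separate instantiate source (presumably the unrendered batch_cg_launch.instantiate.cu) can then prefix those signatures with `template` to emit explicit instantiations, with GKO_BATCH_INSTANTIATE apparently fanning a single value type out over the supported matrix/logger/preconditioner/stopping-criterion combinations. The following is a simplified, self-contained analogue of that layering; the macro and type names are placeholders, not Ginkgo's.

// Hypothetical analogue of the declare/fan-out macro pattern; CsrLike,
// EllLike, AbsStop, RelStop and the macros below are illustrative only.
struct CsrLike {};
struct EllLike {};
struct AbsStop {};
struct RelStop {};

// Primary template; its definition must be visible where the explicit
// instantiations below are emitted.
template <typename Mat, typename Stop, int n_shared, bool prec_shared>
void launch(const Mat& mat, const Stop& stop)
{
    (void)mat;   // placeholder body
    (void)stop;
}

// One concrete signature per (matrix, stop) pair for a fixed shared-memory
// configuration -- the analogue of GKO_DECLARE_BATCH_CG_LAUNCH.
#define DECLARE_LAUNCH(_mat, _stop, _n_shared, _prec_shared)       \
    void launch<_mat, _stop, _n_shared, _prec_shared>(const _mat&, \
                                                      const _stop&)

// Fan-out over the supported type combinations -- the analogue of
// GKO_BATCH_INSTANTIATE; the leading `template` turns each declaration into
// an explicit instantiation.
#define INSTANTIATE_LAUNCH(_n_shared, _prec_shared)                     \
    template DECLARE_LAUNCH(CsrLike, AbsStop, _n_shared, _prec_shared); \
    template DECLARE_LAUNCH(CsrLike, RelStop, _n_shared, _prec_shared); \
    template DECLARE_LAUNCH(EllLike, AbsStop, _n_shared, _prec_shared); \
    template DECLARE_LAUNCH(EllLike, RelStop, _n_shared, _prec_shared)

// Mirrors GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE, ..., _5_TRUE above.
INSTANTIATE_LAUNCH(0, false);
INSTANTIATE_LAUNCH(5, true);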