diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 13aff53affe..13efc0bd9cc 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -4,6 +4,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE) add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE) +add_instantiation_files(. solver/batch_cg_launch.instantiate.cu BATCH_CG_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda @@ -41,6 +42,7 @@ target_sources(ginkgo_cuda solver/batch_bicgstab_kernels.cu ${BATCH_BICGSTAB_INSTANTIATE} solver/batch_cg_kernels.cu + ${BATCH_CG_INSTANTIATE} solver/lower_trs_kernels.cu solver/upper_trs_kernels.cu ${GKO_UNIFIED_COMMON_SOURCES} diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 746be0365e7..8e06b8f7e88 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -4,147 +4,53 @@ #include "core/solver/batch_cg_kernels.hpp" -#include -#include - #include -#include #include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "cuda/base/batch_struct.hpp" #include "cuda/matrix/batch_struct.hpp" +#include "cuda/solver/batch_cg_launch.cuh" namespace gko { namespace kernels { namespace cuda { - - -// NOTE: this default block size is not used for the main solver kernel. -constexpr int default_block_size = 256; -constexpr int sm_oversubscription = 4; - - -/** - * @brief The batch Cg solver namespace. - * - * @ingroup batch_cg - */ namespace batch_cg { -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" -#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc" - - -template -int get_num_threads_per_block(std::shared_ptr exec, - const int num_rows) -{ - int num_warps = std::max(num_rows / 4, 2); - constexpr int warp_sz = static_cast(config::warp_size); - const int min_block_size = 2 * warp_sz; - const int device_max_threads = - (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; - cudaFuncAttributes funcattr; - cudaFuncGetAttributes(&funcattr, - apply_kernel); - const int num_regs_used = funcattr.numRegs; - int max_regs_blk = 0; - cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, - exec->get_device_id()); - const int max_threads_regs = - ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; - int max_threads = std::min(max_threads_regs, device_max_threads); - max_threads = max_threads <= 1024 ? max_threads : 1024; - return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); -} - - -template -int get_max_dynamic_shared_memory(std::shared_ptr exec) -{ - int shmem_per_sm = 0; - cudaDeviceGetAttribute(&shmem_per_sm, - cudaDevAttrMaxSharedMemoryPerMultiprocessor, - exec->get_device_id()); - GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( - apply_kernel, - cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); - cudaFuncAttributes funcattr; - cudaFuncGetAttributes(&funcattr, - apply_kernel); - return funcattr.maxDynamicSharedSizeBytes; -} - - -template -using settings = gko::kernels::batch_cg::settings; - - -template +template class kernel_caller { public: - using value_type = CuValueType; + using cuda_value_type = cuda_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{std::move(exec)}, settings_{settings} {} - template - void launch_apply_kernel( - const gko::kernels::batch_cg::storage_config& sconf, LogType& logger, - PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = ceildiv(mat.num_rows, align_multiple) * align_multiple; const int shmem_per_blk = get_max_dynamic_shared_memory(exec_); + BatchMatrixType, cuda_value_type>( + exec_); const int block_size = get_num_threads_per_block( + BatchMatrixType, cuda_value_type>( exec_, mat.num_rows); GKO_ASSERT(block_size >= 2 * config::warp_size); @@ -152,56 +58,56 @@ public: padded_num_rows, mat.get_single_item_num_nnz()); const auto sconf = gko::kernels::batch_cg::compute_shared_storage( + cuda_value_type>( shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(cuda_value_type) + (sconf.prec_shared ? prec_size : 0); - auto workspace = gko::array( - exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0); + auto workspace = gko::array( + exec_, sconf.gmem_stride_bytes * num_batch_items / + sizeof(cuda_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cuda_value_type) == 0); - value_type* const workspace_data = workspace.get_data(); + cuda_value_type* const workspace_data = workspace.get_data(); - // Template parameters launch_apply_kernel + // Template parameters launch_apply_kernel if (sconf.prec_shared) { - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); } else { switch (sconf.n_shared) { case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); break; case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); break; case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); break; case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); break; case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); break; case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); break; default: GKO_NOT_IMPLEMENTED; @@ -211,7 +117,7 @@ public: private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; @@ -224,9 +130,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using cu_value_type = cuda_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/cuda/solver/batch_cg_launch.cuh b/cuda/solver/batch_cg_launch.cuh new file mode 100644 index 00000000000..b98264fd383 --- /dev/null +++ b/cuda/solver/batch_cg_launch.cuh @@ -0,0 +1,105 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_cg { + + +template +using settings = gko::kernels::batch_cg::settings; + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows); + +#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_(_vtype, mat_t, log_t, \ + pre_t, stop_t) \ + int get_num_threads_per_block< \ + stop_t>, pre_t>, \ + log_t>>, \ + mat_t>, cuda_type<_vtype>>( \ + std::shared_ptr exec, const int num_rows) + +#define GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK_, \ + _vtype) + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec); + +#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_( \ + _vtype, mat_t, log_t, pre_t, stop_t) \ + int get_max_dynamic_shared_memory< \ + stop_t>, pre_t>, \ + log_t>, mat_t>, \ + cuda_type<_vtype>>(std::shared_ptr exec) + +#define GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY_, \ + _vtype) + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const cuda_type* const __restrict__ b_values, + cuda_type* const __restrict__ x_values, + cuda_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size); + +#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \ + log_t, pre_t, stop_t) \ + void launch_apply_kernel, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_cg::storage_config& sconf, \ + const settings>& settings, \ + log_t>>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const cuda_type<_vtype>* const __restrict__ b_values, \ + cuda_type<_vtype>* const __restrict__ x_values, \ + cuda_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true) + + +} // namespace batch_cg +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/solver/batch_cg_launch.instantiate.cu b/cuda/solver/batch_cg_launch.instantiate.cu new file mode 100644 index 00000000000..a41dadc4bdb --- /dev/null +++ b/cuda/solver/batch_cg_launch.instantiate.cu @@ -0,0 +1,144 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "cuda/solver/batch_cg_launch.cuh" + +#include +#include + +#include +#include + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { + + +// NOTE: this default block size is not used for the main solver kernel. +constexpr int default_block_size = 256; +constexpr int sm_oversubscription = 4; + + +/** + * @brief The batch Cg solver namespace. + * + * @ingroup batch_cg + */ +namespace batch_cg { + + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" +#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc" + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows) +{ + int num_warps = std::max(num_rows / 4, 2); + constexpr int warp_sz = static_cast(config::warp_size); + const int min_block_size = 2 * warp_sz; + const int device_max_threads = + (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; + cudaFuncAttributes funcattr; + cudaFuncGetAttributes(&funcattr, + apply_kernel); + const int num_regs_used = funcattr.numRegs; + int max_regs_blk = 0; + cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, + exec->get_device_id()); + const int max_threads_regs = + ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; + int max_threads = std::min(max_threads_regs, device_max_threads); + max_threads = max_threads <= 1024 ? max_threads : 1024; + return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); +} + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec) +{ + int shmem_per_sm = 0; + cudaDeviceGetAttribute(&shmem_per_sm, + cudaDevAttrMaxSharedMemoryPerMultiprocessor, + exec->get_device_id()); + GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( + apply_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); + cudaFuncAttributes funcattr; + cudaFuncGetAttributes(&funcattr, + apply_kernel); + return funcattr.maxDynamicSharedSizeBytes; +} + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const cuda_type* const __restrict__ b_values, + cuda_type* const __restrict__ x_values, + cuda_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) +{ + apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, as_cuda_type(settings.residual_tol), + logger, prec, mat, b_values, x_values, workspace_data); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_CG_GET_NUM_THREADS_PER_BLOCK); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_CG_GET_MAX_DYNAMIC_SHARED_MEMORY); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE); +// end + + +} // namespace batch_cg +} // namespace cuda +} // namespace kernels +} // namespace gko