From d73437fcace5c35b1d1d35211d3095d0400b6d57 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Thu, 11 Jul 2024 11:20:57 +0200 Subject: [PATCH 1/4] Add cmake flag and instantiate only one by default --- CMakeLists.txt | 1 + core/solver/batch_bicgstab_kernels.hpp | 10 +++++++++- core/solver/batch_cg_kernels.hpp | 10 +++++++++- cuda/solver/batch_bicgstab_kernels.cu | 8 ++++++++ cuda/solver/batch_cg_kernels.cu | 8 ++++++++ dpcpp/solver/batch_bicgstab_kernels.dp.cpp | 8 ++++++++ dpcpp/solver/batch_cg_kernels.dp.cpp | 8 ++++++++ hip/solver/batch_bicgstab_kernels.hip.cpp | 8 ++++++++ hip/solver/batch_cg_kernels.hip.cpp | 8 ++++++++ include/ginkgo/config.hpp.in | 4 ++++ 10 files changed, 71 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 10f76ac9a6c..1f620346ff5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,7 @@ option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic option(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS "Split template instantiations for slow-to-compile files. This improves parallel build performance" ON) mark_as_advanced(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS) option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF) +option(GINKGO_BATCHED_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA/HIP batched solver algorithms" OFF) option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON) option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is OFF." OFF) option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Enabled if a system installation is found." ${PAPI_SDE_FOUND}) diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp index 1eed30aba5a..07ecb1bd834 100644 --- a/core/solver/batch_bicgstab_kernels.hpp +++ b/core/solver/batch_bicgstab_kernels.hpp @@ -6,6 +6,7 @@ #define GKO_CORE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ +#include #include #include #include @@ -15,6 +16,13 @@ #include "core/base/kernel_declaration.hpp" +#ifdef GINKGO_BACTCHED_FULL_OPTIMIZATIONS +constexpr bool bicgstab_no_shared_vecs = false; +#else +constexpr bool bicgstab_no_shared_vecs = true; +#endif + + namespace gko { namespace kernels { namespace batch_bicgstab { @@ -138,7 +146,7 @@ storage_config compute_shared_storage(const int available_shared_mem, // {prec_shared, n_shared, n_global, gmem_stride_bytes, padded_vec_len} storage_config sconf{false, 0, num_main_vecs, 0, num_rows}; // If available shared mem is zero, set all vecs to global. - if (rem_shared <= 0) { + if (rem_shared <= 0 || bicgstab_no_shared_vecs) { set_gmem_stride_bytes(sconf, vec_size, prec_storage); return sconf; } diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp index 6fdb595862e..028223886fe 100644 --- a/core/solver/batch_cg_kernels.hpp +++ b/core/solver/batch_cg_kernels.hpp @@ -6,6 +6,7 @@ #define GKO_CORE_SOLVER_BATCH_CG_KERNELS_HPP_ +#include #include #include #include @@ -15,6 +16,13 @@ #include "core/base/kernel_declaration.hpp" +#ifdef GINKGO_BACTCHED_FULL_OPTIMIZATIONS +constexpr bool cg_no_shared_vecs = false; +#else +constexpr bool cg_no_shared_vecs = true; +#endif + + namespace gko { namespace kernels { namespace batch_cg { @@ -126,7 +134,7 @@ storage_config compute_shared_storage(const int available_shared_mem, // {prec_shared, n_shared, n_global, gmem_stride_bytes, padded_vec_len} storage_config sconf{false, 0, num_main_vecs, 0, num_rows}; // If available shared mem is zero, set all vecs to global. - if (rem_shared <= 0) { + if (rem_shared <= 0 || cg_no_shared_vecs) { set_gmem_stride_bytes(sconf, vec_bytes, prec_storage); return sconf; } diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 6b3dca28607..bc12fc7efde 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -167,6 +167,9 @@ public: value_type* const workspace_data = workspace.get_data(); + // Only instantiate when full optimizations has been enabled. Otherwise, + // just use the default one with no shared memory. +#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel if (sconf.prec_shared) { @@ -229,6 +232,11 @@ public: GKO_NOT_IMPLEMENTED; } } +#else + launch_apply_kernel( + sconf, logger, prec, mat, b.values, x.values, workspace_data, + block_size, shared_size); +#endif } private: diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 746be0365e7..f09b6c70487 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -165,6 +165,9 @@ public: value_type* const workspace_data = workspace.get_data(); + // Only instantiate when full optimizations has been enabled. Otherwise, + // just use the default one with no shared memory. +#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel if (sconf.prec_shared) { @@ -207,6 +210,11 @@ public: GKO_NOT_IMPLEMENTED; } } +#else + launch_apply_kernel( + sconf, logger, prec, mat, b.values, x.values, workspace_data, + block_size, shared_size); +#endif } private: diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp index 344e4af56b9..3b6d5d1c5df 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp +++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp @@ -159,6 +159,9 @@ class kernel_caller { ValueType* const workspace_data = workspace.get_data(); int n_shared_total = sconf.n_shared + int(sconf.prec_shared); + // Only instantiate when full optimizations has been enabled. Otherwise, + // just use the default one with no shared memory. +#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // template // launch_apply_kernel if (num_rows <= 32 && n_shared_total == 10) { @@ -230,6 +233,11 @@ class kernel_caller { GKO_NOT_IMPLEMENTED; } } +#else + launch_apply_kernel(sconf, logger, prec, mat, b.values, + x.values, workspace_data, + group_size, shared_size); +#endif } private: diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp index 0787afa6fd3..36fbe0dc269 100644 --- a/dpcpp/solver/batch_cg_kernels.dp.cpp +++ b/dpcpp/solver/batch_cg_kernels.dp.cpp @@ -158,6 +158,9 @@ class kernel_caller { ValueType* const workspace_data = workspace.get_data(); int n_shared_total = sconf.n_shared + int(sconf.prec_shared); + // Only instantiate when full optimizations has been enabled. Otherwise, + // just use the default one with no shared memory. +#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // template // launch_apply_kernel if (num_rows <= 32 && n_shared_total == 6) { @@ -205,6 +208,11 @@ class kernel_caller { GKO_NOT_IMPLEMENTED; } } +#else + launch_apply_kernel(sconf, logger, prec, mat, b.values, + x.values, workspace_data, + group_size, shared_size); +#endif } private: diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 95a49953b3e..54b63983388 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -149,6 +149,9 @@ class kernel_caller { value_type* const workspace_data = workspace.get_data(); + // Only instantiate when full optimizations has been enabled. Otherwise, + // just use the default one with no shared memory. +#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel( + sconf, logger, prec, mat, b.values, x.values, workspace_data, + block_size, shared_size); +#endif } private: diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 6102749b988..290fd72b9f7 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -149,6 +149,9 @@ class kernel_caller { value_type* const workspace_data = workspace.get_data(); + // Only instantiate when full optimizations has been enabled. Otherwise, + // just use the default one with no shared memory. +#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel( + sconf, logger, prec, mat, b.values, x.values, workspace_data, + block_size, shared_size); +#endif } private: diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 329918399d6..4eb3106633f 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -31,6 +31,10 @@ #cmakedefine GINKGO_JACOBI_FULL_OPTIMIZATIONS +/* Should we use all optimizations for batched solvers? */ +#cmakedefine GINKGO_BATCHED_FULL_OPTIMIZATIONS + + /* Should we compile Ginkgo specifically to tune values? */ #cmakedefine GINKGO_BENCHMARK_ENABLE_TUNING From 605feaa92d388d3253d042acf3a06088aaf058a8 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 23 Jul 2024 14:26:32 +0200 Subject: [PATCH 2/4] [cuda,hip,dpcpp] disable optimized kernels --- CMakeLists.txt | 1 - core/solver/batch_bicgstab_kernels.hpp | 5 +- core/solver/batch_cg_kernels.hpp | 5 +- cuda/solver/batch_bicgstab_kernels.cu | 123 +++++++++--------- cuda/solver/batch_cg_kernels.cu | 81 ++++++------ dpcpp/solver/batch_bicgstab_kernels.dp.cpp | 139 ++++++++++----------- dpcpp/solver/batch_cg_kernels.dp.cpp | 90 +++++++------ hip/solver/batch_bicgstab_kernels.hip.cpp | 120 +++++++++--------- hip/solver/batch_cg_kernels.hip.cpp | 80 ++++++------ include/ginkgo/config.hpp.in | 4 - 10 files changed, 301 insertions(+), 347 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f620346ff5..10f76ac9a6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,7 +55,6 @@ option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic option(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS "Split template instantiations for slow-to-compile files. This improves parallel build performance" ON) mark_as_advanced(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS) option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF) -option(GINKGO_BATCHED_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA/HIP batched solver algorithms" OFF) option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON) option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is OFF." OFF) option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Enabled if a system installation is found." ${PAPI_SDE_FOUND}) diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp index 07ecb1bd834..5bab0e43b26 100644 --- a/core/solver/batch_bicgstab_kernels.hpp +++ b/core/solver/batch_bicgstab_kernels.hpp @@ -16,11 +16,8 @@ #include "core/base/kernel_declaration.hpp" -#ifdef GINKGO_BACTCHED_FULL_OPTIMIZATIONS -constexpr bool bicgstab_no_shared_vecs = false; -#else +// TODO: update when splitting kernels constexpr bool bicgstab_no_shared_vecs = true; -#endif namespace gko { diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp index 028223886fe..031b20b2a61 100644 --- a/core/solver/batch_cg_kernels.hpp +++ b/core/solver/batch_cg_kernels.hpp @@ -16,11 +16,8 @@ #include "core/base/kernel_declaration.hpp" -#ifdef GINKGO_BACTCHED_FULL_OPTIMIZATIONS -constexpr bool cg_no_shared_vecs = false; -#else +// TODO: update when splitting compilation constexpr bool cg_no_shared_vecs = true; -#endif namespace gko { diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index bc12fc7efde..54f489304a7 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -167,76 +167,69 @@ public: value_type* const workspace_data = workspace.get_data(); - // Only instantiate when full optimizations has been enabled. Otherwise, - // just use the default one with no shared memory. -#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS + // TODO: split compilation // Template parameters launch_apply_kernel - if (sconf.prec_shared) { - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - } else { - switch (sconf.n_shared) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 6: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 7: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 8: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 9: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } -#else + // if (sconf.prec_shared) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // block_size, shared_size); + // } else { + // switch (sconf.n_shared) { + // case 0: launch_apply_kernel( sconf, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); -#endif + // break; + // case 1: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 2: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 3: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 4: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 5: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 6: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 7: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 8: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 9: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // default: + // GKO_NOT_IMPLEMENTED; + // } + // } } private: diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index f09b6c70487..b681bd13ce3 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -165,56 +165,51 @@ public: value_type* const workspace_data = workspace.get_data(); + // TODO: split compilation // Only instantiate when full optimizations has been enabled. Otherwise, // just use the default one with no shared memory. -#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel - if (sconf.prec_shared) { - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - } else { - switch (sconf.n_shared) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } -#else + // if (sconf.prec_shared) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // block_size, shared_size); + // } else { + // switch (sconf.n_shared) { + // case 0: launch_apply_kernel( sconf, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); -#endif + // break; + // case 1: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 2: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 3: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 4: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 5: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // default: + // GKO_NOT_IMPLEMENTED; + // } + // } } private: diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp index 3b6d5d1c5df..bb84283b49f 100644 --- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp +++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp @@ -159,85 +159,80 @@ class kernel_caller { ValueType* const workspace_data = workspace.get_data(); int n_shared_total = sconf.n_shared + int(sconf.prec_shared); + // TODO: split compilation // Only instantiate when full optimizations has been enabled. Otherwise, // just use the default one with no shared memory. -#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // template // launch_apply_kernel - if (num_rows <= 32 && n_shared_total == 10) { - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - group_size, shared_size); - } else if (num_rows <= 256 && n_shared_total == 10) { - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - group_size, shared_size); - } else { - switch (n_shared_total) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 6: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 7: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 8: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 9: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 10: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } -#else + // if (num_rows <= 32 && n_shared_total == 10) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // group_size, shared_size); + // } else if (num_rows <= 256 && n_shared_total == 10) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // group_size, shared_size); + // } else { + // switch (n_shared_total) { + // case 0: launch_apply_kernel(sconf, logger, prec, mat, b.values, x.values, workspace_data, group_size, shared_size); -#endif + // break; + // case 1: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 2: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 3: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 4: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 5: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 6: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 7: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 8: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 9: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 10: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // default: + // GKO_NOT_IMPLEMENTED; + // } + // } } private: diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp index 36fbe0dc269..61591f9efb6 100644 --- a/dpcpp/solver/batch_cg_kernels.dp.cpp +++ b/dpcpp/solver/batch_cg_kernels.dp.cpp @@ -160,59 +160,53 @@ class kernel_caller { // Only instantiate when full optimizations has been enabled. Otherwise, // just use the default one with no shared memory. -#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // template // launch_apply_kernel - if (num_rows <= 32 && n_shared_total == 6) { - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - group_size, shared_size); - } else { - switch (n_shared_total) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - case 6: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, group_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } -#else + // if (num_rows <= 32 && n_shared_total == 6) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // group_size, shared_size); + // } else { + // switch (n_shared_total) { + // case 0: launch_apply_kernel(sconf, logger, prec, mat, b.values, x.values, workspace_data, group_size, shared_size); -#endif + // break; + // case 1: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 2: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 3: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 4: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 5: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // case 6: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, group_size, shared_size); + // break; + // default: + // GKO_NOT_IMPLEMENTED; + // } + // } } private: diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 54b63983388..ca49fa5eb9c 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -151,74 +151,68 @@ class kernel_caller { // Only instantiate when full optimizations has been enabled. Otherwise, // just use the default one with no shared memory. -#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - } else { - switch (sconf.n_shared) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 6: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 7: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 8: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 9: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } -#else + // if (sconf.prec_shared) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // block_size, shared_size); + // } else { + // switch (sconf.n_shared) { + // case 0: launch_apply_kernel( sconf, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); -#endif + // break; + // case 1: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 2: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 3: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 4: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 5: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 6: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 7: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 8: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 9: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // default: + // GKO_NOT_IMPLEMENTED; + // } + // } } private: diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 290fd72b9f7..3a1642edfea 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -151,54 +151,48 @@ class kernel_caller { // Only instantiate when full optimizations has been enabled. Otherwise, // just use the default one with no shared memory. -#ifdef GINKGO_BATCHED_FULL_OPTIMIZATIONS // Template parameters launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - } else { - switch (sconf.n_shared) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } -#else + // if (sconf.prec_shared) { + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, workspace_data, + // block_size, shared_size); + // } else { + // switch (sconf.n_shared) { + // case 0: launch_apply_kernel( sconf, logger, prec, mat, b.values, x.values, workspace_data, block_size, shared_size); -#endif + // break; + // case 1: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 2: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 3: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 4: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // case 5: + // launch_apply_kernel( + // sconf, logger, prec, mat, b.values, x.values, + // workspace_data, block_size, shared_size); + // break; + // default: + // GKO_NOT_IMPLEMENTED; + // } + // } } private: diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 4eb3106633f..329918399d6 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -31,10 +31,6 @@ #cmakedefine GINKGO_JACOBI_FULL_OPTIMIZATIONS -/* Should we use all optimizations for batched solvers? */ -#cmakedefine GINKGO_BATCHED_FULL_OPTIMIZATIONS - - /* Should we compile Ginkgo specifically to tune values? */ #cmakedefine GINKGO_BENCHMARK_ENABLE_TUNING From c689cf30dca378ace3499f5f83f61f93d76d47ac Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Mon, 5 Aug 2024 13:02:24 +0200 Subject: [PATCH 3/4] [review] review updates --- core/solver/batch_bicgstab_kernels.hpp | 1 - core/solver/batch_cg_kernels.hpp | 1 - 2 files changed, 2 deletions(-) diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp index 5bab0e43b26..615ed472597 100644 --- a/core/solver/batch_bicgstab_kernels.hpp +++ b/core/solver/batch_bicgstab_kernels.hpp @@ -6,7 +6,6 @@ #define GKO_CORE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_ -#include #include #include #include diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp index 031b20b2a61..b21a2c07d3e 100644 --- a/core/solver/batch_cg_kernels.hpp +++ b/core/solver/batch_cg_kernels.hpp @@ -6,7 +6,6 @@ #define GKO_CORE_SOLVER_BATCH_CG_KERNELS_HPP_ -#include #include #include #include From e5b261f892011f1cd2f4f642931a32a60cf58ef0 Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 13 Aug 2024 13:08:19 +0200 Subject: [PATCH 4/4] use smaller block size on cuda --- cuda/solver/batch_bicgstab_kernels.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 54f489304a7..3c7fe50709c 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -144,10 +144,11 @@ public: const int shmem_per_blk = get_max_dynamic_shared_memory(exec_); - const int block_size = - get_num_threads_per_block( - exec_, mat.num_rows); + // TODO + const int block_size = 256; + // get_num_threads_per_block( + // exec_, mat.num_rows); GKO_ASSERT(block_size >= 2 * config::warp_size); const size_t prec_size = PrecType::dynamic_work_size(