more compatibility between CUDA/HIP
upsj committed May 19, 2024
1 parent 2a767da commit 7f1d5be
Showing 9 changed files with 99 additions and 21 deletions.
6 changes: 0 additions & 6 deletions common/cuda_hip/base/sparselib_bindings.hpp
@@ -8,14 +8,8 @@
 
 #ifdef GKO_COMPILING_HIP
 #include "hip/base/hipsparse_bindings.hip.hpp"
-
-#define SPARSELIB_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE
-#define SPARSELIB_SOLVE_POLICY_USE_LEVEL HIPSPARSE_SOLVE_POLICY_USE_LEVEL
 #else // GKO_COMPILING_CUDA
 #include "cuda/base/cusparse_bindings.cuh"
-
-#define SPARSELIB_OPERATION_NON_TRANSPOSE CUSPARSE_OPERATION_NON_TRANSPOSE
-#define SPARSELIB_SOLVE_POLICY_USE_LEVEL CUSPARSE_SOLVE_POLICY_USE_LEVEL
 #endif
 
 
7 changes: 6 additions & 1 deletion cuda/base/cusparse_bindings.hpp
@@ -1471,7 +1471,12 @@ namespace sparselib {
 using namespace cusparse;
 
 
-}
+#define SPARSELIB_OPERATION_TRANSPOSE CUSPARSE_OPERATION_TRANSPOSE
+#define SPARSELIB_OPERATION_NON_TRANSPOSE CUSPARSE_OPERATION_NON_TRANSPOSE
+#define SPARSELIB_SOLVE_POLICY_USE_LEVEL CUSPARSE_SOLVE_POLICY_USE_LEVEL
+
+
+}  // namespace sparselib
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
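Note: with SPARSELIB_OPERATION_TRANSPOSE now defined by both bindings headers (the hipsparse side follows below), sources shared between the CUDA and HIP backends never have to spell out a vendor enum. A minimal self-contained sketch of the aliasing pattern (stand-in enums for illustration, not Ginkgo code):

#include <cstdio>

// Stand-ins for the vendor constants (illustration only).
enum cusparseOperation_t { CUSPARSE_OPERATION_TRANSPOSE = 1 };
enum hipsparseOperation_t { HIPSPARSE_OPERATION_TRANSPOSE = 1 };

#ifdef GKO_COMPILING_HIP
#define SPARSELIB_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE
#else  // GKO_COMPILING_CUDA
#define SPARSELIB_OPERATION_TRANSPOSE CUSPARSE_OPERATION_TRANSPOSE
#endif

int main()
{
    // Shared kernel code mentions only the SPARSELIB_* name; the
    // preprocessor resolves it to the right backend constant.
    std::printf("op = %d\n", static_cast<int>(SPARSELIB_OPERATION_TRANSPOSE));
}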
15 changes: 10 additions & 5 deletions cuda/matrix/fbcsr_kernels.template.cu
@@ -73,6 +73,7 @@ constexpr int default_block_size{512};
 
 namespace {
 
+
 template <typename ValueType>
 void dense_transpose(std::shared_ptr<const CudaExecutor> exec,
                      const size_type nrows, const size_type ncols,
@@ -88,15 +89,16 @@ void dense_transpose(std::shared_ptr<const CudaExecutor> exec,
             blas::pointer_mode_guard pm_guard(handle);
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
-            blas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, nrows, ncols, &alpha,
-                       orig, orig_stride, &beta, trans, trans_stride, trans,
+            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig,
+                       orig_stride, &beta, trans, trans_stride, trans,
                        trans_stride);
         }
     } else {
         GKO_NOT_IMPLEMENTED;
     }
 }
 
+
 }  // namespace
 
 
@@ -141,7 +143,7 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
             const auto trans_stride = nrows;
             auto trans_c = array<ValueType>(exec, nrows * nrhs);
             sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
-                             CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
                              &alpha, descr, values, row_ptrs, col_idxs, bs,
                              b->get_const_values(), in_stride, &beta,
                              trans_c.get_data(), trans_stride);
@@ -199,7 +201,7 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
             dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
                             trans_stride, trans_c.get_data());
             sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
-                             CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
                              alphp, descr, values, row_ptrs, col_idxs, bs,
                              b->get_const_values(), in_stride, betap,
                              trans_c.get_data(), trans_stride);
@@ -245,6 +247,7 @@ void transpose(const std::shared_ptr<const CudaExecutor> exec,
                const matrix::Fbcsr<ValueType, IndexType>* const orig,
                matrix::Fbcsr<ValueType, IndexType>* const trans)
 {
+#ifdef GKO_COMPILING_CUDA
     if (sparselib::is_supported<ValueType, IndexType>::value) {
         const int bs = orig->get_block_size();
         const IndexType nnzb =
@@ -269,7 +272,9 @@ void transpose(const std::shared_ptr<const CudaExecutor> exec,
             fixedblock::compiled_kernels(),
             [bs](int compiled_block_size) { return bs == compiled_block_size; },
             syn::value_list<int>(), syn::type_list<>(), exec, trans);
-    } else {
+    } else
+#endif
+    {
         fallback_transpose(exec, orig, trans);
     }
 }
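Note: the `} else` / `#endif` / `{` sequence above is easy to misread. A reduced sketch of the control flow it produces (hypothetical function, not Ginkgo code):

#include <cstdio>

// Under GKO_COMPILING_CUDA the if/else pair exists and the fallback is the
// else-branch; in other builds only the fallback block remains.
void transpose_dispatch(bool vendor_supported)
{
#ifdef GKO_COMPILING_CUDA
    if (vendor_supported) {
        std::puts("vendor-library transpose");  // fast path, CUDA builds only
    } else
#endif
    {
        static_cast<void>(vendor_supported);  // unused outside CUDA builds
        std::puts("fallback_transpose");      // generic path, always available
    }
}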
4 changes: 4 additions & 0 deletions cuda/matrix/sparsity_csr_kernels.cu
@@ -42,7 +42,11 @@ namespace sparsity_csr {
 
 constexpr int classical_oversubscription = 32;
 constexpr int default_block_size = 512;
+#ifdef GKO_COMPILING_HIP
+constexpr int spmv_block_size = 256;
+#else
 constexpr int spmv_block_size = 128;
+#endif
 constexpr int warps_in_block = 4;
 
 
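Note: both values equal warps_in_block full warps if one assumes the usual widths (64-wide AMD wavefronts, 32-wide CUDA warps): 4 * 64 = 256 and 4 * 32 = 128. A compile-time check of that assumption (sketch, not part of the commit):

#ifdef GKO_COMPILING_HIP
constexpr int assumed_warp_size = 64;  // AMD wavefront (assumption)
constexpr int spmv_block_size = 256;
#else
constexpr int assumed_warp_size = 32;  // NVIDIA warp
constexpr int spmv_block_size = 128;
#endif
constexpr int warps_in_block = 4;
static_assert(spmv_block_size == warps_in_block * assumed_warp_size,
              "spmv block size is four full warps on either backend");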
6 changes: 5 additions & 1 deletion cuda/preconditioner/jacobi_kernels.cu
@@ -31,8 +31,12 @@ namespace cuda {
 namespace jacobi {
 
 
-// a total of 32 warps (1024 threads)
+// a total of 32/16 warps (1024 threads)
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
+constexpr int default_num_warps = 16;
+#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
+constexpr int default_num_warps = 32;
+#endif
 // with current architectures, at most 32 warps can be scheduled per SM (and
 // current GPUs have at most 84 SMs)
 constexpr int default_grid_size = 32 * 32 * 128;
7 changes: 6 additions & 1 deletion hip/base/hipsparse_bindings.hip.hpp
@@ -963,7 +963,12 @@ namespace sparselib {
 using namespace hipsparse;
 
 
-}
+#define SPARSELIB_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE
+#define SPARSELIB_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE
+#define SPARSELIB_SOLVE_POLICY_USE_LEVEL HIPSPARSE_SOLVE_POLICY_USE_LEVEL
+
+
+}  // namespace sparselib
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
67 changes: 62 additions & 5 deletions hip/matrix/fbcsr_kernels.template.hip.cpp
@@ -50,6 +50,7 @@
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
 
+
 namespace gko {
 namespace kernels {
 namespace hip {
@@ -142,7 +143,7 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
             const auto trans_stride = nrows;
             auto trans_c = array<ValueType>(exec, nrows * nrhs);
             sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
-                             HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
                              &alpha, descr, values, row_ptrs, col_idxs, bs,
                              b->get_const_values(), in_stride, &beta,
                              trans_c.get_data(), trans_stride);
@@ -200,7 +201,7 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
             dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
                             trans_stride, trans_c.get_data());
             sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
-                             HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
                              alphp, descr, values, row_ptrs, col_idxs, bs,
                              b->get_const_values(), in_stride, betap,
                              trans_c.get_data(), trans_stride);
Expand All @@ -214,12 +215,68 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
}


namespace {


template <int mat_blk_sz, typename ValueType, typename IndexType>
void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
std::shared_ptr<const DefaultExecutor> exec,
matrix::Fbcsr<ValueType, IndexType>* const mat)
{
constexpr int subwarp_size = config::warp_size;
const auto nbnz = mat->get_num_stored_blocks();
const auto numthreads = nbnz * subwarp_size;
const auto block_size = default_block_size;
const auto grid_dim = ceildiv(numthreads, block_size);
if (grid_dim > 0) {
kernel::transpose_blocks<mat_blk_sz, subwarp_size>
<<<grid_dim, block_size, 0, exec->get_stream()>>>(
nbnz, mat->get_values());
}
}

GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks,
transpose_blocks_impl);


} // namespace


template <typename ValueType, typename IndexType>
void transpose(const std::shared_ptr<const DefaultExecutor> exec,
const matrix::Fbcsr<ValueType, IndexType>* const input,
matrix::Fbcsr<ValueType, IndexType>* const output)
const matrix::Fbcsr<ValueType, IndexType>* const orig,
matrix::Fbcsr<ValueType, IndexType>* const trans)
{
fallback_transpose(exec, input, output);
#ifdef GKO_COMPILING_CUDA
if (sparselib::is_supported<ValueType, IndexType>::value) {
const int bs = orig->get_block_size();
const IndexType nnzb =
static_cast<IndexType>(orig->get_num_stored_blocks());
cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
const IndexType buffer_size = sparselib::bsr_transpose_buffersize(
exec->get_sparselib_handle(), orig->get_num_block_rows(),
orig->get_num_block_cols(), nnzb, orig->get_const_values(),
orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs);
array<char> buffer_array(exec, buffer_size);
auto buffer = buffer_array.get_data();
sparselib::bsr_transpose(
exec->get_sparselib_handle(), orig->get_num_block_rows(),
orig->get_num_block_cols(), nnzb, orig->get_const_values(),
orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs,
trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(),
copyValues, idxBase, buffer);

// transpose blocks
select_transpose_blocks(
fixedblock::compiled_kernels(),
[bs](int compiled_block_size) { return bs == compiled_block_size; },
syn::value_list<int>(), syn::type_list<>(), exec, trans);
} else
#endif
{
fallback_transpose(exec, orig, trans);
}
}


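Note: transpose_blocks_impl assigns one subwarp_size-wide thread group per stored block, so the grid dimension is a round-up division. A small worked example of that arithmetic (values illustrative, not from the commit):

#include <cassert>
#include <cstdint>

// Round-up integer division, as in gko::ceildiv (sketch).
constexpr std::int64_t ceildiv(std::int64_t num, std::int64_t den)
{
    return (num + den - 1) / den;
}

int main()
{
    const std::int64_t nbnz = 1000;        // stored blocks (illustrative)
    const std::int64_t subwarp_size = 64;  // e.g. config::warp_size on AMD
    const std::int64_t block_size = 512;   // default_block_size
    const auto numthreads = nbnz * subwarp_size;  // 64000 threads wanted
    const auto grid_dim = ceildiv(numthreads, block_size);
    assert(grid_dim == 125);  // 64000 / 512; the grid_dim > 0 guard skips
    return 0;                 // the launch entirely when nbnz == 0
}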
4 changes: 4 additions & 0 deletions hip/matrix/sparsity_csr_kernels.hip.cpp
@@ -42,7 +42,11 @@ namespace sparsity_csr {
 
 constexpr int classical_oversubscription = 32;
 constexpr int default_block_size = 512;
+#ifdef GKO_COMPILING_HIP
+constexpr int spmv_block_size = 256;
+#else
 constexpr int spmv_block_size = 128;
+#endif
 constexpr int warps_in_block = 4;
 
 
4 changes: 2 additions & 2 deletions hip/preconditioner/jacobi_kernels.hip.cpp
@@ -32,9 +32,9 @@ namespace jacobi {
 
 
 // a total of 32/16 warps (1024 threads)
-#if GINKGO_HIP_PLATFORM_HCC
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
 constexpr int default_num_warps = 16;
-#else // GINKGO_HIP_PLATFORM_NVCC
+#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
 constexpr int default_num_warps = 32;
 #endif
 // with current architectures, at most 32 warps can be scheduled per SM (and
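Note: the 32/16 split keeps a full 1024-thread block on either platform, assuming the usual widths: 32 warps * 32 threads on NVIDIA, 16 wavefronts * 64 threads on AMD. A compile-time restatement of that reasoning (sketch, not part of the commit):

#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
constexpr int assumed_warp_size = 64;  // AMD wavefront (assumption)
constexpr int default_num_warps = 16;
#else
constexpr int assumed_warp_size = 32;  // NVIDIA warp
constexpr int default_num_warps = 32;
#endif
static_assert(default_num_warps * assumed_warp_size == 1024,
              "one maximally sized thread block on either platform");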

