From 5577bcacc50efba658ad455b0e5b51f9073ff073 Mon Sep 17 00:00:00 2001
From: wu7
Date: Thu, 9 Dec 2021 23:30:13 +0100
Subject: [PATCH] bug fixed for the indexing of large matrix on GPU

---
 ChASE-MPI/impl/chase_mpidla_mgpu.hpp | 16 +++++------
 ChASE-MPI/impl/mgpu_cudaDLA.hpp      | 30 ++++++++++-----------
 ChASE-MPI/kernels/shift.cu           | 40 ++++++++++++++--------------
 3 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/ChASE-MPI/impl/chase_mpidla_mgpu.hpp b/ChASE-MPI/impl/chase_mpidla_mgpu.hpp
index c7a26ec..d511ac6 100644
--- a/ChASE-MPI/impl/chase_mpidla_mgpu.hpp
+++ b/ChASE-MPI/impl/chase_mpidla_mgpu.hpp
@@ -22,20 +22,20 @@
 #include "chase_mpidla_interface.hpp"
 #include "mgpu_cudaDLA.hpp"
 
-void chase_shift_mgpu_matrix(float* A, int* off_m, int* off_n,
-                             int offsize, int ldH, float shift,
+void chase_shift_mgpu_matrix(float* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, float shift,
                              cudaStream_t stream_);
 
-void chase_shift_mgpu_matrix(double* A, int* off_m, int* off_n,
-                             int offsize, int ldH, double shift,
+void chase_shift_mgpu_matrix(double* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, double shift,
                              cudaStream_t stream_);
 
-void chase_shift_mgpu_matrix(std::complex<double>* A, int* off_m, int* off_n,
-                             int offsize, int ldH, double shift,
+void chase_shift_mgpu_matrix(std::complex<double>* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, double shift,
                              cudaStream_t stream_);
 
-void chase_shift_mgpu_matrix(std::complex<float>* A, int* off_m, int* off_n,
-                             int offsize, int ldH, float shift,
+void chase_shift_mgpu_matrix(std::complex<float>* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, float shift,
                              cudaStream_t stream_);
 
diff --git a/ChASE-MPI/impl/mgpu_cudaDLA.hpp b/ChASE-MPI/impl/mgpu_cudaDLA.hpp
index edd72c7..c7358e2 100644
--- a/ChASE-MPI/impl/mgpu_cudaDLA.hpp
+++ b/ChASE-MPI/impl/mgpu_cudaDLA.hpp
@@ -185,9 +185,9 @@ namespace chase {
 
         //for shifting matrix on gpus
-        int start_row, start_col;
-        d_off_m_ = (int**) malloc(num_devices_per_rank * sizeof(int*));
-        d_off_n_ = (int**) malloc(num_devices_per_rank * sizeof(int*));
+        std::size_t start_row, start_col;
+        d_off_m_ = (std::size_t**) malloc(num_devices_per_rank * sizeof(std::size_t*));
+        d_off_n_ = (std::size_t**) malloc(num_devices_per_rank * sizeof(std::size_t*));
 
         for (int dev_x = 0; dev_x < ntile_m_; dev_x++){
             tile_x = get_tile_size_row(dev_x);
@@ -197,7 +197,7 @@ namespace chase {
                 tile_y = get_tile_size_col(dev_y);
                 start_col = dev_y * dim_tile_n_;
                 int dev_id = dev_x * ntile_n_ + dev_y;
-                std::vector<int> off_m, off_n;
+                std::vector<std::size_t> off_m, off_n;
 
                 for(std::size_t j = 0; j < nblocks_; j++){
                     for(std::size_t i = 0; i < mblocks_; i++){
@@ -205,7 +205,7 @@ namespace chase {
                             for(std::size_t p = 0; p < r_lens_[i]; p++){
                                 if(q + c_offs_l_[j] >= start_col && q + c_offs_l_[j] < start_col + tile_y &&
                                    p + r_offs_l_[i] >= start_row && p + r_offs_l_[i] < start_row + tile_x){
-                                    int s, t;
+                                    std::size_t s, t;
                                     //t, s, global index
                                     t = q + c_offs_[j];
                                     s = p + r_offs_[i];
@@ -221,13 +221,13 @@ namespace chase {
                     }
                 }
 
-                int off_size = off_m.size();
+                std::size_t off_size = off_m.size();
                 diagonal_offs_.push_back(off_size);
                 cuda_exec(cudaSetDevice(shmrank_*num_devices_per_rank + dev_id));
-                cudaMalloc(&d_off_m_[dev_id], off_size * sizeof(int));
-                cudaMalloc(&d_off_n_[dev_id], off_size * sizeof(int));
-                cudaMemcpy(d_off_m_[dev_id], off_m.data(), off_size* sizeof(int), cudaMemcpyHostToDevice);
-                cudaMemcpy(d_off_n_[dev_id], off_n.data(), off_size* sizeof(int), cudaMemcpyHostToDevice);
+                cudaMalloc(&d_off_m_[dev_id], off_size * sizeof(std::size_t));
+                cudaMalloc(&d_off_n_[dev_id], off_size * sizeof(std::size_t));
+                cudaMemcpy(d_off_m_[dev_id], off_m.data(), off_size* sizeof(std::size_t), cudaMemcpyHostToDevice);
+                cudaMemcpy(d_off_n_[dev_id], off_n.data(), off_size* sizeof(std::size_t), cudaMemcpyHostToDevice);
 
             }
         }
 
@@ -377,7 +377,7 @@ namespace chase {
         int tile_x, tile_y;
         int count_x = 0, count_y = 0;
-        int start_row, start_col;
+        std::size_t start_row, start_col;
 
         for (int dev_x = 0; dev_x < ntile_m_; dev_x++){
             tile_x = get_tile_size_row(dev_x);
@@ -388,7 +388,7 @@ namespace chase {
                 start_col = dev_y * dim_tile_n_;
                 int dev_id = dev_x * ntile_n_ + dev_y;
 
-                int off_size = diagonal_offs_[dev_id];
+                std::size_t off_size = diagonal_offs_[dev_id];
                 cuda_exec(cudaSetDevice(shmrank_*num_devices_per_rank + dev_id));
                 chase_shift_mgpu_matrix(H_[dev_id], d_off_m_[dev_id], d_off_n_[dev_id],
                                         off_size, ldH, std::real(c), stream_[dev_id]);
@@ -971,9 +971,9 @@ namespace chase {
        std::size_t mblocks_;
 
        //for shifting matrix on gpus
-       int **d_off_m_ = nullptr;
-       int **d_off_n_ = nullptr;
-       std::vector<int> diagonal_offs_;
+       std::size_t **d_off_m_ = nullptr;
+       std::size_t **d_off_n_ = nullptr;
+       std::vector<std::size_t> diagonal_offs_;
 
        /// Return the number of rows of the tile with row-index 'tile_position'
        int get_tile_size_row (int tile_position) {
diff --git a/ChASE-MPI/kernels/shift.cu b/ChASE-MPI/kernels/shift.cu
index 9dfa0af..7264d45 100644
--- a/ChASE-MPI/kernels/shift.cu
+++ b/ChASE-MPI/kernels/shift.cu
@@ -33,40 +33,40 @@ __global__ void zshift_matrix(cuDoubleComplex* A, int n, double shift) {
   if (idx < n) A[(idx)*n + idx].x += shift;
 }
 
-__global__ void sshift_mgpu_matrix(float* A, int* off_m, int* off_n,
-                                   int offsize, int ldH, float shift) {
+__global__ void sshift_mgpu_matrix(float* A, std::size_t* off_m, std::size_t* off_n,
+                                   std::size_t offsize, std::size_t ldH, float shift) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;
-  int ind;
+  std::size_t ind;
   if(i < offsize){
     ind = off_n[i] * ldH + off_m[i];
     A[ind] += shift;
   }
 }
 
-__global__ void dshift_mgpu_matrix(double* A, int* off_m, int* off_n,
-                                   int offsize, int ldH, double shift) {
+__global__ void dshift_mgpu_matrix(double* A, std::size_t* off_m, std::size_t* off_n,
+                                   std::size_t offsize, std::size_t ldH, double shift) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
-  int ind;
+  std::size_t ind;
   if(i < offsize){
     ind = off_n[i] * ldH + off_m[i];
     A[ind] += shift;
   }
 }
 
-__global__ void cshift_mgpu_matrix(cuComplex* A, int* off_m, int* off_n,
-                                   int offsize, int ldH, float shift) {
+__global__ void cshift_mgpu_matrix(cuComplex* A, std::size_t* off_m, std::size_t* off_n,
+                                   std::size_t offsize, std::size_t ldH, float shift) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;
-  int ind;
+  std::size_t ind;
   if(i < offsize){
     ind = off_n[i] * ldH + off_m[i];
     A[ind].x += shift;
   }
 }
 
-__global__ void zshift_mgpu_matrix(cuDoubleComplex* A, int* off_m, int* off_n,
-                                   int offsize, int ldH, double shift) {
+__global__ void zshift_mgpu_matrix(cuDoubleComplex* A, std::size_t* off_m, std::size_t* off_n,
+                                   std::size_t offsize, std::size_t ldH, double shift) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;
-  int ind;
+  std::size_t ind;
   if(i < offsize){
     ind = off_n[i] * ldH + off_m[i];
     A[ind].x += shift;
@@ -101,8 +101,8 @@ void chase_shift_matrix(std::complex<double>* A, int n, double shift,
       reinterpret_cast<cuDoubleComplex*>(A), n, shift);
 }
 
-void chase_shift_mgpu_matrix(float* A, int* off_m, int* off_n,
-                             int offsize, int ldH, float shift,
+void chase_shift_mgpu_matrix(float* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, float shift,
                              cudaStream_t stream_) {
 
   unsigned int grid = (offsize + 256 - 1) / 256;
@@ -114,8 +114,8 @@ void chase_shift_mgpu_matrix(float* A, int* off_m, int* off_n,
 
 }
 
-void chase_shift_mgpu_matrix(double* A, int* off_m, int* off_n,
-                             int offsize, int ldH, double shift,
+void chase_shift_mgpu_matrix(double* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, double shift,
                              cudaStream_t stream_) {
 
   unsigned int grid = (offsize + 256 - 1) / 256;
@@ -126,8 +126,8 @@ void chase_shift_mgpu_matrix(double* A, int* off_m, int* off_n,
 
 }
 
-void chase_shift_mgpu_matrix(std::complex<float>* A, int* off_m, int* off_n,
-                             int offsize, int ldH, float shift,
+void chase_shift_mgpu_matrix(std::complex<float>* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, float shift,
                              cudaStream_t stream_) {
 
  unsigned int grid = (offsize + 256 - 1) / 256;
@@ -140,8 +140,8 @@ void chase_shift_mgpu_matrix(std::complex<float>* A, int* off_m, int* off_n,
 
 }
 
-void chase_shift_mgpu_matrix(std::complex<double>* A, int* off_m, int* off_n,
-                             int offsize, int ldH, double shift,
+void chase_shift_mgpu_matrix(std::complex<double>* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, double shift,
                              cudaStream_t stream_) {
 
  unsigned int grid = (offsize + 256 - 1) / 256;
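
Editorial note (not part of the patch): the failure this commit fixes comes from the index computation in the shift kernels, ind = off_n[i] * ldH + off_m[i]. With int offsets and an int leading dimension, that product is evaluated in 32-bit arithmetic and overflows once it exceeds 2^31 - 1 = 2,147,483,647; for a square per-GPU tile this happens once the dimension passes roughly 46,341 (46,341^2 > 2^31 - 1). Moving every offset, size, and leading dimension to std::size_t, as the patch does, makes the index computation 64-bit. The following standalone CUDA sketch contrasts the two patterns; the file name, the kernel names (shift_diag_int, shift_diag_size_t) and the tile dimension used in main() are illustrative assumptions, not ChASE code.

// shift_overflow_demo.cu -- standalone sketch, NOT part of ChASE.
// Illustrates why the diagonal-shift kernels need 64-bit offsets.
#include <climits>
#include <cstddef>
#include <cstdio>

// Pre-patch pattern: all offsets are 32-bit, so off_n[i] * ldH is computed
// in int arithmetic and can overflow for large leading dimensions.
__global__ void shift_diag_int(double* A, int* off_m, int* off_n,
                               int offsize, int ldH, double shift) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < offsize) {
        int ind = off_n[i] * ldH + off_m[i]; // overflows once the product exceeds INT_MAX
        A[ind] += shift;                     // then indexes far out of bounds
    }
}

// Post-patch pattern: offsets and leading dimension are std::size_t, so the
// linear index is computed in 64-bit arithmetic.
__global__ void shift_diag_size_t(double* A, std::size_t* off_m, std::size_t* off_n,
                                  std::size_t offsize, std::size_t ldH, double shift) {
    std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < offsize) {
        std::size_t ind = off_n[i] * ldH + off_m[i]; // safe for realistic matrix sizes
        A[ind] += shift;
    }
}

int main() {
    // Host-side arithmetic showing where the 32-bit version breaks:
    // the last diagonal element of an n x n tile sits at linear index n*n - 1.
    long long n = 50000;        // hypothetical tile dimension
    long long last = n * n - 1; // 2'499'999'999
    std::printf("last diagonal index for n=%lld: %lld (INT_MAX = %d)\n",
                n, last, INT_MAX);
    return 0;
}

Compiled with nvcc, the demo only prints the host-side arithmetic; the two kernels exist to contrast the pre- and post-patch signatures, mirroring the int -> std::size_t change applied to sshift/dshift/cshift/zshift_mgpu_matrix above.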