Merge branch 'shift_H_mgpu_fix_bug' into 'master'
Fixes a bug in the indexing of large matrices on GPU: offset arrays and index variables are widened from int to std::size_t so that linear indices beyond INT_MAX no longer overflow.

See merge request SLai/ChASE!22
brunowu committed Dec 9, 2021
2 parents a7ce5fe + 5577bca commit 89649df
Showing 3 changed files with 43 additions and 43 deletions.
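
Why the widening matters: the shift kernels compute the linear index off_n[i] * ldH + off_m[i]. With 32-bit int operands that computation overflows once the index passes INT_MAX (2^31 - 1), which for a square tile already happens at a leading dimension around 46,341. A minimal host-side sketch of the failure mode (the dimension n below is a hypothetical example, not taken from the commit):

    #include <cstdio>

    int main() {
        // Hypothetical local tile dimension; large ChASE problems exceed this.
        const long long n = 50000, ldH = n;

        // Linear index of the last diagonal entry, as the kernels compute it:
        // ind = off_n * ldH + off_m, with off_m = off_n = n - 1.
        long long ind64 = (n - 1) * ldH + (n - 1); // 2,499,999,999 > INT_MAX
        int ind32 = (int)ind64;                    // narrows; typically wraps negative

        std::printf("64-bit index: %lld\n", ind64); // 2499999999
        std::printf("32-bit index: %d\n", ind32);   // e.g. -1794967297 on common ABIs
        return 0;
    }
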
16 changes: 8 additions & 8 deletions ChASE-MPI/impl/chase_mpidla_mgpu.hpp
@@ -22,20 +22,20 @@
 #include "chase_mpidla_interface.hpp"
 #include "mgpu_cudaDLA.hpp"
 
-void chase_shift_mgpu_matrix(float* A, int* off_m, int* off_n,
-                             int offsize, int ldH, float shift,
+void chase_shift_mgpu_matrix(float* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, float shift,
                              cudaStream_t stream_);
 
-void chase_shift_mgpu_matrix(double* A, int* off_m, int* off_n,
-                             int offsize, int ldH, double shift,
+void chase_shift_mgpu_matrix(double* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, double shift,
                              cudaStream_t stream_);
 
-void chase_shift_mgpu_matrix(std::complex<double>* A, int* off_m, int* off_n,
-                             int offsize, int ldH, double shift,
+void chase_shift_mgpu_matrix(std::complex<double>* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, double shift,
                              cudaStream_t stream_);
 
-void chase_shift_mgpu_matrix(std::complex<float>* A, int* off_m, int* off_n,
-                             int offsize, int ldH, float shift,
+void chase_shift_mgpu_matrix(std::complex<float>* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, float shift,
                              cudaStream_t stream_);


30 changes: 15 additions & 15 deletions ChASE-MPI/impl/mgpu_cudaDLA.hpp
@@ -185,9 +185,9 @@ namespace chase {
 
 
         //for shifting matrix on gpus
-        int start_row, start_col;
-        d_off_m_ = (int**) malloc(num_devices_per_rank * sizeof(int*));
-        d_off_n_ = (int**) malloc(num_devices_per_rank * sizeof(int*));
+        std::size_t start_row, start_col;
+        d_off_m_ = (std::size_t**) malloc(num_devices_per_rank * sizeof(std::size_t*));
+        d_off_n_ = (std::size_t**) malloc(num_devices_per_rank * sizeof(std::size_t*));
 
         for (int dev_x = 0; dev_x < ntile_m_; dev_x++){
             tile_x = get_tile_size_row(dev_x);
@@ -197,15 +197,15 @@
                 tile_y = get_tile_size_col(dev_y);
                 start_col = dev_y * dim_tile_n_;
                 int dev_id = dev_x * ntile_n_ + dev_y;
-                std::vector<int> off_m, off_n;
+                std::vector<std::size_t> off_m, off_n;
 
                 for(std::size_t j = 0; j < nblocks_; j++){
                     for(std::size_t i = 0; i < mblocks_; i++){
                         for(std::size_t q = 0; q < c_lens_[j]; q++){
                             for(std::size_t p = 0; p < r_lens_[i]; p++){
 
                                 if(q + c_offs_l_[j] >= start_col && q + c_offs_l_[j] < start_col + tile_y && p + r_offs_l_[i] >= start_row && p + r_offs_l_[i] < start_row + tile_x){
-                                    int s, t;
+                                    std::size_t s, t;
                                     //t, s, global index
                                     t = q + c_offs_[j];
                                     s = p + r_offs_[i];
@@ -221,13 +221,13 @@
                     }
                 }
 
-                int off_size = off_m.size();
+                std::size_t off_size = off_m.size();
                 diagonal_offs_.push_back(off_size);
                 cuda_exec(cudaSetDevice(shmrank_*num_devices_per_rank + dev_id));
-                cudaMalloc(&d_off_m_[dev_id], off_size * sizeof(int));
-                cudaMalloc(&d_off_n_[dev_id], off_size * sizeof(int));
-                cudaMemcpy(d_off_m_[dev_id], off_m.data(), off_size* sizeof(int), cudaMemcpyHostToDevice);
-                cudaMemcpy(d_off_n_[dev_id], off_n.data(), off_size* sizeof(int), cudaMemcpyHostToDevice);
+                cudaMalloc(&d_off_m_[dev_id], off_size * sizeof(std::size_t));
+                cudaMalloc(&d_off_n_[dev_id], off_size * sizeof(std::size_t));
+                cudaMemcpy(d_off_m_[dev_id], off_m.data(), off_size* sizeof(std::size_t), cudaMemcpyHostToDevice);
+                cudaMemcpy(d_off_n_[dev_id], off_n.data(), off_size* sizeof(std::size_t), cudaMemcpyHostToDevice);
 
             }
         }
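
The hunks above only widen types; the surrounding loop gathers, per device tile, the local coordinates of the globally diagonal entries (s == t). A condensed host-only sketch of that gathering step — the Blocks struct and names are simplified stand-ins for the class members, and the exact values stored by the elided push_back calls are an assumption, since the diff truncates before them:

    #include <cstddef>
    #include <vector>

    // Simplified stand-ins for the r_/c_ block members used above (assumption).
    struct Blocks {
        std::vector<std::size_t> offs;   // global offsets of the blocks
        std::vector<std::size_t> offs_l; // local offsets of the blocks
        std::vector<std::size_t> lens;   // block lengths
    };

    // Collect tile-local coordinates of global diagonal entries inside the tile
    // [start_row, start_row + tile_x) x [start_col, start_col + tile_y).
    void gather_diag(const Blocks& r, const Blocks& c,
                     std::size_t start_row, std::size_t tile_x,
                     std::size_t start_col, std::size_t tile_y,
                     std::vector<std::size_t>& off_m,
                     std::vector<std::size_t>& off_n) {
        for (std::size_t j = 0; j < c.lens.size(); j++)
            for (std::size_t i = 0; i < r.lens.size(); i++)
                for (std::size_t q = 0; q < c.lens[j]; q++)
                    for (std::size_t p = 0; p < r.lens[i]; p++) {
                        bool in_tile =
                            q + c.offs_l[j] >= start_col && q + c.offs_l[j] < start_col + tile_y &&
                            p + r.offs_l[i] >= start_row && p + r.offs_l[i] < start_row + tile_x;
                        std::size_t t = q + c.offs[j]; // global column index
                        std::size_t s = p + r.offs[i]; // global row index
                        if (in_tile && s == t) {       // diagonal entry owned by this tile
                            off_m.push_back(p + r.offs_l[i] - start_row); // tile-local row (assumed)
                            off_n.push_back(q + c.offs_l[j] - start_col); // tile-local column (assumed)
                        }
                    }
    }
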
@@ -377,7 +377,7 @@ namespace chase {
         int tile_x, tile_y;
         int count_x = 0, count_y = 0;
 
-        int start_row, start_col;
+        std::size_t start_row, start_col;
 
         for (int dev_x = 0; dev_x < ntile_m_; dev_x++){
             tile_x = get_tile_size_row(dev_x);
@@ -388,7 +388,7 @@
                 start_col = dev_y * dim_tile_n_;
                 int dev_id = dev_x * ntile_n_ + dev_y;
 
-                int off_size = diagonal_offs_[dev_id];
+                std::size_t off_size = diagonal_offs_[dev_id];
 
                 cuda_exec(cudaSetDevice(shmrank_*num_devices_per_rank + dev_id));
                 chase_shift_mgpu_matrix(H_[dev_id], d_off_m_[dev_id], d_off_n_[dev_id], off_size, ldH, std::real(c), stream_[dev_id]);
@@ -971,9 +971,9 @@ namespace chase {
         std::size_t mblocks_;
 
         //for shifting matrix on gpus
-        int **d_off_m_ = nullptr;
-        int **d_off_n_ = nullptr;
-        std::vector<int> diagonal_offs_;
+        std::size_t **d_off_m_ = nullptr;
+        std::size_t **d_off_n_ = nullptr;
+        std::vector<std::size_t> diagonal_offs_;
 
         /// Return the number of rows of the tile with row-index 'tile_position'
         int get_tile_size_row (int tile_position) {
40 changes: 20 additions & 20 deletions ChASE-MPI/kernels/shift.cu
@@ -33,40 +33,40 @@ __global__ void zshift_matrix(cuDoubleComplex* A, int n, double shift) {
     if (idx < n) A[(idx)*n + idx].x += shift;
 }
 
-__global__ void sshift_mgpu_matrix(float* A, int* off_m, int* off_n,
-                                   int offsize, int ldH, float shift) {
+__global__ void sshift_mgpu_matrix(float* A, std::size_t* off_m, std::size_t* off_n,
+                                   std::size_t offsize, std::size_t ldH, float shift) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    int ind;
+    std::size_t ind;
     if(i < offsize){
         ind = off_n[i] * ldH + off_m[i];
         A[ind] += shift;
     }
 }
 
-__global__ void dshift_mgpu_matrix(double* A, int* off_m, int* off_n,
-                                   int offsize, int ldH, double shift) {
+__global__ void dshift_mgpu_matrix(double* A, std::size_t* off_m, std::size_t* off_n,
+                                   std::size_t offsize, std::size_t ldH, double shift) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    int ind;
+    std::size_t ind;
     if(i < offsize){
         ind = off_n[i] * ldH + off_m[i];
         A[ind] += shift;
     }
 }
 
-__global__ void cshift_mgpu_matrix(cuComplex* A, int* off_m, int* off_n,
-                                   int offsize, int ldH, float shift) {
+__global__ void cshift_mgpu_matrix(cuComplex* A, std::size_t* off_m, std::size_t* off_n,
+                                   std::size_t offsize, std::size_t ldH, float shift) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    int ind;
+    std::size_t ind;
     if(i < offsize){
         ind = off_n[i] * ldH + off_m[i];
         A[ind].x += shift;
     }
 }
 
-__global__ void zshift_mgpu_matrix(cuDoubleComplex* A, int* off_m, int* off_n,
-                                   int offsize, int ldH, double shift) {
+__global__ void zshift_mgpu_matrix(cuDoubleComplex* A, std::size_t* off_m, std::size_t* off_n,
+                                   std::size_t offsize, std::size_t ldH, double shift) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    int ind;
+    std::size_t ind;
     if(i < offsize){
         ind = off_n[i] * ldH + off_m[i];
         A[ind].x += shift;
@@ -101,8 +101,8 @@ void chase_shift_matrix(std::complex<double>* A, int n, double shift,
         reinterpret_cast<cuDoubleComplex*>(A), n, shift);
 }
 
-void chase_shift_mgpu_matrix(float* A, int* off_m, int* off_n,
-                             int offsize, int ldH, float shift,
+void chase_shift_mgpu_matrix(float* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, float shift,
                              cudaStream_t stream_) {
 
     unsigned int grid = (offsize + 256 - 1) / 256;
@@ -114,8 +114,8 @@ void chase_shift_mgpu_matrix(float* A, int* off_m, int* off_n,
 }
 
 
-void chase_shift_mgpu_matrix(double* A, int* off_m, int* off_n,
-                             int offsize, int ldH, double shift,
+void chase_shift_mgpu_matrix(double* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, double shift,
                              cudaStream_t stream_) {
 
     unsigned int grid = (offsize + 256 - 1) / 256;
@@ -126,8 +126,8 @@ void chase_shift_mgpu_matrix(double* A, int* off_m, int* off_n,
 
 }
 
-void chase_shift_mgpu_matrix(std::complex<float>* A, int* off_m, int* off_n,
-                             int offsize, int ldH, float shift,
+void chase_shift_mgpu_matrix(std::complex<float>* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, float shift,
                              cudaStream_t stream_) {
 
     unsigned int grid = (offsize + 256 - 1) / 256;
@@ -140,8 +140,8 @@ void chase_shift_mgpu_matrix(std::complex<float>* A, int* off_m, int* off_n,
 }
 
 
-void chase_shift_mgpu_matrix(std::complex<double>* A, int* off_m, int* off_n,
-                             int offsize, int ldH, double shift,
+void chase_shift_mgpu_matrix(std::complex<double>* A, std::size_t* off_m, std::size_t* off_n,
+                             std::size_t offsize, std::size_t ldH, double shift,
                              cudaStream_t stream_) {
 
     unsigned int grid = (offsize + 256 - 1) / 256;
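
For completeness, a minimal host-side sketch of driving the updated double-precision launcher with std::size_t offset arrays — a toy 4x4 tile with hypothetical offsets, assuming the declarations above and linking against the compiled shift.cu:

    #include <cstddef>
    #include <cuda_runtime.h>
    #include <vector>

    // Declaration from ChASE-MPI/impl/chase_mpidla_mgpu.hpp (double overload).
    void chase_shift_mgpu_matrix(double* A, std::size_t* off_m, std::size_t* off_n,
                                 std::size_t offsize, std::size_t ldH, double shift,
                                 cudaStream_t stream_);

    int main() {
        const std::size_t ldH = 4;                  // toy leading dimension
        std::vector<double> h_A(ldH * ldH, 1.0);    // 4x4 tile, column-major
        std::vector<std::size_t> off_m{0, 1, 2, 3}; // tile-local rows of diagonal entries
        std::vector<std::size_t> off_n{0, 1, 2, 3}; // tile-local columns
        const std::size_t offsize = off_m.size();

        double* d_A;
        std::size_t *d_off_m, *d_off_n;
        cudaMalloc(&d_A, h_A.size() * sizeof(double));
        cudaMalloc(&d_off_m, offsize * sizeof(std::size_t));
        cudaMalloc(&d_off_n, offsize * sizeof(std::size_t));
        cudaMemcpy(d_A, h_A.data(), h_A.size() * sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(d_off_m, off_m.data(), offsize * sizeof(std::size_t), cudaMemcpyHostToDevice);
        cudaMemcpy(d_off_n, off_n.data(), offsize * sizeof(std::size_t), cudaMemcpyHostToDevice);

        cudaStream_t stream;
        cudaStreamCreate(&stream);
        // Shift every listed diagonal entry of the tile by -0.5.
        chase_shift_mgpu_matrix(d_A, d_off_m, d_off_n, offsize, ldH, -0.5, stream);
        cudaStreamSynchronize(stream);

        cudaStreamDestroy(stream);
        cudaFree(d_A); cudaFree(d_off_m); cudaFree(d_off_n);
        return 0;
    }
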
