Skip to content

Commit

Permalink
Single GPU version without cuDecomp dependency.
Browse files Browse the repository at this point in the history
  • Loading branch information
p-costa committed Mar 15, 2024
1 parent 99b706e commit 6708d1c
Show file tree
Hide file tree
Showing 10 changed files with 292 additions and 173 deletions.
4 changes: 2 additions & 2 deletions build.conf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# compiler and compiling profile
#
FCOMP=GNU # options: GNU, NVIDIA, INTEL
FCOMP=NVIDIA # options: GNU, NVIDIA, INTEL
FFLAGS_OPT=1 # for production runs
FFLAGS_OPT_MAX=0 # for production runs (more aggressive optimization)
FFLAGS_DEBUG=0 # for debugging
Expand All @@ -18,5 +18,5 @@ SINGLE_PRECISION=0 # perform the whole calculation in single precision
#
# GPU-related
#
GPU=0
GPU=1
USE_NVTX=0 # use the NVTX-enabled Git branch 'with-nvtx' to see the markers
5 changes: 3 additions & 2 deletions configs/libs.mk
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ override LIBS += -L$(LIBS_DIR)/2decomp-fft -ldecomp2d
override INCS += -I$(LIBS_DIR)/2decomp-fft/mod

ifeq ($(strip $(GPU)),1)
override LIBS += -L$(LIBS_DIR)/cuDecomp/build/lib -lcudecomp -lcudecomp_fort -cudalib=cufft
override INCS += -I$(LIBS_DIR)/cuDecomp/build/include
#override LIBS += -L$(LIBS_DIR)/cuDecomp/build/lib -lcudecomp -lcudecomp_fort -cudalib=cufft
override LIBS += -cudalib=cufft
#override INCS += -I$(LIBS_DIR)/cuDecomp/build/include
endif

ifeq ($(strip $(USE_NVTX)),1)
Expand Down
4 changes: 2 additions & 2 deletions dependencies/external.mk
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
ifeq ($(strip $(GPU)),1)
libs: $(wildcard $(LIBS_DIR)/2decomp-fft/src/*.f90)
cd $(LIBS_DIR)/2decomp-fft && make
cd $(LIBS_DIR)/cuDecomp && make lib -j
# cd $(LIBS_DIR)/cuDecomp && make lib -j
libsclean: $(wildcard $(LIBS_DIR)/2decomp-fft/src/*.f90)
cd $(LIBS_DIR)/2decomp-fft && make clean
cd $(LIBS_DIR)/cuDecomp && make clean
# cd $(LIBS_DIR)/cuDecomp && make clean
else
libs: $(wildcard $(LIBS_DIR)/2decomp-fft/src/*.f90)
cd $(LIBS_DIR)/2decomp-fft && make
Expand Down
36 changes: 18 additions & 18 deletions src/bound.f90
Original file line number Diff line number Diff line change
Expand Up @@ -530,29 +530,29 @@ end subroutine updthalo
#if defined(_OPENACC)
subroutine updthalo_gpu(nh,periods,p)
!
! updates the halo regions of array p by calling the cuDecomp
! halo-update routines (cudecompUpdateHalosX/Y/Z)
!
! nh      -> number of halo points; p is declared with lower bound 1-nh
!            along each dimension, and [nh,nh,nh] is forwarded to cuDecomp
! periods -> logical flags, one per direction, forwarded unchanged to the
!            cuDecomp calls
! p       -> array whose halos are updated
!
! names taken from mod_common_cudecomp under local aliases:
!   work    = work_halo           (work buffer passed to cuDecomp)
!   ch      = handle              (cuDecomp handle)
!   gd      = gd_halo             (grid descriptor passed to cuDecomp)
!   dtype   = cudecomp_real_rp    (data type flag passed to cuDecomp)
!   istream = istream_acc_queue_1 (passed as the stream= argument)
!
! NOTE(review): ipencil_axis is not declared or imported in this subroutine;
!               presumably it is host-associated from the enclosing module -- confirm
! NOTE(review): istat is overwritten by every call and never inspected,
!               so a failed halo update goes unnoticed here
!
use mod_types
use cudecomp
use mod_common_cudecomp, only: work => work_halo, &
ch => handle,gd => gd_halo, &
dtype => cudecomp_real_rp, &
istream => istream_acc_queue_1
!!!PC use cudecomp
!!!PC use mod_common_cudecomp, only: work => work_halo, &
!!!PC ch => handle,gd => gd_halo, &
!!!PC dtype => cudecomp_real_rp, &
!!!PC istream => istream_acc_queue_1
implicit none
integer , intent(in) :: nh
logical , intent(in) :: periods(3)
real(rp), intent(inout), dimension(1-nh:,1-nh:,1-nh:) :: p
integer :: istat
!
! inside the host_data region, p and work refer to their device addresses,
! which is what gets handed to the cuDecomp calls below
!
!$acc host_data use_device(p,work)
!
! the routine (X, Y or Z) is picked from ipencil_axis, and it is called once
! for each of the two remaining directions (the integer argument right
! after periods)
!
select case(ipencil_axis)
case(1)
istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream)
istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream)
case(2)
istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream)
istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream)
case(3)
istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream)
istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream)
end select
!$acc end host_data
!!!PC !$acc host_data use_device(p,work)
!!!PC select case(ipencil_axis)
!!!PC case(1)
!!!PC istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream)
!!!PC istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream)
!!!PC case(2)
!!!PC istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream)
!!!PC istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream)
!!!PC case(3)
!!!PC istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream)
!!!PC istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream)
!!!PC end select
!!!PC !$acc end host_data
end subroutine updthalo_gpu
#endif
end module mod_bound
14 changes: 7 additions & 7 deletions src/common_cudecomp.f90
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,22 @@ module mod_common_cudecomp
#if defined(_OPENACC)
use mod_types
!@cuf use cudafor
use cudecomp
!!!PC use cudecomp
use openacc
use mod_param, only: cudecomp_is_t_in_place
implicit none
public
integer :: cudecomp_real_rp
type(cudecompHandle) :: handle
type(cudecompGridDesc) :: gd_halo,gd_poi
type(cudecompPencilInfo) :: ap_x,ap_y,ap_z,ap_x_poi,ap_y_poi,ap_z_poi
!!!PC integer :: cudecomp_real_rp
!!!PC type(cudecompHandle) :: handle
!!!PC type(cudecompGridDesc) :: gd_halo,gd_poi
!!!PC type(cudecompPencilInfo) :: ap_x,ap_y,ap_z,ap_x_poi,ap_y_poi,ap_z_poi
!
! workspace stuff
!
integer(i8) :: wsize_fft
real(rp), pointer, contiguous, dimension(:) :: work ,work_cuda
real(rp), pointer, contiguous, dimension(:) :: work_halo,work_halo_cuda
!@cuf attributes(device) :: work_cuda,work_halo_cuda
!!!PC real(rp), pointer, contiguous, dimension(:) :: work_halo,work_halo_cuda
!@cuf attributes(device) :: work_cuda !!!PC,work_halo_cuda
real(rp), target, allocatable, dimension(:) :: solver_buf_0,solver_buf_1
#if !defined(_IMPDIFF_1D)
real(rp), allocatable, dimension(:,:,:) :: pz_aux_1,pz_aux_2
Expand Down
175 changes: 93 additions & 82 deletions src/initmpi.f90
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ module mod_initmpi
use mod_common_mpi, only: myid,ierr,halo,ipencil => ipencil_axis
use mod_types
!@acc use openacc
!@acc use cudecomp
!@cuf use cudafor, only: cudaGetDeviceCount,cudaSetDevice
!!!PC !@acc use cudecomp
!!!PC !@cuf use cudafor, only: cudaGetDeviceCount,cudaSetDevice
#if defined(_OPENACC)
use mod_common_cudecomp, only: cudecomp_real_rp, &
ch => handle,gd => gd_halo,gd_poi, &
Expand Down Expand Up @@ -40,8 +40,8 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound)
integer(acc_device_kind) ::dev_type
integer :: local_comm,mydev,ndev
integer :: istat
type(cudecompGridDescConfig) :: conf,conf_poi
type(cudecompGridDescAutotuneOptions) :: atune_conf
!!!PC type(cudecompGridDescConfig) :: conf,conf_poi
!!!PC type(cudecompGridDescAutotuneOptions) :: atune_conf
#else
integer :: comm_cart
#endif
Expand All @@ -53,7 +53,7 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound)
call MPI_COMM_SPLIT_TYPE(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,local_comm,ierr)
call MPI_COMM_RANK(local_comm,mydev,ierr)
dev_type = acc_get_device_type()
#if 1
#if 0
istat = cudaGetDeviceCount(ndev) ! may be tweaked with environment variable CUDA_VISIBLE_DEVICES
mydev = mod(mydev,ndev)
istat = cudaSetDevice(mydev)
Expand All @@ -65,52 +65,52 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound)
call acc_set_device_num(mydev,dev_type)
call acc_init(dev_type)
!
istat = cudecompInit(ch,MPI_COMM_WORLD)
!!!PCistat = cudecompInit(ch,MPI_COMM_WORLD)
!
! setup descriptor for the Poisson solver
!
istat = cudecompGridDescConfigSetDefaults(conf)
conf%transpose_comm_backend = cudecomp_t_comm_backend
conf%transpose_axis_contiguous(:) = [.true.,.true.,.false.]
conf%gdims(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)]
conf%pdims(:) = dims(1:2)
conf%gdims_dist(:) = ng(:)
istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf)
if(rp == dp) then
cudecomp_real_rp = CUDECOMP_DOUBLE
else
cudecomp_real_rp = CUDECOMP_FLOAT
end if
atune_conf%dtype = cudecomp_real_rp
atune_conf%grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE
atune_conf%autotune_transpose_backend = cudecomp_is_t_comm_autotune
atune_conf%disable_nccl_backends = .not.cudecomp_is_t_enable_nccl
atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem
istat = cudecompGridDescCreate(ch,gd_poi,conf,atune_conf)
conf_poi = conf
dims(:) = conf%pdims
!
! setup descriptor for halo exchanges
!
istat = cudecompGridDescConfigSetDefaults(conf)
conf%gdims(:) = ng(:)
conf%pdims(:) = dims(1:2)
conf%halo_comm_backend = cudecomp_h_comm_backend
conf%transpose_axis_contiguous(:) = .false.
istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf)
atune_conf%halo_extents(:) = 1
atune_conf%halo_periods(:) = periods(:)
atune_conf%dtype = cudecomp_real_rp
atune_conf%autotune_halo_backend = cudecomp_is_h_comm_autotune
atune_conf%disable_nccl_backends = .not.cudecomp_is_t_enable_nccl
atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem
if(all(conf_poi%transpose_comm_backend /= [CUDECOMP_TRANSPOSE_COMM_NVSHMEM,CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL])) then
!
! disable NVSHMEM halo backend autotuning when NVSHMEM is NOT used for transposes
!
atune_conf%disable_nvshmem_backends = .true.
end if
istat = cudecompGridDescCreate(ch,gd,conf,atune_conf)
!!!PC istat = cudecompGridDescConfigSetDefaults(conf)
!!!PC conf%transpose_comm_backend = cudecomp_t_comm_backend
!!!PC conf%transpose_axis_contiguous(:) = [.true.,.true.,.false.]
!!!PC conf%gdims(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)]
!!!PC conf%pdims(:) = dims(1:2)
!!!PC conf%gdims_dist(:) = ng(:)
!!!PC istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf)
!!!PC if(rp == dp) then
!!!PC cudecomp_real_rp = CUDECOMP_DOUBLE
!!!PC else
!!!PC cudecomp_real_rp = CUDECOMP_FLOAT
!!!PC end if
!!!PC atune_conf%dtype = cudecomp_real_rp
!!!PC atune_conf%grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE
!!!PC atune_conf%autotune_transpose_backend = cudecomp_is_t_comm_autotune
!!!PC atune_conf%disable_nccl_backends = .not.cudecomp_is_t_enable_nccl
!!!PC atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem
!!!PC istat = cudecompGridDescCreate(ch,gd_poi,conf,atune_conf)
!!!PC conf_poi = conf
!!!PC dims(:) = conf%pdims
!!!PC !
!!!PC ! setup descriptor for halo exchanges
!!!PC !
!!!PC istat = cudecompGridDescConfigSetDefaults(conf)
!!!PC conf%gdims(:) = ng(:)
!!!PC conf%pdims(:) = dims(1:2)
!!!PC conf%halo_comm_backend = cudecomp_h_comm_backend
!!!PC conf%transpose_axis_contiguous(:) = .false.
!!!PC istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf)
!!!PC atune_conf%halo_extents(:) = 1
!!!PC atune_conf%halo_periods(:) = periods(:)
!!!PC atune_conf%dtype = cudecomp_real_rp
!!!PC atune_conf%autotune_halo_backend = cudecomp_is_h_comm_autotune
!!!PC atune_conf%disable_nccl_backends = .not.cudecomp_is_t_enable_nccl
!!!PC atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem
!!!PC if(all(conf_poi%transpose_comm_backend /= [CUDECOMP_TRANSPOSE_COMM_NVSHMEM,CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL])) then
!!!PC !
!!!PC ! disable NVSHMEM halo backend autotuning when NVSHMEM is NOT used for transposes
!!!PC !
!!!PC atune_conf%disable_nvshmem_backends = .true.
!!!PC end if
!!!PC istat = cudecompGridDescCreate(ch,gd,conf,atune_conf)
#endif
call decomp_2d_init(ng(1),ng(2),ng(3),dims(1),dims(2),periods)
#if !defined(_DECOMP_Y) && !defined(_DECOMP_Z)
Expand All @@ -123,40 +123,51 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound)
ipencil_t(:) = pack([1,2,3],[1,2,3] /= ipencil)
is_bound(:,:) = .false.
#if defined(_OPENACC)
!
! fetch lo(:), hi(:), n(:) and n_z(:) from cuDecomp (should match the modified 2decomp one)
!
istat = cudecompGetPencilInfo(ch,gd ,ap_x ,1)
istat = cudecompGetPencilInfo(ch,gd ,ap_y ,2)
istat = cudecompGetPencilInfo(ch,gd ,ap_z ,3)
istat = cudecompGetPencilInfo(ch,gd_poi,ap_x_poi,1)
istat = cudecompGetPencilInfo(ch,gd_poi,ap_y_poi,2)
istat = cudecompGetPencilInfo(ch,gd_poi,ap_z_poi,3)
select case(ipencil)
case(1)
lo(:) = ap_x%lo(:)
hi(:) = ap_x%hi(:)
case(2)
lo(:) = ap_y%lo(:)
hi(:) = ap_y%hi(:)
case(3)
lo(:) = ap_z%lo(:)
hi(:) = ap_z%hi(:)
end select
n(:) = hi(:)-lo(:)+1
n_x_fft(:) = ap_x_poi%shape(:)
n_y_fft(:) = ap_y_poi%shape(:)
lo_z(:) = ap_z%lo(:)
hi_z(:) = ap_z%hi(:)
n_z(:) = ap_z%shape(:)
nb(:,ipencil) = CUDECOMP_RANK_NULL
associate(ip_t => ipencil_t)
istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1),-1,periods(ip_t(1)),nb(0,ip_t(1)))
istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1), 1,periods(ip_t(1)),nb(1,ip_t(1)))
istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2),-1,periods(ip_t(2)),nb(0,ip_t(2)))
istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2), 1,periods(ip_t(2)),nb(1,ip_t(2)))
end associate
where(nb(:,:) == CUDECOMP_RANK_NULL) is_bound(:,:) = .true.
!!!PC !
!!!PC ! fetch lo(:), hi(:), n(:) and n_z(:) from cuDecomp (should match the modified 2decomp one)
!!!PC !
!!!PC istat = cudecompGetPencilInfo(ch,gd ,ap_x ,1)
!!!PC istat = cudecompGetPencilInfo(ch,gd ,ap_y ,2)
!!!PC istat = cudecompGetPencilInfo(ch,gd ,ap_z ,3)
!!!PC istat = cudecompGetPencilInfo(ch,gd_poi,ap_x_poi,1)
!!!PC istat = cudecompGetPencilInfo(ch,gd_poi,ap_y_poi,2)
!!!PC istat = cudecompGetPencilInfo(ch,gd_poi,ap_z_poi,3)
!!!PC select case(ipencil)
!!!PC case(1)
!!!PC lo(:) = ap_x%lo(:)
!!!PC hi(:) = ap_x%hi(:)
!!!PC case(2)
!!!PC lo(:) = ap_y%lo(:)
!!!PC hi(:) = ap_y%hi(:)
!!!PC case(3)
!!!PC lo(:) = ap_z%lo(:)
!!!PC hi(:) = ap_z%hi(:)
!!!PC end select
!!!PC n(:) = hi(:)-lo(:)+1
!!!PC n_x_fft(:) = ap_x_poi%shape(:)
!!!PC n_y_fft(:) = ap_y_poi%shape(:)
!!!PC lo_z(:) = ap_z%lo(:)
!!!PC hi_z(:) = ap_z%hi(:)
!!!PC n_z(:) = ap_z%shape(:)
!!!PC nb(:,ipencil) = CUDECOMP_RANK_NULL
!!!PC associate(ip_t => ipencil_t)
!!!PC istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1),-1,periods(ip_t(1)),nb(0,ip_t(1)))
!!!PC istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1), 1,periods(ip_t(1)),nb(1,ip_t(1)))
!!!PC istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2),-1,periods(ip_t(2)),nb(0,ip_t(2)))
!!!PC istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2), 1,periods(ip_t(2)),nb(1,ip_t(2)))
!!!PC end associate
!!!PC where(nb(:,:) == CUDECOMP_RANK_NULL) is_bound(:,:) = .true.
lo(:) = 1 !!!PC
hi(:) = ng(:) !!!PC
n(:) = hi(:)-lo(:)+1 !!!PC
n_x_fft(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)] !!!PC
n_y_fft(:) = [2*(ng(2)/2+1),ng(3),2*(ng(1)/2+1)] !!!PC
lo_z(:) = lo(:) !!!PC
hi_z(:) = hi(:) !!!PC
n_z(:) = n(:) !!!PC
nb(:,ipencil) = CUDECOMP_RANK_NULL !!!PC
where(nb(:,:) == CUDECOMP_RANK_NULL) is_bound(:,:) = .true. !!!PC
is_bound(:,:) = .true. !!!PC
#else
select case(ipencil)
case(1)
Expand Down
4 changes: 2 additions & 2 deletions src/param.f90
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
! -
module mod_param
use mod_types
!@acc use cudecomp
!!@acc use cudecomp
implicit none
public
!
Expand Down Expand Up @@ -112,7 +112,7 @@ subroutine read_input(myid)
dl(:) = l(:)/(1.*ng(:))
dli(:) = dl(:)**(-1)
visc = visci**(-1)
#if defined(_OPENACC)
#if 0
!
! read cuDecomp parameter file cudecomp.in, if it exists
!
Expand Down
Loading

0 comments on commit 6708d1c

Please sign in to comment.