diff --git a/Exec/DrivenTurbulence/GNUmakefile b/Exec/DrivenTurbulence/GNUmakefile index 9aa34622..c3a2a381 100644 --- a/Exec/DrivenTurbulence/GNUmakefile +++ b/Exec/DrivenTurbulence/GNUmakefile @@ -1,5 +1,5 @@ # AMREX_HOME defines the directory in which we will find all the BoxLib code -AMREX_HOME ?= /project/projectdirs/nyx/src/amrex +AMREX_HOME ?= ../../../amrex # TOP defines the directory in which we will find Source, Exec, etc TOP = ../.. diff --git a/Exec/DrivenTurbulence/heat_cool_stubs.f90 b/Exec/DrivenTurbulence/heat_cool_stubs.f90 index f7f956a1..75e1ca53 100644 --- a/Exec/DrivenTurbulence/heat_cool_stubs.f90 +++ b/Exec/DrivenTurbulence/heat_cool_stubs.f90 @@ -53,3 +53,12 @@ subroutine integrate_state(lo, hi, & dx, time, a, half_dt) end subroutine integrate_state + + +! unused VODE stubs if we are not doing heating/cooling +module vode_aux_module + use amrex_fort_module, only : rt => amrex_real + implicit none + + real(rt) :: z_vode +end module vode_aux_module diff --git a/Exec/DrivenTurbulence/inputs b/Exec/DrivenTurbulence/inputs index ab674a06..6b378880 100644 --- a/Exec/DrivenTurbulence/inputs +++ b/Exec/DrivenTurbulence/inputs @@ -84,7 +84,6 @@ amr.derive_plot_vars = magvort divu MachNumber amr.probin_file = probin # DIAGNOSTICS & VERBOSITY -nyx.show_timings = 0 # show timings nyx.sum_interval = 1 # timesteps between computing mass nyx.v = 2 # verbosity in Castro.cpp amr.v = 2 # verbosity in Amr.cpp diff --git a/Exec/DrivenTurbulence/inputs.regtest b/Exec/DrivenTurbulence/inputs.regtest index 172e04a7..0f470189 100644 --- a/Exec/DrivenTurbulence/inputs.regtest +++ b/Exec/DrivenTurbulence/inputs.regtest @@ -81,7 +81,6 @@ amr.derive_plot_vars = forcex forcey forcez magvort MachNumber amr.probin_file = probin # DIAGNOSTICS & VERBOSITY -nyx.show_timings = 0 # show timings nyx.sum_interval = 1 # timesteps between computing mass nyx.v = 2 # verbosity in Castro.cpp amr.v = 2 # verbosity in Amr.cpp diff --git a/Exec/GravityTests/MacLaurin/GNUmakefile 
b/Exec/GravityTests/MacLaurin/GNUmakefile index 2f332f68..f5430eef 100644 --- a/Exec/GravityTests/MacLaurin/GNUmakefile +++ b/Exec/GravityTests/MacLaurin/GNUmakefile @@ -1,5 +1,5 @@ # AMREX_HOME defines the directory in which we will find all the BoxLib code -AMREX_HOME ?= /project/projectdirs/nyx/src/amrex +AMREX_HOME ?= ../../../../amrex # TOP defines the directory in which we will find Source, Exec, etc TOP = ../../.. diff --git a/Exec/HydroTests/DoubleRarefaction/GNUmakefile b/Exec/HydroTests/DoubleRarefaction/GNUmakefile index d17a3b21..2e694fcd 100644 --- a/Exec/HydroTests/DoubleRarefaction/GNUmakefile +++ b/Exec/HydroTests/DoubleRarefaction/GNUmakefile @@ -1,5 +1,5 @@ # AMREX_HOME defines the directory in which we will find all the BoxLib code -AMREX_HOME ?= /project/projectdirs/nyx/src/amrex +AMREX_HOME ?= ../../../../amrex # TOP defines the directory in which we will find Source, Exec, etc TOP = ../../.. @@ -11,7 +11,6 @@ USE_MPI = FALSE USE_OMP = FALSE PRECISION = DOUBLE -DEBUG = TRUE DEBUG = FALSE # physics diff --git a/Exec/HydroTests/Sedov/GNUmakefile b/Exec/HydroTests/Sedov/GNUmakefile index a7170983..b645faf6 100644 --- a/Exec/HydroTests/Sedov/GNUmakefile +++ b/Exec/HydroTests/Sedov/GNUmakefile @@ -1,17 +1,16 @@ # AMREX_HOME defines the directory in which we will find all the BoxLib code -AMREX_HOME ?= /project/projectdirs/nyx/src/amrex +AMREX_HOME ?= ../../../../amrex # TOP defines the directory in which we will find Source, Exec, etc TOP = ../../.. 
# compilation options -COMP = gcc +COMP = intel #gcc USE_MPI = FALSE USE_OMP = FALSE PRECISION = DOUBLE -DEBUG = TRUE DEBUG = FALSE # physics diff --git a/Exec/HydroTests/Sod/GNUmakefile b/Exec/HydroTests/Sod/GNUmakefile index d17a3b21..2e694fcd 100644 --- a/Exec/HydroTests/Sod/GNUmakefile +++ b/Exec/HydroTests/Sod/GNUmakefile @@ -1,5 +1,5 @@ # AMREX_HOME defines the directory in which we will find all the BoxLib code -AMREX_HOME ?= /project/projectdirs/nyx/src/amrex +AMREX_HOME ?= ../../../../amrex # TOP defines the directory in which we will find Source, Exec, etc TOP = ../../.. @@ -11,7 +11,6 @@ USE_MPI = FALSE USE_OMP = FALSE PRECISION = DOUBLE -DEBUG = TRUE DEBUG = FALSE # physics diff --git a/Exec/HydroTests/StrongShockTube/GNUmakefile b/Exec/HydroTests/StrongShockTube/GNUmakefile index d17a3b21..2e694fcd 100644 --- a/Exec/HydroTests/StrongShockTube/GNUmakefile +++ b/Exec/HydroTests/StrongShockTube/GNUmakefile @@ -1,5 +1,5 @@ # AMREX_HOME defines the directory in which we will find all the BoxLib code -AMREX_HOME ?= /project/projectdirs/nyx/src/amrex +AMREX_HOME ?= ../../../../amrex # TOP defines the directory in which we will find Source, Exec, etc TOP = ../../.. @@ -11,7 +11,6 @@ USE_MPI = FALSE USE_OMP = FALSE PRECISION = DOUBLE -DEBUG = TRUE DEBUG = FALSE # physics diff --git a/Exec/HydroTests/TurbForce/GNUmakefile b/Exec/HydroTests/TurbForce/GNUmakefile index 8c58c1b2..5787b11f 100644 --- a/Exec/HydroTests/TurbForce/GNUmakefile +++ b/Exec/HydroTests/TurbForce/GNUmakefile @@ -1,5 +1,5 @@ # AMREX_HOME defines the directory in which we will find all the BoxLib code -AMREX_HOME ?= /project/projectdirs/nyx/src/amrex +AMREX_HOME ?= ../../../../amrex # TOP defines the directory in which we will find Source, Exec, etc TOP = ../../.. 
diff --git a/Exec/HydroTests/TurbForce/Nyx_setup.cpp b/Exec/HydroTests/TurbForce/Nyx_setup.cpp index 6ccff42e..58a777b5 100644 --- a/Exec/HydroTests/TurbForce/Nyx_setup.cpp +++ b/Exec/HydroTests/TurbForce/Nyx_setup.cpp @@ -233,7 +233,7 @@ Nyx::hydro_setup() ppm_flatten_before_integrals, use_colglaz, use_flattening, corner_coupling, version_2, use_const_species, gamma, normalize_species, - heat_cool_type, ParallelDescriptor::Communicator()); + heat_cool_type); if (use_const_species == 1) fort_set_eos_params(h_species, he_species); @@ -664,7 +664,7 @@ Nyx::no_hydro_setup() ppm_flatten_before_integrals, use_colglaz, use_flattening, corner_coupling, version_2, use_const_species, gamma, normalize_species, - heat_cool_type, ParallelDescriptor::Communicator()); + heat_cool_type); int coord_type = Geometry::Coord(); fort_set_problem_params(dm, phys_bc.lo(), phys_bc.hi(), Outflow, Symmetry, coord_type); diff --git a/Exec/LyA/32.nyx b/Exec/LyA/32.nyx deleted file mode 100644 index 827a0059..00000000 Binary files a/Exec/LyA/32.nyx and /dev/null differ diff --git a/Exec/LyA/64sssss_20mpc.nyx b/Exec/LyA/64sssss_20mpc.nyx new file mode 100644 index 00000000..019faac7 Binary files /dev/null and b/Exec/LyA/64sssss_20mpc.nyx differ diff --git a/Exec/LyA/GNUmakefile b/Exec/LyA/GNUmakefile index e68c175b..99d40ed7 100644 --- a/Exec/LyA/GNUmakefile +++ b/Exec/LyA/GNUmakefile @@ -1,50 +1,35 @@ -# AMREX_HOME defines the directory in which we will find all the BoxLib code -AMREX_HOME ?= /project/projectdirs/nyx/src/amrex +# AMREX_HOME defines the directory in which we will find all the AMReX code +AMREX_HOME ?= ../../../amrex -HPGMG_DIR ?= /global/homes/f/friesen/hpgmg/finite-volume +HPGMG_DIR ?= ../../Util/hpgmg/finite-volume +CVODE_LIB_DIR ?= ../../../sundials/sundials-intel/lib # TOP defines the directory in which we will find Source, Exec, etc TOP = ../.. 
# compilation options -COMP = gcc +COMP = intel # gnu USE_MPI = TRUE USE_OMP = TRUE -PROFILE = FALSE +PROFILE = TRUE TRACE_PROFILE = FALSE COMM_PROFILE = FALSE PRECISION = DOUBLE +USE_SINGLE_PRECISION_PARTICLES = TRUE DEBUG = FALSE GIMLET = FALSE REEBER = FALSE -GIMLET_DIR ?= /home/vince/Development/gimlet -# Gimlet needs FFTW MPI. -FFTW_INC ?= /usr/include -FFTW_DIR ?= /usr/lib/x86_64-linux-gnu - -REEBER_HOME ?= /project/projectdirs/nyx/ghweber/reeber2 -# Reeber needs Boost (both headers and libraries) and diy2. -BOOST_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/boost-1.61.0-noarch/include -DIY_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/diy/include - -USE_HPGMG = FALSE -HPGMG_FCYCLES = FALSE -HPGMG_POST_F_CYCLE_TYPE = V -HPGMG_HELMHOLTZ = FALSE -HPGMG_STENCIL_VARIABLE_COEFFICIENT = FALSE -HPGMG_USE_SUBCOMM = TRUE -HPGMG_BOTTOM_SOLVER= BICGSTAB -HPGMG_SMOOTHER = GSRB +USE_HPGMG = TRUE # physics DIM = 3 USE_GRAV = TRUE USE_HEATCOOL = TRUE - +USE_AGN = FALSE USE_CVODE = FALSE Bpack := ./Make.package diff --git a/Exec/LyA/Make.package b/Exec/LyA/Make.package index ce986e64..13af1531 100644 --- a/Exec/LyA/Make.package +++ b/Exec/LyA/Make.package @@ -1,5 +1,2 @@ f90EXE_sources += Prob_${DIM}d.f90 f90EXE_sources += probdata.f90 -ifeq ($(USE_CVODE), TRUE) - f90EXE_sources += fcvode_extras.f90 -endif diff --git a/Exec/LyA/Prob_3d.f90 b/Exec/LyA/Prob_3d.f90 index f1727bea..2e82541f 100644 --- a/Exec/LyA/Prob_3d.f90 +++ b/Exec/LyA/Prob_3d.f90 @@ -65,10 +65,11 @@ subroutine fort_initdata(level,time,lo,hi, & bind(C, name="fort_initdata") use amrex_fort_module, only : rt => amrex_real + use amrex_parmparse_module use probdata_module use atomic_rates_module, only : XHYDROGEN use meth_params_module, only : URHO, UMX, UMZ, UEDEN, UEINT, UFS, & - small_dens, TEMP_COMP, NE_COMP + small_dens, TEMP_COMP, NE_COMP, ZHI_COMP implicit none @@ -81,6 +82,13 @@ subroutine fort_initdata(level,time,lo,hi, & real(rt) diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,nd) integer i,j,k + real(rt) 
z_in + + type(amrex_parmparse) :: pp + + call amrex_parmparse_build(pp, "nyx") + call pp%query("initial_z", z_in) + call amrex_parmparse_destroy(pp) ! This is the case where we have compiled with states defined ! but they have only one component each so we fill them this way. @@ -90,7 +98,7 @@ subroutine fort_initdata(level,time,lo,hi, & diag_eos(:,:,:,1) = 0.0d0 ! This is the regular case with NO_HYDRO = FALSE - else if (ns.gt.1 .and. nd.eq.2) then + else if (ns.gt.1 .and. nd.ge.2) then do k = lo(3), hi(3) do j = lo(2), hi(2) @@ -108,8 +116,13 @@ subroutine fort_initdata(level,time,lo,hi, & state(i,j,k,UFS+1) = (1.d0 - XHYDROGEN) end if - diag_eos(i,j,k,TEMP_COMP) = 1000.d0 - diag_eos(i,j,k, NE_COMP) = 0.d0 + diag_eos(i,j,k,TEMP_COMP) = 0.021d0*(1.0d0 + z_in)**2 + diag_eos(i,j,k, NE_COMP) = 0.d0 + + if (ZHI_COMP .gt. -1) then + diag_eos(i,j,k, ZHI_COMP) = 7.5d0 + endif + enddo enddo enddo diff --git a/Exec/LyA/fcvode_extras.f90 b/Exec/LyA/fcvode_extras.f90 deleted file mode 100644 index 450a4c0e..00000000 --- a/Exec/LyA/fcvode_extras.f90 +++ /dev/null @@ -1,90 +0,0 @@ -module fcvode_extras - - implicit none - - contains - - subroutine fcvode_wrapper(dt, rho_in, T_in, ne_in, e_in, neq, cvmem, & - sunvec_y, yvec, T_out, ne_out, e_out) - - use amrex_fort_module, only : rt => amrex_real - use vode_aux_module, only: rho_vode, T_vode, ne_vode - use cvode_interface - use fnvector_serial - use, intrinsic :: iso_c_binding - - implicit none - - real(rt), intent(in ) :: dt - real(rt), intent(in ) :: rho_in, T_in, ne_in, e_in - type(c_ptr), value :: cvmem - type(c_ptr), value :: sunvec_y - real(rt), intent( out) :: T_out,ne_out,e_out - - real(c_double) :: atol, rtol - real(c_double) :: time, tout - integer(c_long), intent(in) :: neq - real(c_double), pointer, intent(in) :: yvec(:) - - integer(c_int) :: ierr - - real(c_double) :: t_soln - - T_vode = T_in - ne_vode = ne_in - rho_vode = rho_in - - ! Initialize the integration time - time = 0.d0 - - ! 
We will integrate "e" in time. - yvec(1) = e_in - - ! Set the tolerances. - atol = 1.d-4 * e_in - rtol = 1.d-4 - - ierr = FCVodeReInit(cvmem, time, sunvec_y) - ierr = FCVodeSStolerances(CVmem, rtol, atol) - - ierr = FCVode(CVmem, dt, sunvec_y, time, CV_NORMAL) - - e_out = yvec(1) - T_out = T_vode - ne_out = ne_vode - - end subroutine fcvode_wrapper - - integer(c_int) function RhsFn(tn, sunvec_y, sunvec_f, user_data) & - result(ierr) bind(C,name='RhsFn') - - use, intrinsic :: iso_c_binding - use fnvector_serial - use cvode_interface - implicit none - - real(c_double), value :: tn - type(c_ptr), value :: sunvec_y - type(c_ptr), value :: sunvec_f - type(c_ptr), value :: user_data - - ! pointers to data in SUNDAILS vectors - real(c_double), pointer :: yvec(:) - real(c_double), pointer :: fvec(:) - - real(c_double) :: energy - - integer(c_long), parameter :: neq = 1 - - ! get data arrays from SUNDIALS vectors - call N_VGetData_Serial(sunvec_y, neq, yvec) - call N_VGetData_Serial(sunvec_f, neq, fvec) - - call f_rhs(1, tn, yvec(1), energy, 0.0, 0) - - fvec(1) = energy - - ierr = 0 - end function RhsFn - -end module fcvode_extras diff --git a/Exec/LyA/inputs b/Exec/LyA/inputs index 51144b63..539fa4a7 100644 --- a/Exec/LyA/inputs +++ b/Exec/LyA/inputs @@ -1,28 +1,25 @@ # ------------------ INPUTS TO MAIN PROGRAM ------------------- max_step = 10000000 -nyx.ppm_type = 0 -nyx.use_colglaz = 1 -nyx.add_ext_src = 1 -nyx.heat_cool_type = 1 -nyx.strang_split = 1 -gravity.show_timings = 1 -nyx.show_timings = 1 +nyx.ppm_type = 1 +nyx.ppm_reference = 1 +nyx.use_colglaz = 0 +nyx.corner_coupling = 1 -#This is 1e-8 times the lowest density in plt00000 -nyx.small_dens = 5.162470e1 +nyx.strang_split = 1 +nyx.add_ext_src = 1 +nyx.heat_cool_type = 3 +#nyx.simd_width = 8 -#This is 1e-5 times the constant temparature in plt00000 +nyx.small_dens = 1.e-2 nyx.small_temp = 1.e-2 -#This is 1e-8 times the lowest pressure in plt00000 -nyx.small_pres = 3.487507e2 - nyx.do_santa_barbara = 1 
nyx.init_sb_vels = 1 -gravity.sl_tol = 1.e-12 +gravity.ml_tol = 1.e-10 +gravity.sl_tol = 1.e-10 -nyx.initial_z = 100.0 +nyx.initial_z = 159.0 nyx.final_z = 2.0 #File written during the run: nstep | time | dt | redshift | a @@ -31,13 +28,15 @@ amr.data_log = runlog #This is how we restart from a checkpoint and write an ascii particle file #Leave this commented out in cvs version -#amr.restart = chk00070 +#amr.restart = chk00100 #max_step = 4 #particles.particle_output_file = particle_output gravity.gravity_type = PoissonGrav gravity.no_sync = 1 gravity.no_composite = 1 +gravity.solve_with_cpp = 0 +gravity.solve_with_hpgmg = 1 mg.bottom_solver = 4 @@ -48,11 +47,12 @@ geometry.coord_sys = 0 geometry.prob_lo = 0 0 0 #Domain size in Mpc -geometry.prob_hi = 8.0 8.0 8.0 - -amr.n_cell = 32 32 32 -amr.max_grid_size = 16 +geometry.prob_hi = 28.49002849 28.49002849 28.49002849 +amr.n_cell = 64 64 64 +amr.max_grid_size = 32 +#fabarray.mfiter_tile_size = 128 8 8 +fabarray.mfiter_tile_size = 1024000 8 8 # >>>>>>>>>>>>> BC FLAGS <<<<<<<<<<<<<<<< # 0 = Interior 3 = Symmetry @@ -66,32 +66,44 @@ nyx.hi_bc = 0 0 0 nyx.do_hydro = 1 nyx.do_grav = 1 -# COMOVING -nyx.comoving_OmM = 0.27 -nyx.comoving_OmB = 0.045 -nyx.comoving_h = 0.71d0 +# COSMOLOGY +nyx.comoving_OmM = 0.275 +nyx.comoving_OmB = 0.046 +nyx.comoving_h = 0.702d0 + +# UVB and reionization +nyx.inhomo_reion = 0 +nyx.inhomo_zhi_file = "zhi.bin" +nyx.inhomo_grid = 512 +nyx.uvb_rates_file = "TREECOOL_middle" +nyx.uvb_density_A = 1.0 +nyx.uvb_density_B = 0.0 +nyx.reionization_zHI_flash = -1.0 +nyx.reionization_zHeII_flash = -1.0 +nyx.reionization_T_zHI = 2.0e4 +nyx.reionization_T_zHeII = 1.5e4 # PARTICLES nyx.do_dm_particles = 1 # >>>>>>>>>>>>> PARTICLE INIT OPTIONS <<<<<<<<<<<<<<<< -# "AsciiFile" "Random" "Cosmological" +# "AsciiFile" "Random" "Cosmological" # >>>>>>>>>>>>> PARTICLE INIT OPTIONS <<<<<<<<<<<<<<<< nyx.particle_init_type = BinaryFile -nyx.binary_particle_file = 32.nyx +nyx.binary_particle_file = 64sssss_20mpc.nyx 
+particles.nparts_per_read = 2097152 # >>>>>>>>>>>>> PARTICLE MOVE OPTIONS <<<<<<<<<<<<<<<< # "Gravitational" "Random" # >>>>>>>>>>>>> PARTICLE MOVE OPTIONS <<<<<<<<<<<<<<<< nyx.particle_move_type = Gravitational - # TIME STEP CONTROL nyx.relative_max_change_a = 0.01 # max change in scale factor particles.cfl = 0.5 # 'cfl' for particles -nyx.cfl = 0.9 # cfl number for hyperbolic system +nyx.cfl = 0.5 # cfl number for hyperbolic system nyx.init_shrink = 1.0 # scale back initial timestep -nyx.change_max = 1.1 # factor by which timestep can change +nyx.change_max = 2.0 # factor by which timestep can change nyx.dt_cutoff = 5.e-20 # level 0 timestep below which we halt # DIAGNOSTICS & VERBOSITY @@ -109,20 +121,27 @@ amr.max_level = 0 # maximum level number allowed #amr.regrid_int = 4 4 4 4 #amr.n_error_buf = 0 0 0 8 #amr.refine_grid_layout = 1 -#amr.regrid_on_restart = 1 +amr.regrid_on_restart = 1 #amr.blocking_factor = 32 +#amr.nosub = 1 # CHECKPOINT FILES -amr.check_file = chk -amr.check_int = 1000 +amr.checkpoint_files_output = 1 +amr.check_file = chk +amr.check_int = 100 +amr.checkpoint_nfiles = 64 # PLOTFILES +fab.format = NATIVE_32 +amr.plot_files_output = 1 amr.plot_file = plt -amr.plot_int = 1000 +amr.plot_int = -1 +amr.plot_nfiles = 64 +nyx.plot_z_values = 7.0 6.0 5.0 4.0 3.0 2.0 +particles.write_in_plotfile = 1 -amr.plot_vars = ALL -amr.derive_plot_vars = particle_count particle_mass_density pressure magvel +amr.plot_vars = density xmom ymom zmom rho_e Temp phi_grav +amr.derive_plot_vars = particle_mass_density #PROBIN FILENAME amr.probin_file = probin - diff --git a/Exec/LyA/inputs.rt b/Exec/LyA/inputs.rt index 4ecdc100..d78fa0de 100644 --- a/Exec/LyA/inputs.rt +++ b/Exec/LyA/inputs.rt @@ -7,8 +7,8 @@ nyx.add_ext_src = 1 nyx.heat_cool_type = 3 nyx.strang_split = 1 -gravity.show_timings = 0 -nyx.show_timings = 0 +nyx.inhomo_reion = 0 +nyx.uvb_rates_file = "TREECOOL_middle" #This is 1e-8 times the lowest density in plt00000 nyx.small_dens = 5.162470e1 @@ -16,9 
+16,6 @@ nyx.small_dens = 5.162470e1 #This is 1e-5 times the constant temparature in plt00000 nyx.small_temp = 1.e-2 -#This is 1e-8 times the lowest pressure in plt00000 -nyx.small_pres = 3.487507e2 - nyx.do_santa_barbara = 1 nyx.init_sb_vels = 1 gravity.sl_tol = 1.e-12 diff --git a/Exec/LyA/inputs.small.dsc b/Exec/LyA/inputs.small.dsc index 1080f4e0..8b4009fe 100644 --- a/Exec/LyA/inputs.small.dsc +++ b/Exec/LyA/inputs.small.dsc @@ -13,8 +13,6 @@ nyx.use_colglaz = 1 nyx.add_ext_src = 1 nyx.heat_cool_type = 1 nyx.strang_split = 1 -gravity.show_timings = 1 -nyx.show_timings = 1 #This is 1e-8 times the lowest density in plt00000 nyx.small_dens = 5.162470e1 @@ -22,9 +20,6 @@ nyx.small_dens = 5.162470e1 #This is 1e-5 times the constant temparature in plt00000 nyx.small_temp = 1.e-2 -#This is 1e-8 times the lowest pressure in plt00000 -nyx.small_pres = 3.487507e2 - nyx.do_santa_barbara = 1 nyx.init_sb_vels = 1 gravity.sl_tol = 1.e-12 diff --git a/Exec/LyA/inputs_gimlet_in_transit.dsc b/Exec/LyA/inputs_gimlet_in_transit.dsc index fff008ce..ef3ceca4 100644 --- a/Exec/LyA/inputs_gimlet_in_transit.dsc +++ b/Exec/LyA/inputs_gimlet_in_transit.dsc @@ -14,18 +14,12 @@ nyx.add_ext_src = 1 nyx.heat_cool_type = 3 nyx.strang_split = 1 -gravity.show_timings = 1 -nyx.show_timings = 1 - #This is 1e-8 times the lowest density in plt00000 nyx.small_dens = 5.162470e1 #This is 1e-5 times the constant temparature in plt00000 nyx.small_temp = 1.e-2 -#This is 1e-8 times the lowest pressure in plt00000 -nyx.small_pres = 3.487507e2 - nyx.do_santa_barbara = 1 nyx.init_sb_vels = 1 gravity.ml_tol = 1.e-10 diff --git a/Exec/LyA/integrate_state_vode_3d.f90 b/Exec/LyA/integrate_state_vode_3d.f90 deleted file mode 100644 index a866d3ac..00000000 --- a/Exec/LyA/integrate_state_vode_3d.f90 +++ /dev/null @@ -1,243 +0,0 @@ -subroutine integrate_state_vode(lo, hi, & - state , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, & - diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, & - a, half_dt, min_iter, max_iter) -! -! 
Calculates the sources to be added later on. -! -! Parameters -! ---------- -! lo : double array (3) -! The low corner of the current box. -! hi : double array (3) -! The high corner of the current box. -! state_* : double arrays -! The state vars -! diag_eos_* : double arrays -! Temp and Ne -! src_* : doubles arrays -! The source terms to be added to state (iterative approx.) -! double array (3) -! The low corner of the entire domain -! a : double -! The current a -! half_dt : double -! time step size, in Mpc km^-1 s ~ 10^12 yr. -! -! Returns -! ------- -! state : double array (dims) @todo -! The state vars -! - use amrex_fort_module, only : rt => amrex_real - use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, & - TEMP_COMP, NE_COMP, gamma_minus_1 - use bl_constants_module, only: M_PI - use eos_params_module - use network - use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT - use fundamental_constants_module - use comoving_module, only: comoving_h, comoving_OmB - use atomic_rates_module, only: tabulate_rates, interp_to_this_z, YHELIUM - use vode_aux_module , only: z_vode, i_vode, j_vode, k_vode - - implicit none - - integer , intent(in) :: lo(3), hi(3) - integer , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3 - integer , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3 - real(rt), intent(inout) :: state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR) - real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2) - real(rt), intent(in) :: a, half_dt - integer , intent(inout) :: max_iter, min_iter - - integer :: i, j, k - real(rt) :: z, rho - real(rt) :: T_orig, ne_orig, e_orig - real(rt) :: T_out , ne_out , e_out, mu, mean_rhob - - z = 1.d0/a - 1.d0 - - z_vode = z - mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst) - - ! Interpolate from the table to this redshift - call interp_to_this_z(z) - - ! Note that (lo,hi) define the region of the box containing the grow cells - ! Do *not* assume this is just the valid region - ! 
apply heating-cooling to UEDEN and UEINT - - do k = lo(3),hi(3) - do j = lo(2),hi(2) - do i = lo(1),hi(1) - - ! Original values - rho = state(i,j,k,URHO) - e_orig = state(i,j,k,UEINT) / rho - T_orig = diag_eos(i,j,k,TEMP_COMP) - ne_orig = diag_eos(i,j,k, NE_COMP) - - if (e_orig .lt. 0.d0) then - print *,'negative e entering strang integration ',z, i,j,k, rho/mean_rhob, e_orig - call bl_abort('bad e in strang') - end if - - i_vode = i - j_vode = j - k_vode = k - - call vode_wrapper(half_dt,rho,T_orig,ne_orig,e_orig, & - T_out ,ne_out ,e_out) - - if (e_out .lt. 0.d0) then - print *,'negative e exiting strang integration ',z, i,j,k, rho/mean_rhob, e_out - T_out = 10.0 - ne_out = 0.0 - mu = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out) - e_out = T_out / (gamma_minus_1 * mp_over_kB * mu) - call flush(6) -! call bl_abort('bad e out of strang') - end if - - ! Update (rho e) and (rho E) - state(i,j,k,UEINT) = state(i,j,k,UEINT) + rho * (e_out-e_orig) - state(i,j,k,UEDEN) = state(i,j,k,UEDEN) + rho * (e_out-e_orig) - - ! Update T and ne (do not use stuff computed in f_rhs, per vode manual) - call nyx_eos_T_given_Re(T_out, ne_out, rho, e_out, a) - diag_eos(i,j,k,TEMP_COMP) = T_out - diag_eos(i,j,k, NE_COMP) = ne_out - - end do ! i - end do ! j - end do ! k - -end subroutine integrate_state_vode - -subroutine vode_wrapper(dt, rho_in, T_in, ne_in, e_in, T_out, ne_out, e_out) - - use amrex_fort_module, only : rt => amrex_real - use vode_aux_module, only: rho_vode, T_vode, ne_vode, & - i_vode, j_vode, k_vode - - implicit none - - real(rt), intent(in ) :: dt - real(rt), intent(in ) :: rho_in, T_in, ne_in, e_in - real(rt), intent( out) :: T_out,ne_out,e_out - - ! Set the number of independent variables -- this should be just "e" - integer, parameter :: NEQ = 1 - - ! Allocate storage for the input state - real(rt) :: y(NEQ) - - ! Our problem is stiff, tell ODEPACK that. 21 means stiff, jacobian - ! function is supplied, 22 means stiff, figure out my jacobian through - ! 
differencing - integer, parameter :: MF_ANALYTIC_JAC = 21, MF_NUMERICAL_JAC = 22 - - ! Tolerance parameters: - ! - ! itol specifies whether to use an single absolute tolerance for - ! all variables (1), or to pass an array of absolute tolerances, one - ! for each variable with a scalar relative tol (2), a scalar absolute - ! and array of relative tolerances (3), or arrays for both (4) - ! - ! The error is determined as e(i) = rtol*abs(y(i)) + atol, and must - ! be > 0. - ! - ! We will use arrays for both the absolute and relative tolerances, - ! since we want to be easier on the temperature than the species - - integer, parameter :: ITOL = 1 - real(rt) :: atol(NEQ), rtol(NEQ) - - ! We want to do a normal computation, and get the output values of y(t) - ! after stepping though dt - integer, PARAMETER :: ITASK = 1 - - ! istate determines the state of the calculation. A value of 1 meeans - ! this is the first call to the problem -- this is what we will want. - ! Note, istate is changed over the course of the calculation, so it - ! cannot be a parameter - integer :: istate - - ! we will override the maximum number of steps, so turn on the - ! optional arguments flag - integer, parameter :: IOPT = 1 - - ! declare a real work array of size 22 + 9*NEQ + 2*NEQ**2 and an - ! integer work array of since 30 + NEQ - - integer, parameter :: LRW = 22 + 9*NEQ + 2*NEQ**2 - real(rt) :: rwork(LRW) - real(rt) :: time - ! real(rt) :: dt4 - - integer, parameter :: LIW = 30 + NEQ - integer, dimension(LIW) :: iwork - - real(rt) :: rpar - integer :: ipar - - EXTERNAL jac, f_rhs - - logical, save :: firstCall = .true. - - T_vode = T_in - ne_vode = ne_in - rho_vode = rho_in - - ! We want VODE to re-initialize each time we call it - istate = 1 - - rwork(:) = 0.d0 - iwork(:) = 0 - - ! Set the maximum number of steps allowed (the VODE default is 500) - iwork(6) = 2000 - - ! Initialize the integration time - time = 0.d0 - - ! We will integrate "e" in time. - y(1) = e_in - - ! 
Set the tolerances. - atol(1) = 1.d-4 * e_in - rtol(1) = 1.d-4 - - ! call the integration routine - call dvode(f_rhs, NEQ, y, time, dt, ITOL, rtol, atol, ITASK, & - istate, IOPT, rwork, LRW, iwork, LIW, jac, MF_NUMERICAL_JAC, & - rpar, ipar) - - e_out = y(1) - T_out = T_vode - ne_out = ne_vode - - if (istate < 0) then - print *, 'istate = ', istate, 'at (i,j,k) ',i_vode,j_vode,k_vode - call bl_error("ERROR in vode_wrapper: integration failed") - endif - -! print *,'Calling vode with 1/4 the time step' -! dt4 = 0.25d0 * dt -! y(1) = e_in - -! do n = 1,4 -! call dvode(f_rhs, NEQ, y, time, dt4, ITOL, rtol, atol, ITASK, & -! istate, IOPT, rwork, LRW, iwork, LIW, jac, MF_NUMERICAL_JAC, & -! rpar, ipar) -! if (istate < 0) then -! print *, 'doing subiteration ',n -! print *, 'istate = ', istate, 'at (i,j,k) ',i,j,k -! call bl_error("ERROR in vode_wrapper: sub-integration failed") -! end if - -! end do -! endif - -end subroutine vode_wrapper diff --git a/Exec/LyA_AGN/64sssss_20mpc.nyx b/Exec/LyA_AGN/64sssss_20mpc.nyx deleted file mode 100644 index 019faac7..00000000 Binary files a/Exec/LyA_AGN/64sssss_20mpc.nyx and /dev/null differ diff --git a/Exec/LyA_AGN/64sssss_20mpc.nyx b/Exec/LyA_AGN/64sssss_20mpc.nyx new file mode 120000 index 00000000..3c7b0271 --- /dev/null +++ b/Exec/LyA_AGN/64sssss_20mpc.nyx @@ -0,0 +1 @@ +../LyA/64sssss_20mpc.nyx \ No newline at end of file diff --git a/Exec/LyA_AGN/GNUmakefile b/Exec/LyA_AGN/GNUmakefile index 763081fe..02919213 100644 --- a/Exec/LyA_AGN/GNUmakefile +++ b/Exec/LyA_AGN/GNUmakefile @@ -1,55 +1,36 @@ -# AMREX_HOME defines the directory in which we will find all the BoxLib code +# AMREX_HOME defines the directory in which we will find all the AMReX code AMREX_HOME ?= ../../../amrex -#AMREX_HOME = /home/vince/Development/BLMaster/amrex -HPGMG_DIR ?= ../../../hpgmg/finite-volume +HPGMG_DIR ?= ../../Util/hpgmg/finite-volume +CVODE_LIB_DIR ?= ../../../sundials/sundials-intel/lib # TOP defines the directory in which we will find Source, 
Exec, etc TOP = ../.. # compilation options -COMP = gnu +COMP = intel # gnu USE_MPI = TRUE -USE_OMP = FALSE +USE_OMP = TRUE -PROFILE = FALSE +PROFILE = TRUE TRACE_PROFILE = FALSE COMM_PROFILE = FALSE PRECISION = DOUBLE +USE_SINGLE_PRECISION_PARTICLES = TRUE DEBUG = FALSE GIMLET = FALSE REEBER = FALSE -GIMLET_DIR ?= /home/vince/Development/gimlet -# Gimlet needs FFTW MPI. -FFTW_INC ?= /usr/include -FFTW_DIR ?= /usr/lib/x86_64-linux-gnu - -REEBER_HOME ?= /project/projectdirs/nyx/ghweber/reeber2 -# Reeber needs Boost (both headers and libraries) and diy2. -BOOST_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/boost-1.61.0-noarch/include -DIY_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/diy/include - -USE_HPGMG = FALSE -HPGMG_FCYCLES = FALSE -HPGMG_POST_F_CYCLE_TYPE = V -HPGMG_HELMHOLTZ = FALSE -HPGMG_STENCIL_VARIABLE_COEFFICIENT = FALSE -HPGMG_USE_SUBCOMM = TRUE -HPGMG_BOTTOM_SOLVER= BICGSTAB -HPGMG_SMOOTHER = GSRB +USE_HPGMG = TRUE # physics DIM = 3 USE_GRAV = TRUE USE_HEATCOOL = TRUE - USE_AGN = TRUE - -# units -#USE_CGS = TRUE +USE_CVODE = FALSE Bpack := ./Make.package Blocs := . diff --git a/Exec/LyA_AGN/Prob_3d.f90 b/Exec/LyA_AGN/Prob_3d.f90 index 801fe970..b9e3f078 100644 --- a/Exec/LyA_AGN/Prob_3d.f90 +++ b/Exec/LyA_AGN/Prob_3d.f90 @@ -68,10 +68,11 @@ subroutine fort_initdata(level,time,lo,hi, & delta,xlo,xhi) & bind(C, name="fort_initdata") use amrex_fort_module, only : rt => amrex_real + use amrex_parmparse_module use probdata_module use atomic_rates_module, only : XHYDROGEN use meth_params_module, only : URHO, UMX, UMZ, UEDEN, UEINT, UFS, & - small_dens, TEMP_COMP, NE_COMP + small_dens, TEMP_COMP, NE_COMP, ZHI_COMP implicit none @@ -84,6 +85,13 @@ subroutine fort_initdata(level,time,lo,hi, & real(rt) diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,nd) integer i,j,k + real(rt) z_in + + type(amrex_parmparse) :: pp + + call amrex_parmparse_build(pp, "nyx") + call pp%query("initial_z", z_in) + call amrex_parmparse_destroy(pp) ! 
This is the case where we have compiled with states defined ! but they have only one component each so we fill them this way. @@ -93,7 +101,7 @@ subroutine fort_initdata(level,time,lo,hi, & diag_eos(:,:,:,1) = 0.0d0 ! This is the regular case with NO_HYDRO = FALSE - else if (ns.gt.1 .and. nd.eq.2) then + else if (ns.gt.1 .and. nd.ge.2) then do k = lo(3), hi(3) do j = lo(2), hi(2) @@ -111,8 +119,13 @@ subroutine fort_initdata(level,time,lo,hi, & state(i,j,k,UFS+1) = (1.d0 - XHYDROGEN) end if - diag_eos(i,j,k,TEMP_COMP) = 1000.d0 - diag_eos(i,j,k, NE_COMP) = 0.d0 + diag_eos(i,j,k,TEMP_COMP) = 0.021d0*(1.0d0 + z_in)**2 + diag_eos(i,j,k, NE_COMP) = 0.d0 + + if (ZHI_COMP .gt. -1) then + diag_eos(i,j,k, ZHI_COMP) = 7.5d0 + endif + enddo enddo enddo diff --git a/Exec/LyA_AGN/inputs b/Exec/LyA_AGN/inputs index 9d018d69..4f95a3b0 100644 --- a/Exec/LyA_AGN/inputs +++ b/Exec/LyA_AGN/inputs @@ -1,7 +1,5 @@ # ------------------ INPUTS TO MAIN PROGRAM ------------------- -max_step = 2 - -#amr.restart = chk00001 +max_step = 10000000 #Number of time steps between calls to halo finder reeber.halo_int = 1 @@ -11,22 +9,14 @@ nyx.ppm_reference = 1 nyx.use_colglaz = 0 nyx.corner_coupling = 1 +nyx.strang_split = 1 nyx.add_ext_src = 1 nyx.heat_cool_type = 3 -nyx.strang_split = 1 - -gravity.show_timings = 1 -nyx.show_timings = 1 +#nyx.simd_width = 8 -#This is 1e-8 times the lowest density in plt00000 nyx.small_dens = 1.e-2 - -#This is 1e-5 times the constant temparature in plt00000 nyx.small_temp = 1.e-2 -#This is 1e-8 times the lowest pressure in plt00000 -nyx.small_pres = 1.0e-4 - nyx.do_santa_barbara = 1 nyx.init_sb_vels = 1 gravity.ml_tol = 1.e-10 @@ -48,7 +38,8 @@ amr.data_log = runlog gravity.gravity_type = PoissonGrav gravity.no_sync = 1 gravity.no_composite = 1 -gravity.solve_with_cpp = 1 +gravity.solve_with_cpp = 0 +gravity.solve_with_hpgmg = 1 mg.bottom_solver = 4 @@ -61,10 +52,11 @@ geometry.prob_lo = 0 0 0 #Domain size in Mpc geometry.prob_hi = 28.49002849 28.49002849 
28.49002849 -amr.n_cell = 256 256 256 amr.n_cell = 64 64 64 amr.max_grid_size = 32 #fabarray.mfiter_tile_size = 128 8 8 +fabarray.mfiter_tile_size = 1024000 8 8 + # >>>>>>>>>>>>> BC FLAGS <<<<<<<<<<<<<<<< # 0 = Interior 3 = Symmetry # 1 = Inflow 4 = SlipWall @@ -77,11 +69,23 @@ nyx.hi_bc = 0 0 0 nyx.do_hydro = 1 nyx.do_grav = 1 -# COMOVING +# COSMOLOGY nyx.comoving_OmM = 0.275 nyx.comoving_OmB = 0.046 nyx.comoving_h = 0.702d0 +# UVB and reionization +nyx.inhomo_reion = 0 +nyx.inhomo_zhi_file = "zhi.bin" +nyx.inhomo_grid = 512 +nyx.uvb_rates_file = "TREECOOL_middle" +nyx.uvb_density_A = 1.0 +nyx.uvb_density_B = 0.0 +nyx.reionization_zHI_flash = -1.0 +nyx.reionization_zHeII_flash = -1.0 +nyx.reionization_T_zHI = 2.0e4 +nyx.reionization_T_zHeII = 1.5e4 + # PARTICLES nyx.do_dm_particles = 1 @@ -98,7 +102,7 @@ particles.nparts_per_read = 2097152 nyx.particle_move_type = Gravitational # TIME STEP CONTROL -nyx.relative_max_change_a = 0.02 # max change in scale factor +nyx.relative_max_change_a = 0.01 # max change in scale factor particles.cfl = 0.5 # 'cfl' for particles nyx.cfl = 0.5 # cfl number for hyperbolic system nyx.init_shrink = 1.0 # scale back initial timestep @@ -125,19 +129,18 @@ amr.regrid_on_restart = 1 #amr.nosub = 1 # CHECKPOINT FILES -amr.checkpoint_files_output = 1 # no output +amr.checkpoint_files_output = 1 amr.check_file = chk amr.check_int = 100 amr.checkpoint_nfiles = 64 # PLOTFILES -#fab.format = IEEE32 fab.format = NATIVE_32 -#amr.plot_files_output = 0 +amr.plot_files_output = 1 amr.plot_file = plt amr.plot_int = -1 amr.plot_nfiles = 64 -nyx.plot_z_values = 5.4 5.0 4.6 4.2 +nyx.plot_z_values = 7.0 6.0 5.0 4.0 3.0 2.0 particles.write_in_plotfile = 1 amr.plot_vars = density xmom ymom zmom rho_e Temp phi_grav diff --git a/Exec/LyA_AGN/inputs.rt b/Exec/LyA_AGN/inputs.rt index 54aefe32..72a8b21e 100644 --- a/Exec/LyA_AGN/inputs.rt +++ b/Exec/LyA_AGN/inputs.rt @@ -7,18 +7,12 @@ nyx.add_ext_src = 1 nyx.heat_cool_type = 3 nyx.strang_split = 1 
-gravity.show_timings = 0 -nyx.show_timings = 0 - #This is 1e-8 times the lowest density in plt00000 nyx.small_dens = 5.162470e1 #This is 1e-5 times the constant temparature in plt00000 nyx.small_temp = 1.e-2 -#This is 1e-8 times the lowest pressure in plt00000 -nyx.small_pres = 3.487507e2 - nyx.do_santa_barbara = 1 nyx.init_sb_vels = 1 gravity.sl_tol = 1.e-12 diff --git a/Exec/LyA_AGN/integrate_state_vode_3d.f90 b/Exec/LyA_AGN/integrate_state_vode_3d.f90 deleted file mode 100644 index 2c124b9e..00000000 --- a/Exec/LyA_AGN/integrate_state_vode_3d.f90 +++ /dev/null @@ -1,243 +0,0 @@ -subroutine integrate_state_vode(lo, hi, & - state , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, & - diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, & - a, half_dt, min_iter, max_iter) -! -! Calculates the sources to be added later on. -! -! Parameters -! ---------- -! lo : double array (3) -! The low corner of the current box. -! hi : double array (3) -! The high corner of the current box. -! state_* : double arrays -! The state vars -! diag_eos_* : double arrays -! Temp and Ne -! src_* : doubles arrays -! The source terms to be added to state (iterative approx.) -! double array (3) -! The low corner of the entire domain -! a : double -! The current a -! half_dt : double -! time step size, in Mpc km^-1 s ~ 10^12 yr. -! -! Returns -! ------- -! state : double array (dims) @todo -! The state vars -! 
- use amrex_fort_module, only : rt => amrex_real - use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, & - TEMP_COMP, NE_COMP, gamma_minus_1 - use bl_constants_module, only: M_PI - use eos_params_module - use network - use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT - use fundamental_constants_module - use comoving_module, only: comoving_h, comoving_OmB - use atomic_rates_module, only: tabulate_rates, interp_to_this_z, YHELIUM - use vode_aux_module , only: z_vode, i_vode, j_vode, k_vode - - implicit none - - integer , intent(in) :: lo(3), hi(3) - integer , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3 - integer , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3 - real(rt), intent(inout) :: state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR) - real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2) - real(rt), intent(in) :: a, half_dt - integer , intent(inout) :: max_iter, min_iter - - integer :: i, j, k - real(rt) :: z, rho - real(rt) :: T_orig, ne_orig, e_orig - real(rt) :: T_out , ne_out , e_out, mu, mean_rhob - - z = 1.d0/a - 1.d0 - - z_vode = z - mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst) - - ! Interpolate from the table to this redshift - call interp_to_this_z(z) - - ! Note that (lo,hi) define the region of the box containing the grow cells - ! Do *not* assume this is just the valid region - ! apply heating-cooling to UEDEN and UEINT - - do k = lo(3),hi(3) - do j = lo(2),hi(2) - do i = lo(1),hi(1) - - ! Original values - rho = state(i,j,k,URHO) - e_orig = state(i,j,k,UEINT) / rho - T_orig = diag_eos(i,j,k,TEMP_COMP) - ne_orig = diag_eos(i,j,k, NE_COMP) - - if (e_orig .lt. 0.d0) then - print *,'negative e entering strang integration ',z, i,j,k, rho/mean_rhob, e_orig - call bl_abort('bad e in strang') - end if - - i_vode = i - j_vode = j - k_vode = k - - call vode_wrapper(half_dt,rho,T_orig,ne_orig,e_orig, & - T_out ,ne_out ,e_out) - - if (e_out .lt. 
0.d0) then - print *,'negative e exiting strang integration ',z, i,j,k, rho/mean_rhob, e_out - T_out = 10.0 - ne_out = 0.0 - mu = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out) - e_out = T_out / (gamma_minus_1 * mp_over_kB * mu) - call flush(6) -! call bl_abort('bad e out of strang') - end if - - ! Update (rho e) and (rho E) - state(i,j,k,UEINT) = state(i,j,k,UEINT) + rho * (e_out-e_orig) - state(i,j,k,UEDEN) = state(i,j,k,UEDEN) + rho * (e_out-e_orig) - - ! Update T and ne (do not use stuff computed in f_rhs, per vode manual) - call nyx_eos_T_given_Re(T_out, ne_out, rho, e_out, a) - diag_eos(i,j,k,TEMP_COMP) = T_out - diag_eos(i,j,k, NE_COMP) = ne_out - - end do ! i - end do ! j - end do ! k - -end subroutine integrate_state_vode - -subroutine vode_wrapper(dt, rho_in, T_in, ne_in, e_in, T_out, ne_out, e_out) - - use vode_aux_module, only: rho_vode, T_vode, ne_vode, & - i_vode, j_vode, k_vode - - use amrex_fort_module, only : rt => amrex_real - implicit none - - real(rt), intent(in ) :: dt - real(rt), intent(in ) :: rho_in, T_in, ne_in, e_in - real(rt), intent( out) :: T_out,ne_out,e_out - - ! Set the number of independent variables -- this should be just "e" - integer, parameter :: NEQ = 1 - - ! Allocate storage for the input state - real(rt) :: y(NEQ) - - ! Our problem is stiff, tell ODEPACK that. 21 means stiff, jacobian - ! function is supplied, 22 means stiff, figure out my jacobian through - ! differencing - integer, parameter :: MF_ANALYTIC_JAC = 21, MF_NUMERICAL_JAC = 22 - - ! Tolerance parameters: - ! - ! itol specifies whether to use an single absolute tolerance for - ! all variables (1), or to pass an array of absolute tolerances, one - ! for each variable with a scalar relative tol (2), a scalar absolute - ! and array of relative tolerances (3), or arrays for both (4) - ! - ! The error is determined as e(i) = rtol*abs(y(i)) + atol, and must - ! be > 0. - ! - ! We will use arrays for both the absolute and relative tolerances, - ! 
since we want to be easier on the temperature than the species - - integer, parameter :: ITOL = 1 - real(rt) :: atol(NEQ), rtol(NEQ) - - ! We want to do a normal computation, and get the output values of y(t) - ! after stepping though dt - integer, PARAMETER :: ITASK = 1 - - ! istate determines the state of the calculation. A value of 1 meeans - ! this is the first call to the problem -- this is what we will want. - ! Note, istate is changed over the course of the calculation, so it - ! cannot be a parameter - integer :: istate - - ! we will override the maximum number of steps, so turn on the - ! optional arguments flag - integer, parameter :: IOPT = 1 - - ! declare a real work array of size 22 + 9*NEQ + 2*NEQ**2 and an - ! integer work array of since 30 + NEQ - - integer, parameter :: LRW = 22 + 9*NEQ + 2*NEQ**2 - real(rt) :: rwork(LRW) - real(rt) :: time - ! real(rt) :: dt4 - - integer, parameter :: LIW = 30 + NEQ - integer, dimension(LIW) :: iwork - - real(rt) :: rpar - integer :: ipar - - EXTERNAL jac, f_rhs - - logical, save :: firstCall = .true. - - T_vode = T_in - ne_vode = ne_in - rho_vode = rho_in - - ! We want VODE to re-initialize each time we call it - istate = 1 - - rwork(:) = 0.d0 - iwork(:) = 0 - - ! Set the maximum number of steps allowed (the VODE default is 500) - iwork(6) = 2000 - - ! Initialize the integration time - time = 0.d0 - - ! We will integrate "e" in time. - y(1) = e_in - - ! Set the tolerances. - atol(1) = 1.d-4 * e_in - rtol(1) = 1.d-4 - - ! call the integration routine - call dvode(f_rhs, NEQ, y, time, dt, ITOL, rtol, atol, ITASK, & - istate, IOPT, rwork, LRW, iwork, LIW, jac, MF_NUMERICAL_JAC, & - rpar, ipar) - - e_out = y(1) - T_out = T_vode - ne_out = ne_vode - - if (istate < 0) then - print *, 'istate = ', istate, 'at (i,j,k) ',i_vode,j_vode,k_vode - call bl_error("ERROR in vode_wrapper: integration failed") - endif - -! print *,'Calling vode with 1/4 the time step' -! dt4 = 0.25d0 * dt -! y(1) = e_in - -! do n = 1,4 -! 
call dvode(f_rhs, NEQ, y, time, dt4, ITOL, rtol, atol, ITASK, & -! istate, IOPT, rwork, LRW, iwork, LIW, jac, MF_NUMERICAL_JAC, & -! rpar, ipar) -! if (istate < 0) then -! print *, 'doing subiteration ',n -! print *, 'istate = ', istate, 'at (i,j,k) ',i,j,k -! call bl_error("ERROR in vode_wrapper: sub-integration failed") -! end if - -! end do -! endif - -end subroutine vode_wrapper diff --git a/Exec/Make.Nyx b/Exec/Make.Nyx index 4308625e..1e2b11d3 100644 --- a/Exec/Make.Nyx +++ b/Exec/Make.Nyx @@ -39,6 +39,16 @@ endif DEFINES += -DBL_NOLINEVALUES +GIMLET_DIR ?= /home/vince/Development/gimlet +# Gimlet needs FFTW MPI. +FFTW_INC ?= /usr/include +FFTW_DIR ?= /usr/lib/x86_64-linux-gnu + +REEBER_HOME ?= /project/projectdirs/nyx/ghweber/reeber2 +# Reeber needs Boost (both headers and libraries) and diy2. +BOOST_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/boost-1.61.0-noarch/include +DIY_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/diy/include + ifeq ($(REEBER), TRUE) DEFINES += -DREEBER DEFINES += -DREEBER_USE_BOXLIB_READER -DREEBER_IN_SITU @@ -133,6 +143,16 @@ ifeq ($(USE_MG), TRUE) VPATH_LOCATIONS += $(AMREX_HOME)/Src/LinearSolvers/F_MG endif +ifeq ($(USE_HPGMG), TRUE) + HPGMG_FCYCLES = TRUE + HPGMG_POST_F_CYCLE_TYPE = V + HPGMG_HELMHOLTZ = FALSE + HPGMG_STENCIL_VARIABLE_COEFFICIENT = FALSE + HPGMG_USE_SUBCOMM = TRUE + HPGMG_BOTTOM_SOLVER= CG + HPGMG_SMOOTHER = GSRB +endif + include $(AMREX_HOME)/Src/F_BaseLib/FParallelMG.mak INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/F_BaseLib VPATH_LOCATIONS += $(AMREX_HOME)/Src/F_BaseLib diff --git a/Exec/MiniSB/GNUmakefile b/Exec/MiniSB/GNUmakefile index 41fac4a2..488bdf5a 100644 --- a/Exec/MiniSB/GNUmakefile +++ b/Exec/MiniSB/GNUmakefile @@ -1,34 +1,21 @@ # AMREX_HOME defines the directory in which we will find all the BoxLib code -AMREX_HOME ?= /project/projectdirs/nyx/src/amrex +AMREX_HOME ?= ../../../amrex -HPGMG_DIR ?= $(HOME)/hpgmg/finite-volume +HPGMG_DIR ?= ../../Util/hpgmg/finite-volume # TOP defines the directory in 
which we will find Source, Exec, etc TOP = ../.. -# Reeber (Edison) -BOOST_DIR ?= /project/projectdirs/nyx/ghweber/boost-1.58.0-edison-gcc-4.9.2 -DIY2_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/diy2/include -REEBER_HOME ?= /project/projectdirs/nyx/ghweber/reeber2 - # compilation options -COMP = gcc +COMP = intel # gnu USE_MPI = TRUE USE_OMP = FALSE # Analysis REEBER = FALSE -#DEFINES += -DREEBER_PERSISTENT_INTEGRAL_TRACE_VTCS USE_HPGMG = FALSE -HPGMG_FCYCLES = TRUE -HPGMG_POST_F_CYCLE_TYPE = V -HPGMG_HELMHOLTZ = FALSE -HPGMG_STENCIL_VARIABLE_COEFFICIENT = FALSE -HPGMG_USE_SUBCOMM = TRUE -HPGMG_BOTTOM_SOLVER= BICGSTAB -HPGMG_SMOOTHER = GSRB PRECISION = DOUBLE DEBUG = FALSE diff --git a/Exec/MiniSB/inputs.32 b/Exec/MiniSB/inputs.32 index 78d94936..88ca697b 100644 --- a/Exec/MiniSB/inputs.32 +++ b/Exec/MiniSB/inputs.32 @@ -13,10 +13,6 @@ amr.plot_files_output = 1 nyx.print_fortran_warnings = 0 -// Show timings in different routines -nyx.show_timings = 0 -gravity.show_timings = 0 - nyx.ppm_type = 0 nyx.use_colglaz = 1 diff --git a/Exec/MiniSB/inputs.32.plot_z b/Exec/MiniSB/inputs.32.plot_z index 03f81cff..6801a552 100644 --- a/Exec/MiniSB/inputs.32.plot_z +++ b/Exec/MiniSB/inputs.32.plot_z @@ -17,10 +17,6 @@ amr.plot_files_output = 1 nyx.print_fortran_warnings = 0 -// Show timings in different routines -nyx.show_timings = 0 -gravity.show_timings = 0 - nyx.ppm_type = 0 nyx.use_colglaz = 1 diff --git a/Exec/MiniSB/inputs.32.ref b/Exec/MiniSB/inputs.32.ref index 13a373fa..23cebc8c 100644 --- a/Exec/MiniSB/inputs.32.ref +++ b/Exec/MiniSB/inputs.32.ref @@ -15,10 +15,6 @@ amr.plot_files_output = 1 nyx.print_fortran_warnings = 0 -// Show timings in different routines -nyx.show_timings = 0 -gravity.show_timings = 0 - nyx.ppm_type = 0 nyx.use_colglaz = 1 diff --git a/Exec/MiniSB/inputs.analysis b/Exec/MiniSB/inputs.analysis index a8f26a9f..8b0bc9f8 100644 --- a/Exec/MiniSB/inputs.analysis +++ b/Exec/MiniSB/inputs.analysis @@ -7,10 +7,6 @@ amr.refine_grid_layout = 0 
amr.checkpoint_files_output = 1 amr.plot_files_output = 1 -// Show timings in different routines -nyx.show_timings = 1 -gravity.show_timings = 1 - nyx.ppm_type = 0 nyx.use_colglaz = 1 diff --git a/Exec/RegressionTest/GNUmakefile b/Exec/RegressionTest/GNUmakefile index 59f62960..9a35b23f 100644 --- a/Exec/RegressionTest/GNUmakefile +++ b/Exec/RegressionTest/GNUmakefile @@ -1,5 +1,5 @@ # AMREX_HOME defines the directory in which we will find all the BoxLib code -AMREX_HOME ?= /project/projectdirs/nyx/src/amrex +AMREX_HOME ?= ../../../amrex # TOP defines the directory in which we will find Source, Exec, etc TOP = ../.. diff --git a/Exec/SantaBarbara/GNUmakefile b/Exec/SantaBarbara/GNUmakefile index 04466511..19dd050e 100644 --- a/Exec/SantaBarbara/GNUmakefile +++ b/Exec/SantaBarbara/GNUmakefile @@ -1,39 +1,21 @@ # AMREX_HOME defines the directory in which we will find all the BoxLib code -AMREX_HOME ?= /project/projectdirs/nyx/src/amrex +AMREX_HOME ?= ../../../amrex -HPGMG_DIR ?= /global/homes/f/friesen/hpgmg/finite-volume +HPGMG_DIR ?= ../../Util/hpgmg/finite-volume # TOP defines the directory in which we will find Source, Exec, etc TOP = ../.. -# Reeber -BOOST_DIR ?= /opt/local -REEBER_HOME ?= $(HOME)/devel/Reeber/dev - # compilation options -COMP = gcc +COMP = intel # gnu USE_OMP = FALSE USE_MPI = TRUE PRECISION = DOUBLE +USE_SINGLE_PRECISION_PARTICLES = FALSE DEBUG = FALSE -DEBUG = TRUE USE_HPGMG = FALSE -HPGMG_FCYCLES = TRUE -HPGMG_POST_F_CYCLE_TYPE = V -HPGMG_HELMHOLTZ = FALSE -HPGMG_STENCIL_VARIABLE_COEFFICIENT = FALSE -HPGMG_USE_SUBCOMM = TRUE -HPGMG_BOTTOM_SOLVER= BICGSTAB -HPGMG_SMOOTHER = GSRB - -# Use single precision for particles? -# -# If you set this to be TRUE be sure to do a "make realclean" -# to make sure all your code is consistent. 
-# -USE_SINGLE_PRECISION_PARTICLES = FALSE # physics DIM = 3 diff --git a/Exec/SantaBarbara/inputs b/Exec/SantaBarbara/inputs index a27330ae..db74e055 100644 --- a/Exec/SantaBarbara/inputs +++ b/Exec/SantaBarbara/inputs @@ -4,8 +4,6 @@ max_step = 10000000 nyx.ppm_type = 0 nyx.use_colglaz = 1 nyx.add_ext_src = 0 -nyx.show_timings = 1 -gravity.show_timings = 1 #This is 1e-8 times the lowest density in plt00000 nyx.small_dens = 5.162470e1 @@ -13,9 +11,6 @@ nyx.small_dens = 5.162470e1 #This is 1e-5 times the constant temparature in plt00000 nyx.small_temp = 1.e-2 -#This is 1e-8 times the lowest pressure in plt00000 -nyx.small_pres = 3.487507e2 - nyx.do_santa_barbara = 1 nyx.init_sb_vels = 1 gravity.sl_tol = 1.e-12 diff --git a/Exec/Scaling/GNUmakefile b/Exec/Scaling/GNUmakefile index 98595f9c..79e64ebf 100644 --- a/Exec/Scaling/GNUmakefile +++ b/Exec/Scaling/GNUmakefile @@ -1,52 +1,39 @@ # AMREX_HOME defines the directory in which we will find all the AMReX code -AMREX_HOME ?= /global/homes/a/almgren/GitCode/amrex +AMREX_HOME ?= ../../../amrex -HPGMG_DIR ?= ../../../hpgmg/finite-volume +HPGMG_DIR ?= ../../Util/hpgmg/finite-volume +CVODE_LIB_DIR ?= ../../../sundials/sundials-intel/lib # TOP defines the directory in which we will find Source, Exec, etc TOP = ../.. # compilation options -COMP = gcc +COMP = intel # gnu USE_MPI = TRUE USE_OMP = TRUE -PROFILE = FALSE +PROFILE = TRUE TRACE_PROFILE = FALSE COMM_PROFILE = FALSE PRECISION = DOUBLE +USE_SINGLE_PRECISION_PARTICLES = FALSE DEBUG = FALSE GIMLET = FALSE REEBER = FALSE -GIMLET_DIR ?= /home/vince/Development/gimlet -# Gimlet needs FFTW MPI. -FFTW_INC ?= /usr/include -FFTW_DIR ?= /usr/lib/x86_64-linux-gnu - -REEBER_HOME ?= $(HOME)/devel/Reeber/dev -# Reeber needs Boost (both headers and libraries) and diy2. 
-BOOST_INCLUDE_DIR ?= /usr/local/include -BOOST_LIB_DIR ?= /usr/local/lib -DIY2_INCLUDE_DIR ?= /usr/local/include - -#USE_HPGMG = TRUE -HPGMG_FCYCLES = FALSE -HPGMG_POST_F_CYCLE_TYPE = V -HPGMG_HELMHOLTZ = FALSE -HPGMG_STENCIL_VARIABLE_COEFFICIENT = FALSE -HPGMG_USE_SUBCOMM = TRUE -HPGMG_BOTTOM_SOLVER= CG -HPGMG_SMOOTHER = GSRB +USE_HPGMG = TRUE # physics DIM = 3 USE_GRAV = TRUE USE_HEATCOOL = TRUE +USE_AGN = FALSE +USE_CVODE = FALSE Bpack := ./Make.package Blocs := . include $(TOP)/Exec/Make.Nyx + diff --git a/Exec/Scaling/Prob_3d.f90 b/Exec/Scaling/Prob_3d.f90 index f1727bea..2e82541f 100644 --- a/Exec/Scaling/Prob_3d.f90 +++ b/Exec/Scaling/Prob_3d.f90 @@ -65,10 +65,11 @@ subroutine fort_initdata(level,time,lo,hi, & bind(C, name="fort_initdata") use amrex_fort_module, only : rt => amrex_real + use amrex_parmparse_module use probdata_module use atomic_rates_module, only : XHYDROGEN use meth_params_module, only : URHO, UMX, UMZ, UEDEN, UEINT, UFS, & - small_dens, TEMP_COMP, NE_COMP + small_dens, TEMP_COMP, NE_COMP, ZHI_COMP implicit none @@ -81,6 +82,13 @@ subroutine fort_initdata(level,time,lo,hi, & real(rt) diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,nd) integer i,j,k + real(rt) z_in + + type(amrex_parmparse) :: pp + + call amrex_parmparse_build(pp, "nyx") + call pp%query("initial_z", z_in) + call amrex_parmparse_destroy(pp) ! This is the case where we have compiled with states defined ! but they have only one component each so we fill them this way. @@ -90,7 +98,7 @@ subroutine fort_initdata(level,time,lo,hi, & diag_eos(:,:,:,1) = 0.0d0 ! This is the regular case with NO_HYDRO = FALSE - else if (ns.gt.1 .and. nd.eq.2) then + else if (ns.gt.1 .and. 
nd.ge.2) then do k = lo(3), hi(3) do j = lo(2), hi(2) @@ -108,8 +116,13 @@ subroutine fort_initdata(level,time,lo,hi, & state(i,j,k,UFS+1) = (1.d0 - XHYDROGEN) end if - diag_eos(i,j,k,TEMP_COMP) = 1000.d0 - diag_eos(i,j,k, NE_COMP) = 0.d0 + diag_eos(i,j,k,TEMP_COMP) = 0.021d0*(1.0d0 + z_in)**2 + diag_eos(i,j,k, NE_COMP) = 0.d0 + + if (ZHI_COMP .gt. -1) then + diag_eos(i,j,k, ZHI_COMP) = 7.5d0 + endif + enddo enddo enddo diff --git a/Exec/Scaling/inputs b/Exec/Scaling/inputs index 2c6b1d5c..48808055 100644 --- a/Exec/Scaling/inputs +++ b/Exec/Scaling/inputs @@ -6,29 +6,21 @@ nyx.ppm_reference = 1 nyx.use_colglaz = 0 nyx.corner_coupling = 1 +nyx.strang_split = 1 nyx.add_ext_src = 1 nyx.heat_cool_type = 3 -nyx.strang_split = 1 - -gravity.show_timings = 1 -nyx.show_timings = 1 +#nyx.simd_width = 8 -#This is 1e-8 times the lowest density in plt00000 nyx.small_dens = 1.e-2 - -#This is 1e-5 times the constant temparature in plt00000 nyx.small_temp = 1.e-2 -#This is 1e-8 times the lowest pressure in plt00000 -nyx.small_pres = 1.0e-4 - nyx.do_santa_barbara = 1 nyx.init_sb_vels = 1 gravity.ml_tol = 1.e-10 gravity.sl_tol = 1.e-10 nyx.initial_z = 159.0 -nyx.final_z = 4.2 +nyx.final_z = 2.0 #File written during the run: nstep | time | dt | redshift | a amr.data_log = runlog @@ -36,19 +28,18 @@ amr.data_log = runlog #This is how we restart from a checkpoint and write an ascii particle file #Leave this commented out in cvs version -#amr.restart = chk03500 +#amr.restart = chk00100 #max_step = 4 #particles.particle_output_file = particle_output gravity.gravity_type = PoissonGrav gravity.no_sync = 1 gravity.no_composite = 1 +gravity.solve_with_cpp = 0 +gravity.solve_with_hpgmg = 1 mg.bottom_solver = 4 -gravity.solve_with_cpp = 0 -gravity.solve_with_hpgmg = 0 - # PROBLEM SIZE & GEOMETRY geometry.is_periodic = 1 1 1 geometry.coord_sys = 0 @@ -58,11 +49,12 @@ geometry.prob_lo = 0 0 0 #Domain size in Mpc geometry.prob_hi = 28.49002849 28.49002849 28.49002849 -amr.n_cell = 64 64 64 
+amr.n_cell = 64 64 64 amr.max_grid_size = 32 +#fabarray.mfiter_tile_size = 128 8 8 +fabarray.mfiter_tile_size = 1024000 8 8 -nyx.particle_initrandom_mass = 1.01241529887243E5 - +nyx.particle_initrandom_mass = 3.317482451E9 # >>>>>>>>>>>>> BC FLAGS <<<<<<<<<<<<<<<< # 0 = Interior 3 = Symmetry @@ -76,16 +68,28 @@ nyx.hi_bc = 0 0 0 nyx.do_hydro = 1 nyx.do_grav = 1 -# COMOVING +# COSMOLOGY nyx.comoving_OmM = 0.275 nyx.comoving_OmB = 0.046 nyx.comoving_h = 0.702d0 +# UVB and reionization +nyx.inhomo_reion = 0 +nyx.inhomo_zhi_file = "zhi.bin" +nyx.inhomo_grid = 512 +nyx.uvb_rates_file = "TREECOOL_middle" +nyx.uvb_density_A = 1.0 +nyx.uvb_density_B = 0.0 +nyx.reionization_zHI_flash = -1.0 +nyx.reionization_zHeII_flash = -1.0 +nyx.reionization_T_zHI = 2.0e4 +nyx.reionization_T_zHeII = 1.5e4 + # PARTICLES nyx.do_dm_particles = 1 # >>>>>>>>>>>>> PARTICLE INIT OPTIONS <<<<<<<<<<<<<<<< -# "AsciiFile" "Random" "Cosmological" +# "AsciiFile" "Random" "Cosmological" # >>>>>>>>>>>>> PARTICLE INIT OPTIONS <<<<<<<<<<<<<<<< nyx.particle_init_type = RandomPerCell @@ -96,7 +100,7 @@ nyx.particle_move_type = Gravitational # TIME STEP CONTROL -nyx.relative_max_change_a = 0.02 # max change in scale factor +nyx.relative_max_change_a = 0.01 # max change in scale factor particles.cfl = 0.5 # 'cfl' for particles nyx.cfl = 0.5 # cfl number for hyperbolic system nyx.init_shrink = 1.0 # scale back initial timestep @@ -110,7 +114,7 @@ nyx.v = 1 # verbosity in Nyx.cpp gravity.v = 1 # verbosity in Gravity.cpp amr.v = 1 # verbosity in Amr.cpp mg.v = 1 # verbosity in Amr.cpp -particles.v = 1 # verbosity in Particle class +particles.v = 2 # verbosity in Particle class # REFINEMENT / REGRIDDING amr.max_level = 0 # maximum level number allowed @@ -123,20 +127,18 @@ amr.regrid_on_restart = 1 #amr.nosub = 1 # CHECKPOINT FILES -amr.checkpoint_files_output = 0 # no output +amr.checkpoint_files_output = 0 # no output amr.check_file = chk -amr.check_int = 200 -amr.checkpoint_nfiles = 128 +amr.check_int = 100 
+amr.checkpoint_nfiles = 64 # PLOTFILES -amr.plot_files_output = 0 -#fab.format = IEEE32 +amr.plot_files_output = 0 # no output fab.format = NATIVE_32 amr.plot_file = plt amr.plot_int = -1 -amr.plot_nfiles = 128 -nyx.plot_z_values = 5.4 5.0 4.6 4.2 -#nyx.plot_z_values = 6.0 5.4 5.0 4.6 4.2 4.0 3.6 3.2 3.0 2.6 2.4 2.2 2.0 +amr.plot_nfiles = 64 +nyx.plot_z_values = 7.0 6.0 5.0 4.0 3.0 2.0 particles.write_in_plotfile = 1 amr.plot_vars = density xmom ymom zmom rho_e Temp phi_grav diff --git a/Exec/Scaling/integrate_state_vode_3d.f90 b/Exec/Scaling/integrate_state_vode_3d.f90 deleted file mode 100644 index a866d3ac..00000000 --- a/Exec/Scaling/integrate_state_vode_3d.f90 +++ /dev/null @@ -1,243 +0,0 @@ -subroutine integrate_state_vode(lo, hi, & - state , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, & - diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, & - a, half_dt, min_iter, max_iter) -! -! Calculates the sources to be added later on. -! -! Parameters -! ---------- -! lo : double array (3) -! The low corner of the current box. -! hi : double array (3) -! The high corner of the current box. -! state_* : double arrays -! The state vars -! diag_eos_* : double arrays -! Temp and Ne -! src_* : doubles arrays -! The source terms to be added to state (iterative approx.) -! double array (3) -! The low corner of the entire domain -! a : double -! The current a -! half_dt : double -! time step size, in Mpc km^-1 s ~ 10^12 yr. -! -! Returns -! ------- -! state : double array (dims) @todo -! The state vars -! 
- use amrex_fort_module, only : rt => amrex_real - use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, & - TEMP_COMP, NE_COMP, gamma_minus_1 - use bl_constants_module, only: M_PI - use eos_params_module - use network - use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT - use fundamental_constants_module - use comoving_module, only: comoving_h, comoving_OmB - use atomic_rates_module, only: tabulate_rates, interp_to_this_z, YHELIUM - use vode_aux_module , only: z_vode, i_vode, j_vode, k_vode - - implicit none - - integer , intent(in) :: lo(3), hi(3) - integer , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3 - integer , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3 - real(rt), intent(inout) :: state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR) - real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2) - real(rt), intent(in) :: a, half_dt - integer , intent(inout) :: max_iter, min_iter - - integer :: i, j, k - real(rt) :: z, rho - real(rt) :: T_orig, ne_orig, e_orig - real(rt) :: T_out , ne_out , e_out, mu, mean_rhob - - z = 1.d0/a - 1.d0 - - z_vode = z - mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst) - - ! Interpolate from the table to this redshift - call interp_to_this_z(z) - - ! Note that (lo,hi) define the region of the box containing the grow cells - ! Do *not* assume this is just the valid region - ! apply heating-cooling to UEDEN and UEINT - - do k = lo(3),hi(3) - do j = lo(2),hi(2) - do i = lo(1),hi(1) - - ! Original values - rho = state(i,j,k,URHO) - e_orig = state(i,j,k,UEINT) / rho - T_orig = diag_eos(i,j,k,TEMP_COMP) - ne_orig = diag_eos(i,j,k, NE_COMP) - - if (e_orig .lt. 0.d0) then - print *,'negative e entering strang integration ',z, i,j,k, rho/mean_rhob, e_orig - call bl_abort('bad e in strang') - end if - - i_vode = i - j_vode = j - k_vode = k - - call vode_wrapper(half_dt,rho,T_orig,ne_orig,e_orig, & - T_out ,ne_out ,e_out) - - if (e_out .lt. 
0.d0) then - print *,'negative e exiting strang integration ',z, i,j,k, rho/mean_rhob, e_out - T_out = 10.0 - ne_out = 0.0 - mu = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out) - e_out = T_out / (gamma_minus_1 * mp_over_kB * mu) - call flush(6) -! call bl_abort('bad e out of strang') - end if - - ! Update (rho e) and (rho E) - state(i,j,k,UEINT) = state(i,j,k,UEINT) + rho * (e_out-e_orig) - state(i,j,k,UEDEN) = state(i,j,k,UEDEN) + rho * (e_out-e_orig) - - ! Update T and ne (do not use stuff computed in f_rhs, per vode manual) - call nyx_eos_T_given_Re(T_out, ne_out, rho, e_out, a) - diag_eos(i,j,k,TEMP_COMP) = T_out - diag_eos(i,j,k, NE_COMP) = ne_out - - end do ! i - end do ! j - end do ! k - -end subroutine integrate_state_vode - -subroutine vode_wrapper(dt, rho_in, T_in, ne_in, e_in, T_out, ne_out, e_out) - - use amrex_fort_module, only : rt => amrex_real - use vode_aux_module, only: rho_vode, T_vode, ne_vode, & - i_vode, j_vode, k_vode - - implicit none - - real(rt), intent(in ) :: dt - real(rt), intent(in ) :: rho_in, T_in, ne_in, e_in - real(rt), intent( out) :: T_out,ne_out,e_out - - ! Set the number of independent variables -- this should be just "e" - integer, parameter :: NEQ = 1 - - ! Allocate storage for the input state - real(rt) :: y(NEQ) - - ! Our problem is stiff, tell ODEPACK that. 21 means stiff, jacobian - ! function is supplied, 22 means stiff, figure out my jacobian through - ! differencing - integer, parameter :: MF_ANALYTIC_JAC = 21, MF_NUMERICAL_JAC = 22 - - ! Tolerance parameters: - ! - ! itol specifies whether to use an single absolute tolerance for - ! all variables (1), or to pass an array of absolute tolerances, one - ! for each variable with a scalar relative tol (2), a scalar absolute - ! and array of relative tolerances (3), or arrays for both (4) - ! - ! The error is determined as e(i) = rtol*abs(y(i)) + atol, and must - ! be > 0. - ! - ! We will use arrays for both the absolute and relative tolerances, - ! 
since we want to be easier on the temperature than the species - - integer, parameter :: ITOL = 1 - real(rt) :: atol(NEQ), rtol(NEQ) - - ! We want to do a normal computation, and get the output values of y(t) - ! after stepping though dt - integer, PARAMETER :: ITASK = 1 - - ! istate determines the state of the calculation. A value of 1 meeans - ! this is the first call to the problem -- this is what we will want. - ! Note, istate is changed over the course of the calculation, so it - ! cannot be a parameter - integer :: istate - - ! we will override the maximum number of steps, so turn on the - ! optional arguments flag - integer, parameter :: IOPT = 1 - - ! declare a real work array of size 22 + 9*NEQ + 2*NEQ**2 and an - ! integer work array of since 30 + NEQ - - integer, parameter :: LRW = 22 + 9*NEQ + 2*NEQ**2 - real(rt) :: rwork(LRW) - real(rt) :: time - ! real(rt) :: dt4 - - integer, parameter :: LIW = 30 + NEQ - integer, dimension(LIW) :: iwork - - real(rt) :: rpar - integer :: ipar - - EXTERNAL jac, f_rhs - - logical, save :: firstCall = .true. - - T_vode = T_in - ne_vode = ne_in - rho_vode = rho_in - - ! We want VODE to re-initialize each time we call it - istate = 1 - - rwork(:) = 0.d0 - iwork(:) = 0 - - ! Set the maximum number of steps allowed (the VODE default is 500) - iwork(6) = 2000 - - ! Initialize the integration time - time = 0.d0 - - ! We will integrate "e" in time. - y(1) = e_in - - ! Set the tolerances. - atol(1) = 1.d-4 * e_in - rtol(1) = 1.d-4 - - ! call the integration routine - call dvode(f_rhs, NEQ, y, time, dt, ITOL, rtol, atol, ITASK, & - istate, IOPT, rwork, LRW, iwork, LIW, jac, MF_NUMERICAL_JAC, & - rpar, ipar) - - e_out = y(1) - T_out = T_vode - ne_out = ne_vode - - if (istate < 0) then - print *, 'istate = ', istate, 'at (i,j,k) ',i_vode,j_vode,k_vode - call bl_error("ERROR in vode_wrapper: integration failed") - endif - -! print *,'Calling vode with 1/4 the time step' -! dt4 = 0.25d0 * dt -! y(1) = e_in - -! do n = 1,4 -! 
call dvode(f_rhs, NEQ, y, time, dt4, ITOL, rtol, atol, ITASK, & -! istate, IOPT, rwork, LRW, iwork, LIW, jac, MF_NUMERICAL_JAC, & -! rpar, ipar) -! if (istate < 0) then -! print *, 'doing subiteration ',n -! print *, 'istate = ', istate, 'at (i,j,k) ',i,j,k -! call bl_error("ERROR in vode_wrapper: sub-integration failed") -! end if - -! end do -! endif - -end subroutine vode_wrapper diff --git a/Source/AGN/AGN_sources.cpp b/Source/AGN/AGN_sources.cpp index ffb4e76b..f7099018 100644 --- a/Source/AGN/AGN_sources.cpp +++ b/Source/AGN/AGN_sources.cpp @@ -17,7 +17,6 @@ Nyx::get_old_source (Real old_time, MultiFab& S_old = get_old_data(State_Type); MultiFab& D_old = get_old_data(DiagEOS_Type); - const int num_comps = S_old.nComp(); ext_src.setVal(0.); @@ -30,8 +29,8 @@ Nyx::get_old_source (Real old_time, Nyx::theAPC()->GetParticleData(part_data); for (FillPatchIterator - Old_fpi (*this, S_old, 4, old_time, State_Type, Density, num_comps), - Old_dfpi(*this, D_old, 4, old_time, DiagEOS_Type, 0, 2); + Old_fpi (*this, S_old, 4, old_time, State_Type , Density, S_old.nComp()), + Old_dfpi(*this, D_old, 4, old_time, DiagEOS_Type, 0 , D_old.nComp()); Old_fpi.isValid(); ++Old_fpi) { @@ -71,7 +70,6 @@ Nyx::get_new_source (Real old_time, MultiFab& S_old = get_old_data(State_Type); MultiFab& D_old = get_old_data(DiagEOS_Type); - const int num_comps = S_old.nComp(); ext_src.setVal(0.); @@ -87,10 +85,10 @@ Nyx::get_new_source (Real old_time, std::cout << "AGN DATA(V) " << part_data[0] << " " << part_data[1] << " " << part_data[2] << std::endl; std::cout << "AGN DATA(A) " << part_data[3] << " " << part_data[4] << " " << part_data[5] << std::endl; - for (FillPatchIterator Old_fpi(*this, S_old, 4, old_time, State_Type, Density, num_comps), - New_fpi(*this, S_old, 4, new_time, State_Type, Density, num_comps), - Old_dfpi(*this, D_old, 4, old_time, DiagEOS_Type, 0, 2), - New_dfpi(*this, D_old, 4, new_time, DiagEOS_Type, 0, 2); + for (FillPatchIterator Old_fpi( *this, S_old, 4, old_time, 
State_Type , Density, S_old.nComp()), + New_fpi( *this, S_old, 4, new_time, State_Type , Density, S_old.nComp()), + Old_dfpi(*this, D_old, 4, old_time, DiagEOS_Type, 0 , D_old.nComp()), + New_dfpi(*this, D_old, 4, new_time, DiagEOS_Type, 0 , D_old.nComp()); Old_fpi.isValid() && New_fpi.isValid() && Old_dfpi.isValid() && New_dfpi.isValid(); ++Old_fpi, ++New_fpi, ++Old_dfpi, ++New_dfpi) { diff --git a/Source/AGN/agn_3d.f90 b/Source/AGN/agn_3d.f90 index 23d111cd..787251eb 100644 --- a/Source/AGN/agn_3d.f90 +++ b/Source/AGN/agn_3d.f90 @@ -15,7 +15,7 @@ subroutine nyx_compute_overlap(np, particles, ng, ghosts, delta_x) & cutoff = delta_x(1) - do i = 1, np + do i = 1, np-1 do j = i+1, np r2 = sum((particles(i)%pos - particles(j)%pos)**2) @@ -205,8 +205,7 @@ subroutine agn_particle_velocity(np, particles, & j = particles(n)%pos(2) / dx(2) k = particles(n)%pos(3) / dx(3) - ! momx, momy, momz, E: momentum and total energy. - + ! momx, momy, momz: momentum = volume x change in momentum density. momx = sum((state_new(i-1:i+1, j-1:j+1, k-1:k+1, UMX) - & state_old(i-1:i+1, j-1:j+1, k-1:k+1, UMX)) * weight) * vol momy = sum((state_new(i-1:i+1, j-1:j+1, k-1:k+1, UMY) - & @@ -224,6 +223,7 @@ subroutine agn_particle_velocity(np, particles, & ! Update particle energy if particle isn't brand new if (add_energy .gt. 0) then + ! E: total energy = volume x change in total energy density. 
E = sum((state_new(i-1:i+1, j-1:j+1, k-1:k+1, UEDEN) - & state_old(i-1:i+1, j-1:j+1, k-1:k+1, UEDEN)) * weight) * vol deltaEnergy = - E / mass @@ -406,7 +406,7 @@ subroutine agn_release_energy(np, particles, & use amrex_fort_module, only : amrex_real use fundamental_constants_module, only: k_B, m_proton use eos_module - use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, NE_COMP + use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, NDIAG, NE_COMP use particle_mod , only: agn_particle_t use eos_module, only : nyx_eos_given_RT use agn_params_module, only : T_min @@ -417,7 +417,7 @@ subroutine agn_release_energy(np, particles, & real(amrex_real), intent(inout) :: state & (slo(1):shi(1),slo(2):shi(2),slo(3):shi(3),NVAR) real(amrex_real), intent(inout) :: diag_eos & - (dlo(1):dhi(1),dlo(2):dhi(2),dlo(3):dhi(3),2) + (dlo(1):dhi(1),dlo(2):dhi(2),dlo(3):dhi(3),NDIAG) real(amrex_real), intent(in ) :: a real(amrex_real), intent(in ) :: dx(3) @@ -443,14 +443,18 @@ subroutine agn_release_energy(np, particles, & call nyx_eos_given_RT(e, pressure, avg_rho, T_min, avg_Ne, a) - print *, 'neighborhood mass: ', m_g - print *, 'e = ', e - print *, 'particle energy: ', particles(n)%energy - print *, 'm_g * e = ', (m_g * e) +! 
print *, 'AGN particle at ', particles(n)%pos, ':', i, j, k + print 50, particles(n)%pos, i, j, k, particles(n)%mass, & + particles(n)%energy +50 format (1x, 'AGN particle at ', 3F8.3, 3I4, ' m=', E12.5, ' e=', E12.5) if (particles(n)%energy > m_g * e) then print *, 'RELEASING ENERGY of particle at ', particles(n)%pos + print *, 'neighborhood mass: ', m_g + print *, 'e = ', e + print *, 'particle energy: ', particles(n)%energy + print *, 'm_g * e = ', (m_g * e) state(i-1:i+1, j-1:j+1, k-1:k+1, UEDEN) = & state(i-1:i+1, j-1:j+1, k-1:k+1, UEDEN) + & diff --git a/Source/DarkMatterParticleContainer.H b/Source/DarkMatterParticleContainer.H index df1014bc..eb649a2e 100644 --- a/Source/DarkMatterParticleContainer.H +++ b/Source/DarkMatterParticleContainer.H @@ -21,6 +21,7 @@ public: } using MyParIter = amrex::ParIter<1+BL_SPACEDIM>; + using MyConstParIter = amrex::ParConstIter<1+BL_SPACEDIM>; virtual ~DarkMatterParticleContainer () {} @@ -39,6 +40,8 @@ public: virtual void moveKick (amrex::MultiFab& acceleration, int level, amrex::Real timestep, amrex::Real a_new = 1.0, amrex::Real a_half = 1.0); + void InitFromBinaryMortonFile(const std::string& particle_directory, int nextra, int skip_factor); + }; #endif /* _DarkMatterParticleContainer_H_ */ diff --git a/Source/DarkMatterParticleContainer.cpp b/Source/DarkMatterParticleContainer.cpp index 3ea2cb65..219a02db 100644 --- a/Source/DarkMatterParticleContainer.cpp +++ b/Source/DarkMatterParticleContainer.cpp @@ -1,8 +1,78 @@ +#include + #include "DarkMatterParticleContainer.H" #include "dm_F.H" using namespace amrex; +/// These are helper functions used when initializing from a morton-ordered +/// binary particle file. 
+namespace { + + inline uint64_t split(unsigned int a) { + uint64_t x = a & 0x1fffff; + x = (x | x << 32) & 0x1f00000000ffff; + x = (x | x << 16) & 0x1f0000ff0000ff; + x = (x | x << 8) & 0x100f00f00f00f00f; + x = (x | x << 4) & 0x10c30c30c30c30c3; + x = (x | x << 2) & 0x1249249249249249; + return x; + } + + inline uint64_t get_morton_index(unsigned int x, + unsigned int y, + unsigned int z) { + uint64_t morton_index = 0; + morton_index |= split(x) | ( split(y) << 1) | (split(z) << 2); + return morton_index; + } + + struct BoxMortonKey { + uint64_t morton_id; + int box_id; + }; + + struct by_morton_id { + bool operator()(const BoxMortonKey &a, const BoxMortonKey &b) { + return a.morton_id < b.morton_id; + } + }; + + std::string get_file_name(const std::string& base, int file_num) { + std::stringstream ss; + ss << base << file_num; + return ss.str(); + } + + struct ParticleMortonFileHeader { + long NP; + int DM; + int NX; + int SZ; + int NF; + }; + + void ReadHeader(const std::string& dir, + const std::string& file, + ParticleMortonFileHeader& hdr) { + std::string header_filename = dir; + header_filename += "/"; + header_filename += file; + + Array fileCharPtr; + ParallelDescriptor::ReadAndBcastFile(header_filename, fileCharPtr); + std::string fileCharPtrString(fileCharPtr.dataPtr()); + std::istringstream HdrFile(fileCharPtrString, std::istringstream::in); + + HdrFile >> hdr.NP; + HdrFile >> hdr.DM; + HdrFile >> hdr.NX; + HdrFile >> hdr.SZ; + HdrFile >> hdr.NF; + } + +} + void DarkMatterParticleContainer::moveKickDrift (amrex::MultiFab& acceleration, int lev, @@ -610,3 +680,108 @@ DarkMatterParticleContainer::AssignDensityAndVels (Array file_names; + for (int i = 0; i < num_files; ++i) + file_names.push_back(get_file_name(particle_file_base, i)); + + const int lev = 0; + const BoxArray& ba = ParticleBoxArray(lev); + int num_boxes = ba.size(); + uint64_t num_parts_per_box = num_parts / num_boxes; + uint64_t num_parts_per_file = num_parts / num_files; + uint64_t 
num_bytes_per_file = num_parts_per_file * psize; + + std::vector box_morton_keys(num_boxes); + for (int i = 0; i < num_boxes; ++i) { + const Box& box = ba[i]; + unsigned int x = box.smallEnd(0); + unsigned int y = box.smallEnd(1); + unsigned int z = box.smallEnd(2); + box_morton_keys[i].morton_id = get_morton_index(x, y, z); + box_morton_keys[i].box_id = i; + } + + std::sort(box_morton_keys.begin(), box_morton_keys.end(), by_morton_id()); + + std::vector file_indices(num_boxes); + for (int i = 0; i < num_boxes; ++i) + file_indices[box_morton_keys[i].box_id] = i; + + ParticleType p; + for (MFIter mfi = MakeMFIter(lev); mfi.isValid(); ++mfi) { + Box tile_box = mfi.tilebox(); + const int grid = mfi.index(); + const int tile = mfi.LocalTileIndex(); + auto& particles = GetParticles(lev); + + uint64_t start = file_indices[grid]*num_parts_per_box; + uint64_t stop = start + num_parts_per_box; + + int file_num = start / num_parts_per_file; + uint64_t seek_pos = (start * psize ) % num_bytes_per_file; + std::string file_name = file_names[file_num]; + + std::ifstream ifs; + ifs.open(file_name.c_str(), std::ios::in|std::ios::binary); + if ( not ifs ) { + amrex::Print() << "Failed to open file " << file_name << " for reading. \n"; + amrex::Abort(); + } + + ifs.seekg(seek_pos, std::ios::beg); + + for (uint64_t i = start; i < stop; ++i) { + int next_file = i / num_parts_per_file; + if (next_file != file_num) { + file_num = next_file; + file_name = file_names[file_num]; + ifs.close(); + ifs.open(file_name.c_str(), std::ios::in|std::ios::binary); + if ( not ifs ) { + amrex::Print() << "Failed to open file " << file_name << " for reading. 
\n"; + amrex::Abort(); + } + } + + float fpos[DM]; + float fextra[NX]; + ifs.read((char*)&fpos[0], DM*sizeof(float)); + ifs.read((char*)&fextra[0], NX*sizeof(float)); + + if ( (i - start) % skip_factor == 0 ) { + AMREX_D_TERM(p.m_rdata.pos[0] = fpos[0];, + p.m_rdata.pos[1] = fpos[1];, + p.m_rdata.pos[2] = fpos[2];); + + for (int comp = 0; comp < NX; comp++) + p.m_rdata.arr[BL_SPACEDIM+comp] = fextra[comp]; + + p.m_rdata.arr[BL_SPACEDIM] *= skip_factor; + + p.m_idata.id = ParticleType::NextID(); + p.m_idata.cpu = ParallelDescriptor::MyProc(); + particles[std::make_pair(grid, tile)].push_back(p); + } + } + } + + Redistribute(); +} + diff --git a/Source/EOS/Make.package b/Source/EOS/Make.package index a9b60772..b03797a6 100644 --- a/Source/EOS/Make.package +++ b/Source/EOS/Make.package @@ -5,3 +5,4 @@ f90EXE_sources += eos_stuff.f90 endif f90EXE_sources += atomic_rates.f90 +f90EXE_sources += reion_aux_module.f90 diff --git a/Source/EOS/atomic_rates.f90 b/Source/EOS/atomic_rates.f90 index f25409bc..425b877c 100644 --- a/Source/EOS/atomic_rates.f90 +++ b/Source/EOS/atomic_rates.f90 @@ -19,14 +19,11 @@ module atomic_rates_module implicit none - ! Routine which acts like a class constructor - public :: tabulate_rates, interp_to_this_z - ! Photo- rates (from file) - integer , parameter , private :: NCOOLFILE=301 - real(rt), dimension(NCOOLFILE), public :: lzr - real(rt), dimension(NCOOLFILE), public :: rggh0, rgghe0, rgghep - real(rt), dimension(NCOOLFILE), public :: reh0, rehe0, rehep + integer, private :: NCOOLFILE + real(rt), dimension(:), allocatable, private :: lzr + real(rt), dimension(:), allocatable, private :: rggh0, rgghe0, rgghep + real(rt), dimension(:), allocatable, private :: reh0, rehe0, rehep ! Other rates (from equations) integer, parameter, public :: NCOOLTAB=2000 @@ -38,10 +35,13 @@ module atomic_rates_module real(rt), public, save :: this_z, ggh0, gghe0, gghep, eh0, ehe0, ehep real(rt), parameter, public :: TCOOLMIN = 0.0d0, TCOOLMAX = 9.0d0 ! 
in log10 + real(rt), parameter, public :: TCOOLMIN_R = 10.0d0**TCOOLMIN, TCOOLMAX_R = 10.0d0**TCOOLMAX real(rt), parameter, public :: deltaT = (TCOOLMAX - TCOOLMIN)/NCOOLTAB real(rt), parameter, public :: MPROTON = 1.6726231d-24, BOLTZMANN = 1.3806e-16 + real(rt), public, save :: uvb_density_A = 1.0d0, uvb_density_B = 0.0d0, mean_rhob + ! Note that XHYDROGEN can be set by a call to set_xhydrogen which now ! lives in set_method_params. real(rt), public :: XHYDROGEN = 0.76d0 @@ -49,20 +49,114 @@ module atomic_rates_module contains - subroutine tabulate_rates() - integer :: i + subroutine fort_tabulate_rates() bind(C, name='fort_tabulate_rates') + use parallel, only: parallel_ioprocessor + use amrex_parmparse_module + use bl_constants_module, only: M_PI + use fundamental_constants_module, only: Gconst + use comoving_module, only: comoving_h,comoving_OmB + use reion_aux_module, only: zhi_flash, zheii_flash, T_zhi, T_zheii, & + flash_h, flash_he, inhomogeneous_on + + integer :: i, inhomo_reion logical, parameter :: Katz96=.false. real(rt), parameter :: t3=1.0d3, t5=1.0d5, t6=1.0d6 - real(rt) :: t, U, E, y, sqrt_t, corr_term + real(rt) :: t, U, E, y, sqrt_t, corr_term, tmp logical, save :: first=.true. - !$OMP CRITICAL(TREECOOL_READ) + character(len=:), allocatable :: file_in + type(amrex_parmparse) :: pp + if (first) then first = .false. - ! Read in photoionization rates and heating from a file - open(unit=11,file='TREECOOL_middle',status='old') + ! 
Get info from inputs + call amrex_parmparse_build(pp, "nyx") + call pp%query("inhomo_reion" , inhomo_reion) + call pp%query("uvb_rates_file" , file_in) + call pp%query("uvb_density_A" , uvb_density_A) + call pp%query("uvb_density_B" , uvb_density_B) + call pp%query("reionization_zHI_flash" , zhi_flash) + call pp%query("reionization_zHeII_flash" , zheii_flash) + call pp%query("reionization_T_zHI" , T_zhi) + call pp%query("reionization_T_zHeII" , T_zheii) + call amrex_parmparse_destroy(pp) + + if (parallel_ioprocessor()) then + print*, 'TABULATE_RATES: reionization parameters are:' + print*, ' reionization_zHI_flash = ', zhi_flash + print*, ' reionization_zHeII_flash = ', zheii_flash + print*, ' reionization_T_zHI = ', T_zhi + print*, ' reionization_T_zHeII = ', T_zheii + + print*, 'TABULATE_RATES: rho-dependent heating parameters are:' + print*, ' A = ', uvb_density_A + print*, ' B = ', uvb_density_B + print*, ' UVB heating rates will be multiplied by A*(rho/rho_mean)**B' + endif + + ! Save mean density (in code units) for density-dependent heating + mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst) + + ! Set options in reion_aux_module + ! Hydrogen reionization + if (zhi_flash .gt. 0.0) then + if (inhomo_reion .gt. 0) then + if (parallel_ioprocessor()) print*, 'TABULATE_RATES: ignoring reionization_zHI, as nyx.inhomo_reion > 0' + flash_h = .false. + inhomogeneous_on = .true. + else + flash_h = .true. + inhomogeneous_on = .false. + endif + else + flash_h = .false. + if (inhomo_reion .gt. 0) then + inhomogeneous_on = .true. + else + inhomogeneous_on = .false. + endif + endif + + ! Helium reionization + if (zheii_flash .gt. 0.0) then + flash_he = .true. + else + flash_he = .false. + endif + + if (parallel_ioprocessor()) then + print*, 'TABULATE_RATES: reionization flags are set to:' + print*, ' Hydrogen flash = ', flash_h + print*, ' Helium flash = ', flash_he + print*, ' inhomogeneous_on (H only) = ', inhomogeneous_on + endif + + + ! 
Read in UVB rates from a file + if (len(file_in) .gt. 0) then + open(unit=11, file=file_in, status='old') + if (parallel_ioprocessor()) then + print*, 'TABULATE_RATES: UVB file is set in inputs ('//file_in//').' + endif + else + open(unit=11, file='TREECOOL', status='old') + if (parallel_ioprocessor()) then + print*, 'TABULATE_RATES: UVB file is defaulted to "TREECOOL".' + endif + endif + + NCOOLFILE = 0 + do + read(11,*,end=10) tmp, tmp, tmp, tmp, tmp, tmp, tmp + NCOOLFILE = NCOOLFILE + 1 + end do + 10 rewind(11) + + allocate( lzr(NCOOLFILE), rggh0(NCOOLFILE), rgghe0(NCOOlFILE), rgghep(NCOOLFILE) ) + allocate( reh0(NCOOLFILE), rehe0(NCOOLFILE), rehep(NCOOLFILE) ) + do i = 1, NCOOLFILE read(11,*) lzr(i), rggh0(i), rgghe0(i), rgghep(i), & reh0(i), rehe0(i), rehep(i) @@ -177,19 +271,21 @@ subroutine tabulate_rates() endif ! Katz rates end if ! first_call - !$OMP END CRITICAL(TREECOOL_READ) - end subroutine tabulate_rates + end subroutine fort_tabulate_rates ! **************************************************************************** - subroutine interp_to_this_z(z) + subroutine fort_interp_to_this_z(z) bind(C, name='fort_interp_to_this_z') + + use vode_aux_module, only: z_vode real(rt), intent(in) :: z real(rt) :: lopz, fact integer :: i, j this_z = z + z_vode = z lopz = dlog10(1.0d0 + z) if (lopz .ge. lzr(NCOOLFILE)) then @@ -222,26 +318,6 @@ subroutine interp_to_this_z(z) ehe0 = rehe0(j) + (rehe0(j+1)-rehe0(j))*fact ehep = rehep(j) + (rehep(j+1)-rehep(j))*fact - end subroutine interp_to_this_z + end subroutine fort_interp_to_this_z end module atomic_rates_module - -! ************************************************************************************* -! This must live outside of atomic_rates module so it can be called by the C++ -! 
************************************************************************************* - -subroutine fort_init_this_z(comoving_a) & - bind(C, name="fort_init_this_z") - - use amrex_fort_module, only : rt => amrex_real - use atomic_rates_module - - implicit none - - real(rt), intent(in ) :: comoving_a - real(rt) :: z - - z = 1.d0/comoving_a - 1.d0 - call interp_to_this_z(z) - -end subroutine fort_init_this_z diff --git a/Source/EOS/eos_hc.f90 b/Source/EOS/eos_hc.f90 index 1f2ffaba..d3274293 100644 --- a/Source/EOS/eos_hc.f90 +++ b/Source/EOS/eos_hc.f90 @@ -10,16 +10,34 @@ module eos_module use amrex_fort_module, only : rt => amrex_real + use iso_c_binding, only: c_double implicit none ! Routines: - public :: nyx_eos_given_RT, nyx_eos_T_given_Re, eos_init_small_pres - public :: nyx_eos_nh0_and_nhep, iterate_ne + public :: nyx_eos_given_RT, nyx_eos_given_RT_vec, nyx_eos_T_given_Re, nyx_eos_T_given_Re_vec, eos_init_small_pres + public :: nyx_eos_nh0_and_nhep, iterate_ne, iterate_ne_vec private :: ion_n + real(rt), public :: xacc ! EOS Newton-Raphson convergence tolerance + real(c_double), public :: vode_rtol, vode_atol_scaled ! VODE integration tolerances + contains + subroutine fort_setup_eos_params (xacc_in, vode_rtol_in, vode_atol_scaled_in) & + bind(C, name='fort_setup_eos_params') + use amrex_fort_module, only : rt => amrex_real + implicit none + real(rt), intent(in) :: xacc_in, vode_rtol_in, vode_atol_scaled_in + + xacc = xacc_in + vode_rtol = vode_rtol_in + vode_atol_scaled = vode_atol_scaled_in + + end subroutine fort_setup_eos_params + + ! 
**************************************************************************** + subroutine eos_init_small_pres(R, T, Ne, P, a) use amrex_fort_module, only : rt => amrex_real @@ -92,11 +110,11 @@ subroutine nyx_eos_given_RT(e, P, R, T, Ne, a) use meth_params_module, only: gamma_minus_1 implicit none - real(rt), intent( out) :: e, P - real(rt), intent(in ) :: R, T, Ne - real(rt), intent(in ) :: a + double precision, intent( out) :: e, P + double precision, intent(in ) :: R, T, Ne + double precision, intent(in ) :: a - real(rt) :: mu + double precision :: mu mu = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+Ne) e = T / (gamma_minus_1 * mp_over_kB * mu) @@ -105,20 +123,48 @@ subroutine nyx_eos_given_RT(e, P, R, T, Ne, a) end subroutine nyx_eos_given_RT - ! **************************************************************************** + ! **************************************************************************** + + subroutine nyx_eos_given_RT_vec(e, P, R, T, Ne, a, veclen) + + use atomic_rates_module, ONLY: YHELIUM + use fundamental_constants_module, only: mp_over_kb + use meth_params_module, only: gamma_minus_1 + implicit none + + integer, intent(in) :: veclen + real(rt), dimension(veclen), intent( out) :: e, P + real(rt), dimension(veclen), intent(in ) :: R, T, Ne + real(rt), intent(in ) :: a + + real(rt), dimension(veclen) :: mu + integer :: i - subroutine nyx_eos_T_given_Re(T, Ne, R_in, e_in, a) + do i = 1, veclen + mu(i) = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+Ne(i)) + e(i) = T(i) / (gamma_minus_1 * mp_over_kB * mu(i)) + + P(i) = gamma_minus_1 * R(i) * e(i) + end do + + end subroutine nyx_eos_given_RT_vec + + ! **************************************************************************** + + subroutine nyx_eos_T_given_Re(JH, JHe, T, Ne, R_in, e_in, a, species) use atomic_rates_module, ONLY: XHYDROGEN, MPROTON use fundamental_constants_module, only: density_to_cgs, e_to_cgs ! 
In/out variables - real(rt), intent(inout) :: T, Ne - real(rt), intent(in ) :: R_in, e_in - real(rt), intent(in ) :: a + integer, intent(in) :: JH, JHe + real(rt), intent(inout) :: T, Ne + real(rt), intent(in ) :: R_in, e_in + real(rt), intent(in ) :: a + real(rt), optional, intent(out) :: species(5) - real(rt) :: nh, nh0, nhep, nhp, nhe0, nhepp - real(rt) :: z, rho, U + double precision :: nh, nh0, nhep, nhp, nhe0, nhepp + double precision :: z, rho, U ! This converts from code units to CGS rho = R_in * density_to_cgs / a**3 @@ -127,18 +173,55 @@ subroutine nyx_eos_T_given_Re(T, Ne, R_in, e_in, a) z = 1.d0/a - 1.d0 - call iterate_ne(z, U, T, nh, ne, nh0, nhp, nhe0, nhep, nhepp) + call iterate_ne(JH, Jhe, z, U, T, nh, ne, nh0, nhp, nhe0, nhep, nhepp) + + if (present(species)) then + species(1) = nh0 + species(2) = nhp + species(3) = nhe0 + species(4) = nhep + species(5) = nhepp + endif end subroutine nyx_eos_T_given_Re - ! **************************************************************************** + ! **************************************************************************** - subroutine nyx_eos_nh0_and_nhep(z, rho, e, nh0, nhep) - ! This is for skewers analysis code, input is in CGS + subroutine nyx_eos_T_given_Re_vec(T, Ne, R_in, e_in, a, veclen) + use amrex_fort_module, only : rt => amrex_real use atomic_rates_module, ONLY: XHYDROGEN, MPROTON + use fundamental_constants_module, only: density_to_cgs, e_to_cgs + + ! In/out variables + integer, intent(in) :: veclen + real(rt), dimension(veclen), intent(inout) :: T, Ne + real(rt), dimension(veclen), intent(in ) :: R_in, e_in + real(rt), intent(in ) :: a + + real(rt), dimension(veclen) :: nh, nh0, nhep, nhp, nhe0, nhepp, rho, U + real(rt) :: z + + ! 
This converts from code units to CGS + rho = R_in * density_to_cgs / a**3 + U = e_in * e_to_cgs + nh = rho*XHYDROGEN/MPROTON + + z = 1.d0/a - 1.d0 + + call iterate_ne_vec(z, U, T, nh, ne, nh0, nhp, nhe0, nhep, nhepp, veclen) + + end subroutine nyx_eos_T_given_Re_vec + + ! **************************************************************************** + + subroutine nyx_eos_nh0_and_nhep(JH, JHe, z, rho, e, nh0, nhep) + ! This is for skewers analysis code, input is in CGS + + use atomic_rates_module, only: XHYDROGEN, MPROTON ! In/out variables + integer, intent(in) :: JH, Jhe real(rt), intent(in ) :: z, rho, e real(rt), intent( out) :: nh0, nhep @@ -147,34 +230,308 @@ subroutine nyx_eos_nh0_and_nhep(z, rho, e, nh0, nhep) nh = rho*XHYDROGEN/MPROTON ne = 1.0d0 ! Guess - call iterate_ne(z, e, T, nh, ne, nh0, nhp, nhe0, nhep, nhepp) + call iterate_ne(JH, JHe, z, e, T, nh, ne, nh0, nhp, nhe0, nhep, nhepp) nh0 = nh*nh0 nhep = nh*nhep end subroutine nyx_eos_nh0_and_nhep - ! **************************************************************************** + ! 
**************************************************************************** + + subroutine iterate_ne_vec(z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp, veclen) + + use atomic_rates_module, ONLY: this_z, YHELIUM, BOLTZMANN, MPROTON, TCOOLMAX_R + use meth_params_module, only: gamma_minus_1 + use amrex_error_module, only: amrex_abort + + integer :: i + + integer, intent(in) :: veclen + real(rt), intent (in ) :: z + real(rt), dimension(veclen), intent(in) :: U, nh + real(rt), dimension(veclen), intent (inout) :: ne + real(rt), dimension(veclen), intent ( out) :: t, nh0, nhp, nhe0, nhep, nhepp + + real(rt), parameter :: xacc = 1.0d-6 + + integer, dimension(veclen) :: JH, JHe + real(rt), dimension(veclen) :: f, df, eps, mu + real(rt), dimension(veclen) :: nhp_plus, nhep_plus, nhepp_plus + real(rt), dimension(veclen) :: dnhp_dne, dnhep_dne, dnhepp_dne, dne + real(rt), dimension(veclen):: U_in, t_in, nh_in, ne_in + real(rt), dimension(veclen) :: nhp_out, nhep_out, nhepp_out + integer :: vec_count, orig_idx(veclen) + integer :: ii + character(len=128) :: errmsg + + ! Check if we have interpolated to this z + if (abs(z-this_z) .gt. xacc*z) then + write(errmsg, *) "iterate_ne_vec(): Wrong redshift! z = ", z, " but this_z = ", this_z + call amrex_abort(errmsg) + end if + + ii = 0 + ne(1:veclen) = 1.0d0 ! 0 is a bad guess + + do ! Newton-Raphson solver + ii = ii + 1 + + ! Ion number densities + do i = 1, veclen + mu(i) = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne(i)) + t(i) = gamma_minus_1*MPROTON/BOLTZMANN * U(i) * mu(i) + end do + vec_count = 0 + do i = 1, veclen + if (t(i) .ge. TCOOLMAX_R) then ! 
Fully ionized plasma + nhp(i) = 1.0d0 + nhep(i) = 0.0d0 + nhepp(i) = YHELIUM + else + vec_count = vec_count + 1 + U_in(vec_count) = U(i) + t_in(vec_count) = t(i) + nh_in(vec_count) = nh(i) + ne_in(vec_count) = ne(i) + orig_idx(vec_count) = i + endif + end do + + call ion_n_vec(JH(1:vec_count), & + JHe(1:vec_count), & + U_in(1:vec_count), & + nh_in(1:vec_count), & + ne_in(1:vec_count), & + nhp_out(1:vec_count), & + nhep_out(1:vec_count), & + nhepp_out(1:vec_count), & + t_in(1:vec_count), & + vec_count) + nhp(orig_idx(1:vec_count)) = nhp_out(1:vec_count) + nhep(orig_idx(1:vec_count)) = nhep_out(1:vec_count) + nhepp(orig_idx(1:vec_count)) = nhepp_out(1:vec_count) + + ! Forward difference derivatives + do i = 1, veclen + if (ne(i) .gt. 0.0d0) then + eps(i) = xacc*ne(i) + else + eps(i) = 1.0d-24 + endif + end do + do i = 1, veclen + mu(i) = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne(i)+eps(i)) + t(i) = gamma_minus_1*MPROTON/BOLTZMANN * U(i) * mu(i) + end do + vec_count = 0 + do i = 1, veclen + if (t(i) .ge. TCOOLMAX_R) then ! 
Fully ionized plasma + nhp_plus(i) = 1.0d0 + nhep_plus(i) = 0.0d0 + nhepp_plus(i) = YHELIUM + else + vec_count = vec_count + 1 + U_in(vec_count) = U(i) + t_in(vec_count) = t(i) + nh_in(vec_count) = nh(i) + ne_in(vec_count) = ne(i)+eps(i) + orig_idx(vec_count) = i + endif + end do + + call ion_n_vec(JH(1:vec_count), & + JHe(1:vec_count), & + U_in(1:vec_count), & + nh_in(1:vec_count), & + ne_in(1:vec_count), & + nhp_out(1:vec_count), & + nhep_out(1:vec_count), & + nhepp_out(1:vec_count), & + t_in(1:vec_count), & + vec_count) + nhp_plus(orig_idx(1:vec_count)) = nhp_out(1:vec_count) + nhep_plus(orig_idx(1:vec_count)) = nhep_out(1:vec_count) + nhepp_plus(orig_idx(1:vec_count)) = nhepp_out(1:vec_count) + + do i = 1, veclen + dnhp_dne(i) = (nhp_plus(i) - nhp(i)) / eps(i) + dnhep_dne(i) = (nhep_plus(i) - nhep(i)) / eps(i) + dnhepp_dne(i) = (nhepp_plus(i) - nhepp(i)) / eps(i) + end do + + do i = 1, veclen + f(i) = ne(i) - nhp(i) - nhep(i) - 2.0d0*nhepp(i) + df(i) = 1.0d0 - dnhp_dne(i) - dnhep_dne(i) - 2.0d0*dnhepp_dne(i) + dne(i) = f(i)/df(i) + end do + + do i = 1, veclen + ne(i) = max((ne(i)-dne(i)), 0.0d0) + end do + + if (maxval(abs(dne(1:veclen))) < xacc) exit + + if (ii .gt. 15) & + STOP 'iterate_ne_vec(): No convergence in Newton-Raphson!' + + enddo + + ! Get rates for the final ne + do i = 1, veclen + mu(i) = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne(i)) + t(i) = gamma_minus_1*MPROTON/BOLTZMANN * U(i) * mu(i) + end do + vec_count = 0 + do i = 1, veclen + if (t(i) .ge. TCOOLMAX_R) then ! 
Fully ionized plasma + nhp(i) = 1.0d0 + nhep(i) = 0.0d0 + nhepp(i) = YHELIUM + else + vec_count = vec_count + 1 + U_in(vec_count) = U(i) + t_in(vec_count) = t(i) + nh_in(vec_count) = nh(i) + ne_in(vec_count) = ne(i) + orig_idx(vec_count) = i + endif + end do + call ion_n_vec(JH(1:vec_count), & + JHe(1:vec_count), & + U_in(1:vec_count), & + nh_in(1:vec_count), & + ne_in(1:vec_count), & + nhp_out(1:vec_count), & + nhep_out(1:vec_count), & + nhepp_out(1:vec_count), & + t_in(1:vec_count), & + vec_count) + nhp(orig_idx(1:vec_count)) = nhp_out(1:vec_count) + nhep(orig_idx(1:vec_count)) = nhep_out(1:vec_count) + nhepp(orig_idx(1:vec_count)) = nhepp_out(1:vec_count) + + ! Neutral fractions: + do i = 1, veclen + nh0(i) = 1.0d0 - nhp(i) + nhe0(i) = YHELIUM - (nhep(i) + nhepp(i)) + end do + end subroutine iterate_ne_vec + + ! **************************************************************************** + + subroutine ion_n_vec(JH, JHe, U, nh, ne, nhp, nhep, nhepp, t, vec_count) + + use amrex_fort_module, only : rt => amrex_real + use meth_params_module, only: gamma_minus_1 + use atomic_rates_module, ONLY: YHELIUM, MPROTON, BOLTZMANN, & + TCOOLMIN, TCOOLMAX, NCOOLTAB, deltaT, & + AlphaHp, AlphaHep, AlphaHepp, Alphad, & + GammaeH0, GammaeHe0, GammaeHep, & + ggh0, gghe0, gghep + + integer, intent(in) :: vec_count + integer, dimension(vec_count), intent(in) :: JH, JHe + real(rt), intent(in ) :: U(vec_count), nh(vec_count), ne(vec_count) + real(rt), intent( out) :: nhp(vec_count), nhep(vec_count), nhepp(vec_count), t(vec_count) + real(rt) :: ahp(vec_count), ahep(vec_count), ahepp(vec_count), ad(vec_count), geh0(vec_count), gehe0(vec_count), gehep(vec_count) + real(rt) :: ggh0ne(vec_count), gghe0ne(vec_count), gghepne(vec_count) + real(rt) :: mu(vec_count), tmp(vec_count), logT(vec_count), flo(vec_count), fhi(vec_count) + real(rt), parameter :: smallest_val=tiny(1.0d0) + integer :: j(vec_count), i + + mu(:) = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne(:)) + t(:) = 
gamma_minus_1*MPROTON/BOLTZMANN * U(:) * mu(:) + + logT(1:vec_count) = dlog10(t(1:vec_count)) + + ! Temperature floor + do i = 1, vec_count + if (logT(i) .le. TCOOLMIN) logT(i) = TCOOLMIN + 0.5d0*deltaT + end do + + ! Interpolate rates + do i = 1, vec_count + tmp(i) = (logT(i)-TCOOLMIN)/deltaT + j(i) = int(tmp(i)) + fhi(i) = tmp(i) - j(i) + flo(i) = 1.0d0 - fhi(i) + j(i) = j(i) + 1 ! F90 arrays start with 1 + end do + + do i = 1, vec_count + ahp(i) = flo(i)*AlphaHp (j(i)) + fhi(i)*AlphaHp (j(i)+1) + ahep(i) = flo(i)*AlphaHep (j(i)) + fhi(i)*AlphaHep (j(i)+1) + ahepp(i) = flo(i)*AlphaHepp(j(i)) + fhi(i)*AlphaHepp(j(i)+1) + ad(i) = flo(i)*Alphad (j(i)) + fhi(i)*Alphad (j(i)+1) + geh0(i) = flo(i)*GammaeH0 (j(i)) + fhi(i)*GammaeH0 (j(i)+1) + gehe0(i) = flo(i)*GammaeHe0(j(i)) + fhi(i)*GammaeHe0(j(i)+1) + gehep(i) = flo(i)*GammaeHep(j(i)) + fhi(i)*GammaeHep(j(i)+1) + end do + + do i = 1, vec_count + if (ne(i) .gt. 0.0d0) then + ggh0ne(i) = JH(i) * ggh0 / (ne(i)*nh(i)) + gghe0ne(i) = JH(i) * gghe0 / (ne(i)*nh(i)) + gghepne(i) = JHe(i) * gghep / (ne(i)*nh(i)) + else + ggh0ne(i) = 0.0d0 + gghe0ne(i) = 0.0d0 + gghepne(i) = 0.0d0 + endif + end do + + ! H+ + do i = 1, vec_count + nhp(i) = 1.0d0 - ahp(i)/(ahp(i) + geh0(i) + ggh0ne(i)) + end do + + ! He+ + do i = 1, vec_count + if ((gehe0(i) + gghe0ne(i)) .gt. smallest_val) then + + nhep(i) = YHELIUM/(1.0d0 + (ahep(i) + ad(i) )/(gehe0(i) + gghe0ne(i)) & + + (gehep(i) + gghepne(i))/ahepp(i)) + else + nhep(i) = 0.0d0 + endif + end do + + ! He++ + do i = 1, vec_count + if (nhep(i) .gt. 0.0d0) then + nhepp(i) = nhep(i)*(gehep(i) + gghepne(i))/ahepp(i) + else + nhepp(i) = 0.0d0 + endif + end do - subroutine iterate_ne(z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp) + end subroutine ion_n_vec - use atomic_rates_module, ONLY: this_z, YHELIUM + ! 
**************************************************************************** + + subroutine iterate_ne(JH, JHe, z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp) + + use amrex_error_module, only: amrex_abort + use atomic_rates_module, only: this_z, YHELIUM integer :: i + integer, intent(in) :: JH, JHe real(rt), intent (in ) :: z, U, nh real(rt), intent (inout) :: ne real(rt), intent ( out) :: t, nh0, nhp, nhe0, nhep, nhepp - real(rt), parameter :: xacc = 1.0d-6 - real(rt) :: f, df, eps real(rt) :: nhp_plus, nhep_plus, nhepp_plus real(rt) :: dnhp_dne, dnhep_dne, dnhepp_dne, dne + character(len=128) :: errmsg ! Check if we have interpolated to this z - if (abs(z-this_z) .gt. xacc*z) & - STOP 'iterate_ne(): Wrong redshift!' + if (abs(z-this_z) .gt. xacc*z) then + write(errmsg, *) "iterate_ne(): Wrong redshift! z = ", z, " but this_z = ", this_z + call amrex_abort(errmsg) + end if i = 0 ne = 1.0d0 ! 0 is a bad guess @@ -182,7 +539,7 @@ subroutine iterate_ne(z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp) i = i + 1 ! Ion number densities - call ion_n(U, nh, ne, nhp, nhep, nhepp, t) + call ion_n(JH, JHe, U, nh, ne, nhp, nhep, nhepp, t) ! Forward difference derivatives if (ne .gt. 0.0d0) then @@ -190,7 +547,7 @@ subroutine iterate_ne(z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp) else eps = 1.0d-24 endif - call ion_n(U, nh, (ne+eps), nhp_plus, nhep_plus, nhepp_plus, t) + call ion_n(JH, JHe, U, nh, (ne+eps), nhp_plus, nhep_plus, nhepp_plus, t) dnhp_dne = (nhp_plus - nhp) / eps dnhep_dne = (nhep_plus - nhep) / eps @@ -212,32 +569,34 @@ subroutine iterate_ne(z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp) enddo ! Get rates for the final ne - call ion_n(U, nh, ne, nhp, nhep, nhepp, t) + call ion_n(JH, JHe, U, nh, ne, nhp, nhep, nhepp, t) ! Neutral fractions: nh0 = 1.0d0 - nhp nhe0 = YHELIUM - (nhep + nhepp) end subroutine iterate_ne - ! **************************************************************************** + ! 
**************************************************************************** - subroutine ion_n(U, nh, ne, nhp, nhep, nhepp, t) + subroutine ion_n(JH, JHe, U, nh, ne, nhp, nhep, nhepp, t) - use meth_params_module, only: gamma_minus_1 - use atomic_rates_module, ONLY: YHELIUM, MPROTON, BOLTZMANN, & + use meth_params_module, only: gamma_minus_1 + use atomic_rates_module, only: YHELIUM, MPROTON, BOLTZMANN, & TCOOLMIN, TCOOLMAX, NCOOLTAB, deltaT, & AlphaHp, AlphaHep, AlphaHepp, Alphad, & GammaeH0, GammaeHe0, GammaeHep, & ggh0, gghe0, gghep + integer, intent(in) :: JH, JHe real(rt), intent(in ) :: U, nh, ne real(rt), intent( out) :: nhp, nhep, nhepp, t real(rt) :: ahp, ahep, ahepp, ad, geh0, gehe0, gehep real(rt) :: ggh0ne, gghe0ne, gghepne real(rt) :: mu, tmp, logT, flo, fhi - real(rt) :: smallest_val + real(rt), parameter :: smallest_val=tiny(1.0d0) integer :: j + mu = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne) t = gamma_minus_1*MPROTON/BOLTZMANN * U * mu @@ -268,9 +627,9 @@ subroutine ion_n(U, nh, ne, nhp, nhep, nhepp, t) gehep = flo*GammaeHep(j) + fhi*GammaeHep(j+1) if (ne .gt. 0.0d0) then - ggh0ne = ggh0 /(ne*nh) - gghe0ne = gghe0/(ne*nh) - gghepne = gghep/(ne*nh) + ggh0ne = JH * ggh0 / (ne*nh) + gghe0ne = JH * gghe0 / (ne*nh) + gghepne = JHe * gghep / (ne*nh) else ggh0ne = 0.0d0 gghe0ne = 0.0d0 @@ -281,7 +640,6 @@ subroutine ion_n(U, nh, ne, nhp, nhep, nhepp, t) nhp = 1.0d0 - ahp/(ahp + geh0 + ggh0ne) ! He+ - smallest_val = Tiny(1.0d0) if ((gehe0 + gghe0ne) .gt. 
smallest_val) then nhep = YHELIUM/(1.0d0 + (ahep + ad )/(gehe0 + gghe0ne) & @@ -299,4 +657,5 @@ subroutine ion_n(U, nh, ne, nhp, nhep, nhepp, t) end subroutine ion_n + end module eos_module diff --git a/Source/EOS/eos_stuff.f90 b/Source/EOS/eos_stuff.f90 index ebcf86a4..5eab4309 100644 --- a/Source/EOS/eos_stuff.f90 +++ b/Source/EOS/eos_stuff.f90 @@ -43,8 +43,8 @@ module eos_module private nspec, aion, zion - public eos_init_small_pres, nyx_eos_T_given_Re, nyx_eos_S_given_Re, & - nyx_eos_soundspeed, nyx_eos_given_RT, eos + public eos_init_small_pres, nyx_eos_T_given_Re, nyx_eos_T_given_Re_vec, nyx_eos_S_given_Re, & + nyx_eos_soundspeed, nyx_eos_given_RT, nyx_eos_given_RT_vec, eos contains @@ -119,11 +119,12 @@ subroutine nyx_eos_soundspeed(c, R, e) end subroutine nyx_eos_soundspeed - subroutine nyx_eos_T_given_Re(T, Ne, R, e, comoving_a) + subroutine nyx_eos_T_given_Re(JH, JHe, T, Ne, R, e, comoving_a) use amrex_fort_module, only : rt => amrex_real ! In/out variables + integer, intent(in) :: JH, JHe ! stubs here real(rt), intent(inout) :: T, Ne real(rt), intent(in ) :: R, e real(rt), intent(in ) :: comoving_a @@ -390,4 +391,36 @@ subroutine eos(input, dens, temp, & end subroutine eos + + subroutine nyx_eos_T_given_Re_vec(T, Ne, R_in, e_in, a, veclen) + + use amrex_fort_module, only : rt => amrex_real + use amrex_error_module, only: amrex_abort + + ! 
In/out variables + integer, intent(in) :: veclen + real(rt), dimension(veclen), intent(inout) :: T, Ne + real(rt), dimension(veclen), intent(in ) :: R_in, e_in + real(rt), intent(in ) :: a + + call amrex_abort("nyx_eos_T_given_Re_vec supported only with USE_HEATCOOL=TRUE and USE_CVODE=TRUE") + + end subroutine nyx_eos_T_given_Re_vec + + + subroutine nyx_eos_given_RT_vec(e, P, R, T, Ne, a, veclen) + + use amrex_fort_module, only : rt => amrex_real + use amrex_error_module, only: amrex_abort + implicit none + + integer, intent(in) :: veclen + real(rt), dimension(veclen), intent( out) :: e, P + real(rt), dimension(veclen), intent(in ) :: R, T, Ne + real(rt), intent(in ) :: a + + call amrex_abort("nyx_eos_given_RT_vec supported only with USE_HEATCOOL=TRUE and USE_CVODE=TRUE") + + end subroutine nyx_eos_given_RT_vec + end module eos_module diff --git a/Source/EOS/reion_aux_module.f90 b/Source/EOS/reion_aux_module.f90 new file mode 100644 index 00000000..5b84345b --- /dev/null +++ b/Source/EOS/reion_aux_module.f90 @@ -0,0 +1,10 @@ +module reion_aux_module + + use amrex_fort_module, only : rt => amrex_real + implicit none + + ! Global variables (re)set on inputs + real(rt), save :: zhi_flash=-1.0, zheii_flash=-1.0, T_zhi=0.0, T_zheii=0.0 + logical, save :: flash_h=.false., flash_he=.false., inhomogeneous_on=.false. 
+ +end module reion_aux_module diff --git a/Source/Forcing/ext_src_force_3d.f90 b/Source/Forcing/ext_src_force_3d.f90 index 8db2dff0..26a7abe7 100644 --- a/Source/Forcing/ext_src_force_3d.f90 +++ b/Source/Forcing/ext_src_force_3d.f90 @@ -38,7 +38,6 @@ subroutine ext_src_force(lo, hi, old_state, os_l1, os_l2, os_l3, os_h1, os_h2, o use amrex_fort_module, only : rt => amrex_real use meth_params_module, only : NVAR, UMX, UMY, UMZ, UEDEN, UEINT use fundamental_constants_module - use atomic_rates_module, only: interp_to_this_z implicit none diff --git a/Source/Forcing/integrate_state_force_3d.f90 b/Source/Forcing/integrate_state_force_3d.f90 index 70819f40..f9b82620 100644 --- a/Source/Forcing/integrate_state_force_3d.f90 +++ b/Source/Forcing/integrate_state_force_3d.f90 @@ -37,7 +37,7 @@ subroutine integrate_state_force(lo, hi, & use atomic_rates_module, only: XHYDROGEN use probdata_module, only: prob_lo, prob_hi, alpha, rho0, temp0 use meth_params_module, only : NVAR, URHO, UMX, UMY, UMZ, UEDEN, UEINT, & - TEMP_COMP, NE_COMP, small_pres, small_temp, gamma_minus_1 + NDIAG, TEMP_COMP, NE_COMP, small_pres, small_temp, gamma_minus_1 use bl_constants_module, only : TWO, ONE, HALF, ZERO, M_PI, M_SQRT_2 use fundamental_constants_module @@ -50,7 +50,7 @@ subroutine integrate_state_force(lo, hi, & integer , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3 integer , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3 real(rt), intent(inout) :: state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR) - real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2) + real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG) real(rt), intent(in) :: dx(3), time, a, half_dt integer :: i, j, k diff --git a/Source/Gravity/Gravity.H b/Source/Gravity/Gravity.H index 0a19aab3..b6c0086f 100644 --- a/Source/Gravity/Gravity.H +++ b/Source/Gravity/Gravity.H @@ -147,7 +147,6 @@ protected: amrex::BCRec* phys_bc; static int verbose; - static int show_timings; static int no_sync; static 
int no_composite; static int dirichlet_bcs; diff --git a/Source/HeatCool/Make.package b/Source/HeatCool/Make.package index 4d690d8f..7ea7acf0 100644 --- a/Source/HeatCool/Make.package +++ b/Source/HeatCool/Make.package @@ -1,13 +1,14 @@ ifeq ($(USE_HEATCOOL), TRUE) -f90EXE_sources += cooling.f90 f90EXE_sources += ext_src_hc_3d.f90 f90EXE_sources += integrate_state_3d.f90 -f90EXE_sources += integrate_state_hc_3d.f90 f90EXE_sources += integrate_state_vode_3d.f90 ifeq ($(USE_CVODE), TRUE) + f90EXE_sources += fcvode_extras.f90 f90EXE_sources += integrate_state_fcvode_3d.f90 + f90EXE_sources += integrate_state_fcvode_vec_3d.f90 else f90EXE_sources += integrate_state_fcvode_3d_stubs.f90 + f90EXE_sources += integrate_state_fcvode_vec_3d_stubs.f90 endif f90EXE_sources += vode_aux.f90 f90EXE_sources += f_rhs.f90 diff --git a/Source/HeatCool/cooling.f90 b/Source/HeatCool/cooling.f90 deleted file mode 100644 index 2ccd5bda..00000000 --- a/Source/HeatCool/cooling.f90 +++ /dev/null @@ -1,115 +0,0 @@ -! Calculates cooling (H & He) + UV heating rates. -! -! Working units are CGS here, temperature is in K -! - -module heating_cooling_module - - use amrex_fort_module, only : rt => amrex_real - - implicit none - - public :: hc_rates - - contains - - subroutine hc_rates(z, R_in, e_in, t, ne, energy, prnt_d) - - use fundamental_constants_module, only: e_to_cgs, density_to_cgs, & - heat_from_cgs - use eos_module, only: iterate_ne - use atomic_rates_module, ONLY: TCOOLMIN, TCOOLMAX, NCOOLTAB, deltaT, & - MPROTON, XHYDROGEN, & - AlphaHp, AlphaHep, AlphaHepp, Alphad, & - GammaeH0, GammaeHe0, GammaeHep, & - BetaH0, BetaHe0, BetaHep, Betaff1, Betaff4, & - RecHp, RecHep, RecHepp, & - eh0, ehe0, ehep - - real(rt), intent(in ) :: z, R_in, e_in - real(rt), intent(inout) :: t, ne - real(rt), intent( out) :: energy - logical, intent(in) :: prnt_d ! 
for diagnostics print - - real(rt), parameter :: compt_c = 1.01765467d-37, T_cmb = 2.725d0 - - real(rt) :: logT, tmp, fhi, flo - real(rt) :: ahp, ahep, ahepp, ad, geh0, gehe0, gehep - real(rt) :: bh0, bhe0, bhep, bff1, bff4, rhp, rhep, rhepp - real(rt) :: lambda_c, lambda_ff, lambda, heat - real(rt) :: rho, U - real(rt) :: nh, nh0, nhp, nhe0, nhep, nhepp - integer :: j - - - ! Converts from code units to CGS - rho = R_in * density_to_cgs * (1.0d0+z)**3 - U = e_in * e_to_cgs - nh = rho*XHYDROGEN/MPROTON - - ! Get gas temperature and individual ionization species - call iterate_ne(z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp) - - ! Convert species to CGS units: - ne = nh * ne - nh0 = nh * nh0 - nhp = nh * nhp - nhe0 = nh * nhe0 - nhep = nh * nhep - nhepp = nh * nhepp - - logT = dlog10(t) - if (logT .ge. TCOOLMAX) then ! Only free-free and Compton cooling are relevant - lambda_ff = 1.42d-27 * dsqrt(t) * (1.1d0 + 0.34d0*dexp(-(5.5d0 - logT)**2 / 3.0d0)) & - * (nhp + 4.0d0*nhepp)*ne - lambda_c = compt_c*T_cmb**4*ne*(t - T_cmb*(1.0d0+z))*(1.0d0 + z)**4 - - energy = (-lambda_ff -lambda_c) * heat_from_cgs/(1.0d0+z)**4 - ne = ne / nh - return - endif - - ! Temperature floor - if (logT .le. TCOOLMIN) logT = TCOOLMIN + 0.5d0*deltaT - - ! Interpolate rates - tmp = (logT-TCOOLMIN)/deltaT - j = int(tmp) - fhi = tmp - j - flo = 1.0d0 - fhi - j = j + 1 ! 
F90 arrays start with 1 - - ahp = flo*AlphaHp (j) + fhi*AlphaHp (j+1) - ahep = flo*AlphaHep (j) + fhi*AlphaHep (j+1) - ahepp = flo*AlphaHepp(j) + fhi*AlphaHepp(j+1) - ad = flo*Alphad (j) + fhi*Alphad (j+1) - geh0 = flo*GammaeH0 (j) + fhi*GammaeH0 (j+1) - gehe0 = flo*GammaeHe0(j) + fhi*GammaeHe0(j+1) - gehep = flo*GammaeHep(j) + fhi*GammaeHep(j+1) - bh0 = flo*BetaH0 (j) + fhi*BetaH0 (j+1) - bhe0 = flo*BetaHe0 (j) + fhi*BetaHe0 (j+1) - bhep = flo*BetaHep (j) + fhi*BetaHep (j+1) - bff1 = flo*Betaff1 (j) + fhi*Betaff1 (j+1) - bff4 = flo*Betaff4 (j) + fhi*Betaff4 (j+1) - rhp = flo*RecHp (j) + fhi*RecHp (j+1) - rhep = flo*RecHep (j) + fhi*RecHep (j+1) - rhepp = flo*RecHepp (j) + fhi*RecHepp (j+1) - - ! Cooling: - lambda = ( bh0*nh0 + bhe0*nhe0 + bhep*nhep + & - rhp*nhp + rhep*nhep + rhepp*nhepp + & - bff1*(nhp+nhep) + bff4*nhepp ) * ne - - lambda_c = compt_c*T_cmb**4*ne*(t - T_cmb*(1.0d0+z))*(1.0d0 + z)**4 ! Compton cooling - lambda = lambda + lambda_c - - ! Heating terms - heat = nh0*eh0 + nhe0*ehe0 + nhep*ehep - - ! Convert back to code units - ne = ne / nh - energy = (heat - lambda)*heat_from_cgs/(1.0d0+z)**4 - - end subroutine hc_rates - -end module heating_cooling_module diff --git a/Source/HeatCool/ext_src_hc_3d.f90 b/Source/HeatCool/ext_src_hc_3d.f90 index dcfae8fb..3089d87d 100644 --- a/Source/HeatCool/ext_src_hc_3d.f90 +++ b/Source/HeatCool/ext_src_hc_3d.f90 @@ -36,9 +36,9 @@ subroutine ext_src_hc(lo, hi, old_state, os_l1, os_l2, os_l3, os_h1, os_h2, os_h ! @todo ! use amrex_fort_module, only : rt => amrex_real + use amrex_error_module, only: amrex_abort use meth_params_module, only : NVAR, UEDEN, UEINT, heat_cool_type use fundamental_constants_module - use atomic_rates_module, only: interp_to_this_z implicit none @@ -87,13 +87,9 @@ subroutine ext_src_hc(lo, hi, old_state, os_l1, os_l2, os_l3, os_h1, os_h2, os_h ! both "old_state" is in fact the "old" state and ! 
"new_state" is in fact the "new" state - call interp_to_this_z(z) - half_dt = 0.5d0 * dt if (heat_cool_type .eq. 1) then - call integrate_state_hc(lo,hi,tmp_state,ns_l1,ns_l2,ns_l3,ns_h1,ns_h2,ns_h3, & - new_diag ,nd_l1,nd_l2,nd_l3,nd_h1,nd_h2,nd_h3, & - a,half_dt,min_iter,max_iter) + call amrex_abort("ERROR: heat_cool_type = 1 is not in function anymore.") else if (heat_cool_type .eq. 3) then call integrate_state_vode(lo,hi,tmp_state,ns_l1,ns_l2,ns_l3,ns_h1,ns_h2,ns_h3, & new_diag ,nd_l1,nd_l2,nd_l3,nd_h1,nd_h2,nd_h3, & @@ -102,6 +98,10 @@ subroutine ext_src_hc(lo, hi, old_state, os_l1, os_l2, os_l3, os_h1, os_h2, os_h call integrate_state_fcvode(lo,hi,tmp_state,ns_l1,ns_l2,ns_l3, ns_h1,ns_h2,ns_h3, & new_diag ,nd_l1,nd_l2,nd_l3, nd_h1,nd_h2,nd_h3, & a,half_dt,min_iter,max_iter) + else if (heat_cool_type .eq. 7) then + call integrate_state_fcvode_vec(lo,hi,tmp_state,ns_l1,ns_l2,ns_l3, ns_h1,ns_h2,ns_h3, & + new_diag ,nd_l1,nd_l2,nd_l3, nd_h1,nd_h2,nd_h3, & + a,half_dt,min_iter,max_iter) endif ! 
Recall that this routine is called from a tiled MFIter diff --git a/Source/HeatCool/f_rhs.f90 b/Source/HeatCool/f_rhs.f90 index f025a4e4..d1d41c8d 100644 --- a/Source/HeatCool/f_rhs.f90 +++ b/Source/HeatCool/f_rhs.f90 @@ -7,13 +7,13 @@ subroutine f_rhs(num_eq, time, e_in, energy, rpar, ipar) use eos_module, only: iterate_ne use atomic_rates_module, ONLY: TCOOLMIN, TCOOLMAX, NCOOLTAB, deltaT, & MPROTON, XHYDROGEN, & - AlphaHp, AlphaHep, AlphaHepp, Alphad, & - GammaeH0, GammaeHe0, GammaeHep, & + uvb_density_A, uvb_density_B, mean_rhob, & BetaH0, BetaHe0, BetaHep, Betaff1, Betaff4, & RecHp, RecHep, RecHepp, & eh0, ehe0, ehep - use vode_aux_module , only: z_vode, rho_vode, T_vode, ne_vode, i_vode, j_vode, k_vode + use vode_aux_module , only: z_vode, rho_vode, T_vode, ne_vode, & + JH_vode, JHe_vode, i_vode, j_vode, k_vode integer, intent(in) :: num_eq, ipar real(rt), intent(inout) :: e_in(num_eq) @@ -27,7 +27,7 @@ subroutine f_rhs(num_eq, time, e_in, energy, rpar, ipar) real(rt) :: ahp, ahep, ahepp, ad, geh0, gehe0, gehep real(rt) :: bh0, bhe0, bhep, bff1, bff4, rhp, rhep, rhepp real(rt) :: lambda_c, lambda_ff, lambda, heat - real(rt) :: rho, U, a + real(rt) :: rho, U, a, rho_heat real(rt) :: nh, nh0, nhp, nhe0, nhep, nhepp integer :: j @@ -46,7 +46,7 @@ subroutine f_rhs(num_eq, time, e_in, energy, rpar, ipar) end if ! Get gas temperature and individual ionization species - call iterate_ne(z_vode, U, T_vode, nh, ne_vode, nh0, nhp, nhe0, nhep, nhepp) + call iterate_ne(JH_vode, JHe_vode, z_vode, U, T_vode, nh, ne_vode, nh0, nhp, nhe0, nhep, nhepp) ! Convert species to CGS units: ne_vode = nh * ne_vode @@ -80,13 +80,6 @@ subroutine f_rhs(num_eq, time, e_in, energy, rpar, ipar) flo = 1.0d0 - fhi j = j + 1 ! 
F90 arrays start with 1 - ahp = flo*AlphaHp (j) + fhi*AlphaHp (j+1) - ahep = flo*AlphaHep (j) + fhi*AlphaHep (j+1) - ahepp = flo*AlphaHepp(j) + fhi*AlphaHepp(j+1) - ad = flo*Alphad (j) + fhi*Alphad (j+1) - geh0 = flo*GammaeH0 (j) + fhi*GammaeH0 (j+1) - gehe0 = flo*GammaeHe0(j) + fhi*GammaeHe0(j+1) - gehep = flo*GammaeHep(j) + fhi*GammaeHep(j+1) bh0 = flo*BetaH0 (j) + fhi*BetaH0 (j+1) bhe0 = flo*BetaHe0 (j) + fhi*BetaHe0 (j+1) bhep = flo*BetaHep (j) + fhi*BetaHep (j+1) @@ -105,7 +98,9 @@ subroutine f_rhs(num_eq, time, e_in, energy, rpar, ipar) lambda = lambda + lambda_c ! Heating terms - heat = nh0*eh0 + nhe0*ehe0 + nhep*ehep + heat = JH_vode*nh0*eh0 + JH_vode*nhe0*ehe0 + JHe_vode*nhep*ehep + rho_heat = uvb_density_A * (rho_vode/mean_rhob)**uvb_density_B + heat = rho_heat*heat ! Convert back to code units ne_vode = ne_vode / nh @@ -117,6 +112,131 @@ subroutine f_rhs(num_eq, time, e_in, energy, rpar, ipar) end subroutine f_rhs + +subroutine f_rhs_vec(time, e_in, energy) + + use amrex_fort_module, only : rt => amrex_real + use fundamental_constants_module, only: e_to_cgs, density_to_cgs, & + heat_from_cgs + use eos_module, only: iterate_ne_vec + use atomic_rates_module, ONLY: TCOOLMIN, TCOOLMAX, NCOOLTAB, deltaT, & + MPROTON, XHYDROGEN, & + BetaH0, BetaHe0, BetaHep, Betaff1, Betaff4, & + RecHp, RecHep, RecHepp, & + eh0, ehe0, ehep + + use vode_aux_module , only: T_vode_vec, ne_vode_vec, rho_vode_vec, z_vode + use misc_params, only: simd_width + + implicit none + + real(rt), intent(in ) :: time + real(rt), dimension(simd_width), intent(inout) :: e_in + real(rt), dimension(simd_width), intent( out) :: energy + + real(rt), parameter :: compt_c = 1.01765467d-37, T_cmb = 2.725d0 + + real(rt), dimension(simd_width) :: logT, tmp, fhi, flo + real(rt), dimension(simd_width) :: ahp, ahep, ahepp, ad, geh0, gehe0, gehep + real(rt), dimension(simd_width) :: bh0, bhe0, bhep, bff1, bff4, rhp, rhep, rhepp + real(rt), dimension(simd_width) :: lambda_c, lambda_ff, lambda, heat + 
real(rt), dimension(simd_width) :: rho, U + real(rt) :: a + real(rt), dimension(simd_width) :: nh, nh0, nhp, nhe0, nhep, nhepp + integer, dimension(simd_width) :: j + integer :: m + logical, dimension(simd_width) :: hot + + do m = 1, simd_width + if (e_in(m) .lt. 0.d0) then + e_in(m) = tiny(e_in(m)) + endif + end do + + ! Converts from code units to CGS + rho = rho_vode_vec(1:simd_width) * density_to_cgs * (1.0d0+z_vode)**3 + U = e_in * e_to_cgs + nh = rho*XHYDROGEN/MPROTON + + if (time .gt. 1) then + print *,'TIME INTO F_RHS ',time + call bl_pd_abort("TOO BIG TIME IN F_RHS") + end if + + ! Get gas temperature and individual ionization species + call iterate_ne_vec(z_vode, U, T_vode_vec, nh, ne_vode_vec, nh0, nhp, nhe0, nhep, nhepp, simd_width) + + ! Convert species to CGS units: + ne_vode_vec(1:simd_width) = nh * ne_vode_vec(1:simd_width) + nh0 = nh * nh0 + nhp = nh * nhp + nhe0 = nh * nhe0 + nhep = nh * nhep + nhepp = nh * nhepp + + logT = dlog10(T_vode_vec(1:simd_width)) + do m = 1, simd_width + if (logT(m) .ge. TCOOLMAX) then ! Only free-free and Compton cooling are relevant + lambda_ff(m) = 1.42d-27 * dsqrt(T_vode_vec(m)) * (1.1d0 + 0.34d0*dexp(-(5.5d0 - logT(m))**2 / 3.0d0)) & + * (nhp(m) + 4.0d0*nhepp(m))*ne_vode_vec(m) + lambda_c(m) = compt_c*T_cmb**4 * ne_vode_vec(m) * (T_vode_vec(m) - T_cmb*(1.0d0+z_vode))*(1.0d0 + z_vode)**4 + + energy(m) = (-lambda_ff(m) -lambda_c(m)) * heat_from_cgs/(1.0d0+z_vode)**4 + + ! Convert to the actual term to be used in e_out = e_in + dt*energy + energy(m) = energy(m) / rho_vode_vec(m) * (1.0d0+z_vode) + ne_vode_vec(m) = ne_vode_vec(m) / nh(m) + hot(m) = .true. + else + hot(m) = .false. + endif + end do + + do m = 1, simd_width + if (.not. hot(m)) then + ! Temperature floor + if (logT(m) .le. TCOOLMIN) logT(m) = TCOOLMIN + 0.5d0*deltaT + + ! Interpolate rates + tmp(m) = (logT(m)-TCOOLMIN)/deltaT + j(m) = int(tmp(m)) + fhi(m) = tmp(m) - j(m) + flo(m) = 1.0d0 - fhi(m) + j(m) = j(m) + 1 ! 
F90 arrays start with 1 + + bh0(m) = flo(m)*BetaH0 (j(m)) + fhi(m)*BetaH0 (j(m)+1) + bhe0(m) = flo(m)*BetaHe0 (j(m)) + fhi(m)*BetaHe0 (j(m)+1) + bhep(m) = flo(m)*BetaHep (j(m)) + fhi(m)*BetaHep (j(m)+1) + bff1(m) = flo(m)*Betaff1 (j(m)) + fhi(m)*Betaff1 (j(m)+1) + bff4(m) = flo(m)*Betaff4 (j(m)) + fhi(m)*Betaff4 (j(m)+1) + rhp(m) = flo(m)*RecHp (j(m)) + fhi(m)*RecHp (j(m)+1) + rhep(m) = flo(m)*RecHep (j(m)) + fhi(m)*RecHep (j(m)+1) + rhepp(m) = flo(m)*RecHepp (j(m)) + fhi(m)*RecHepp (j(m)+1) + + ! Cooling: + lambda(m) = ( bh0(m)*nh0(m) + bhe0(m)*nhe0(m) + bhep(m)*nhep(m) + & + rhp(m)*nhp(m) + rhep(m)*nhep(m) + rhepp(m)*nhepp(m) + & + bff1(m)*(nhp(m)+nhep(m)) + bff4(m)*nhepp(m) ) * ne_vode_vec(m) + + lambda_c(m) = compt_c*T_cmb**4*ne_vode_vec(m)*(T_vode_vec(m) - T_cmb*(1.0d0+z_vode))*(1.0d0 + z_vode)**4 ! Compton cooling + lambda(m) = lambda(m) + lambda_c(m) + + ! Heating terms + heat(m) = nh0(m)*eh0 + nhe0(m)*ehe0 + nhep(m)*ehep + + ! Convert back to code units + ne_vode_vec(m) = ne_vode_vec(m) / nh(m) + energy(m) = (heat(m) - lambda(m))*heat_from_cgs/(1.0d0+z_vode)**4 + + ! 
Convert to the actual term to be used in e_out = e_in + dt*energy + a = 1.d0 / (1.d0 + z_vode) + energy(m) = energy(m) / rho_vode_vec(m) / a + end if + end do + +end subroutine f_rhs_vec + + subroutine jac(neq, t, y, ml, mu, pd, nrpd, rpar, ipar) use amrex_fort_module, only : rt => amrex_real diff --git a/Source/HeatCool/fcvode_extras.f90 b/Source/HeatCool/fcvode_extras.f90 new file mode 100644 index 00000000..8f79183b --- /dev/null +++ b/Source/HeatCool/fcvode_extras.f90 @@ -0,0 +1,186 @@ +module fcvode_extras + + implicit none + + contains + + subroutine fcvode_wrapper(dt, rho_in, T_in, ne_in, e_in, neq, cvmem, & + sunvec_y, yvec, T_out, ne_out, e_out) + + use amrex_fort_module, only : rt => amrex_real + use vode_aux_module, only: rho_vode, T_vode, ne_vode, z_vode + use atomic_rates_module, only: this_z + use cvode_interface + use fnvector_serial + use eos_module, only: vode_rtol, vode_atol_scaled + use, intrinsic :: iso_c_binding + + implicit none + + real(rt), intent(in ) :: dt + real(rt), intent(in ) :: rho_in, T_in, ne_in, e_in + type(c_ptr), value :: cvmem + type(c_ptr), value :: sunvec_y + real(rt), intent( out) :: T_out,ne_out,e_out + + real(c_double) :: atol, rtol + real(c_double) :: time, tout + integer(c_long), intent(in) :: neq + real(c_double), pointer, intent(in) :: yvec(:) + + integer(c_int) :: ierr + + real(c_double) :: t_soln + + T_vode = T_in + ne_vode = ne_in + rho_vode = rho_in + + ! Initialize the integration time + time = 0.d0 + + ! We will integrate "e" in time. + yvec(1) = e_in + + ! Set the tolerances. 
+ atol = vode_atol_scaled * e_in + rtol = vode_rtol + + ierr = FCVodeReInit(cvmem, time, sunvec_y) + ierr = FCVodeSStolerances(CVmem, rtol, atol) + + ierr = FCVode(CVmem, dt, sunvec_y, time, CV_NORMAL) + + e_out = yvec(1) + T_out = T_vode + ne_out = ne_vode + + end subroutine fcvode_wrapper + + subroutine fcvode_wrapper_vec(dt, rho_in, T_in, ne_in, e_in, neq, cvmem, & + sunvec_y, yvec, T_out, ne_out, e_out) + + use amrex_fort_module, only : rt => amrex_real + use vode_aux_module, only: rho_vode_vec, T_vode_vec, ne_vode_vec + use cvode_interface + use fnvector_serial + use misc_params, only: simd_width + use eos_module, only: vode_rtol, vode_atol_scaled + use, intrinsic :: iso_c_binding + + implicit none + + real(rt), intent(in ) :: dt + real(rt), dimension(simd_width), intent(in ) :: rho_in, T_in, ne_in, e_in + type(c_ptr), value :: cvmem + type(c_ptr), value :: sunvec_y + real(rt), dimension(simd_width), intent( out) :: T_out,ne_out,e_out + + real(c_double) :: rtol + real(c_double), pointer, dimension(:) :: atol + real(c_double) :: time, tout + integer(c_long), intent(in) :: neq + real(c_double), pointer, intent(in) :: yvec(:) + type(c_ptr) :: sunvec_atol + + integer(c_int) :: ierr + + real(c_double) :: t_soln + + allocate(atol(simd_width)) + + sunvec_atol = N_VMake_Serial(neq, atol) + + T_vode_vec(1:simd_width) = T_in(1:simd_width) + ne_vode_vec(1:simd_width) = ne_in(1:simd_width) + rho_vode_vec(1:simd_width) = rho_in(1:simd_width) + + ! Initialize the integration time + time = 0.d0 + + ! We will integrate "e" in time. + yvec(1:simd_width) = e_in(1:simd_width) + + ! Set the tolerances. 
+ atol(1:simd_width) = vode_atol_scaled * e_in(1:simd_width) + rtol = vode_rtol + + ierr = FCVodeReInit(cvmem, time, sunvec_y) + ierr = FCVodeSVtolerances(CVmem, rtol, sunvec_atol) + + ierr = FCVode(CVmem, dt, sunvec_y, time, CV_NORMAL) + + e_out(1:simd_width) = yvec(1:simd_width) + T_out(1:simd_width) = T_vode_vec(1:simd_width) + ne_out(1:simd_width) = ne_vode_vec(1:simd_width) + + call N_VDestroy_Serial(sunvec_atol) + deallocate(atol) + + end subroutine fcvode_wrapper_vec + + integer(c_int) function RhsFn(tn, sunvec_y, sunvec_f, user_data) & + result(ierr) bind(C,name='RhsFn') + + use, intrinsic :: iso_c_binding + use fnvector_serial + use cvode_interface + implicit none + + real(c_double), value :: tn + type(c_ptr), value :: sunvec_y + type(c_ptr), value :: sunvec_f + type(c_ptr), value :: user_data + + ! pointers to data in SUNDAILS vectors + real(c_double), pointer :: yvec(:) + real(c_double), pointer :: fvec(:) + + real(c_double) :: energy + + integer(c_long), parameter :: neq = 1 + + ! get data arrays from SUNDIALS vectors + call N_VGetData_Serial(sunvec_y, neq, yvec) + call N_VGetData_Serial(sunvec_f, neq, fvec) + + call f_rhs(1, tn, yvec(1), energy, 0.0, 0) + + fvec(1) = energy + + ierr = 0 + end function RhsFn + + + integer(c_int) function RhsFn_vec(tn, sunvec_y, sunvec_f, user_data) & + result(ierr) bind(C,name='RhsFn_vec') + + use, intrinsic :: iso_c_binding + use fnvector_serial + use cvode_interface + use misc_params, only: simd_width + implicit none + + real(c_double), value :: tn + type(c_ptr), value :: sunvec_y, sunvec_f, user_data + + ! pointers to data in SUNDAILS vectors + real(c_double), dimension(:), pointer :: yvec, fvec + + integer(c_long) :: neq + real(c_double) :: energy(simd_width) + + neq = int(simd_width, c_long) + + ! 
get data arrays from SUNDIALS vectors + call N_VGetData_Serial(sunvec_y, neq, yvec) + call N_VGetData_Serial(sunvec_f, neq, fvec) + + call f_rhs_vec(tn, yvec, energy) + + fvec = energy + + ierr = 0 + end function RhsFn_vec + +end module fcvode_extras diff --git a/Source/HeatCool/heat_cool_stubs.f90 b/Source/HeatCool/heat_cool_stubs.f90 index 4ac41459..5307be1f 100644 --- a/Source/HeatCool/heat_cool_stubs.f90 +++ b/Source/HeatCool/heat_cool_stubs.f90 @@ -81,3 +81,10 @@ end subroutine adjust_heat_cool end module adjust_heat_cool_module +! unused VODE stubs if we are not doing heating/cooling +module vode_aux_module + use amrex_fort_module, only : rt => amrex_real + implicit none + + real(rt) :: z_vode +end module vode_aux_module diff --git a/Source/HeatCool/integrate_state_3d.f90 b/Source/HeatCool/integrate_state_3d.f90 index 3c0183a2..d463080c 100644 --- a/Source/HeatCool/integrate_state_3d.f90 +++ b/Source/HeatCool/integrate_state_3d.f90 @@ -33,7 +33,8 @@ subroutine integrate_state(lo, hi, & ! use amrex_fort_module, only : rt => amrex_real - use meth_params_module, only : NVAR, heat_cool_type + use amrex_error_module, only: amrex_abort + use meth_params_module, only : NVAR, NDIAG, heat_cool_type implicit none @@ -41,14 +42,12 @@ subroutine integrate_state(lo, hi, & integer , intent(in ) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3 integer , intent(in ) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3 real(rt), intent(inout) :: state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR) - real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2) + real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG) real(rt), intent(in ) :: dx(3), time, a, half_dt integer , intent(inout) :: min_iter, max_iter if (heat_cool_type .eq. 
1) then - call integrate_state_hc(lo, hi, state , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, & - diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, & - a, half_dt, min_iter, max_iter) + call amrex_abort("ERROR: heat_cool_type = 1 is not in function anymore.") else if (heat_cool_type .eq. 3) then call integrate_state_vode(lo, hi, state , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, & diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, & @@ -57,6 +56,10 @@ subroutine integrate_state(lo, hi, & call integrate_state_fcvode(lo, hi, state , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, & diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, & a, half_dt, min_iter, max_iter) + else if (heat_cool_type .eq. 7) then + call integrate_state_fcvode_vec(lo, hi, state , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, & + diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, & + a, half_dt, min_iter, max_iter) end if diff --git a/Exec/LyA/integrate_state_fcvode_3d.f90 b/Source/HeatCool/integrate_state_fcvode_3d.f90 similarity index 70% rename from Exec/LyA/integrate_state_fcvode_3d.f90 rename to Source/HeatCool/integrate_state_fcvode_3d.f90 index fb83df6d..33f1df51 100644 --- a/Exec/LyA/integrate_state_fcvode_3d.f90 +++ b/Source/HeatCool/integrate_state_fcvode_3d.f90 @@ -32,15 +32,19 @@ subroutine integrate_state_fcvode(lo, hi, & use amrex_fort_module, only : rt => amrex_real use amrex_error_module, only : amrex_abort use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, & - TEMP_COMP, NE_COMP, gamma_minus_1 + NDIAG, TEMP_COMP, NE_COMP, ZHI_COMP, & + gamma_minus_1 use bl_constants_module, only: M_PI use eos_params_module use network use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT use fundamental_constants_module use comoving_module, only: comoving_h, comoving_OmB - use atomic_rates_module, only: tabulate_rates, interp_to_this_z, YHELIUM - use vode_aux_module , only: z_vode, i_vode, j_vode, k_vode + use comoving_nd_module, only: fort_integrate_comoving_a + use atomic_rates_module, only: YHELIUM + use vode_aux_module , only: JH_vode, 
JHe_vode, z_vode, i_vode, j_vode, k_vode + use reion_aux_module , only: zhi_flash, zheii_flash, flash_h, flash_he, & + T_zhi, T_zheii, inhomogeneous_on use cvode_interface use fnvector_serial use fcvode_extras @@ -52,14 +56,16 @@ subroutine integrate_state_fcvode(lo, hi, & integer , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3 integer , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3 real(rt), intent(inout) :: state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR) - real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2) + real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG) real(rt), intent(in) :: a, half_dt integer , intent(inout) :: max_iter, min_iter integer :: i, j, k - real(rt) :: z, rho + real(rt) :: z, z_end, a_end, rho, H_reion_z, He_reion_z real(rt) :: T_orig, ne_orig, e_orig - real(rt) :: T_out , ne_out , e_out, mu, mean_rhob + real(rt) :: T_out , ne_out , e_out, mu, mean_rhob, T_H, T_He + real(rt) :: species(5) + integer(c_int) :: ierr ! error flag from C functions real(c_double) :: tstart ! initial time real(c_double) :: atol, rtol @@ -71,12 +77,25 @@ subroutine integrate_state_fcvode(lo, hi, & allocate(yvec(neq)) z = 1.d0/a - 1.d0 + call fort_integrate_comoving_a(a, a_end, half_dt) + z_end = 1.0d0/a_end - 1.0d0 - z_vode = z mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst) - ! Interpolate from the table to this redshift - call interp_to_this_z(z) + ! Flash reionization? + if ((flash_h .eqv. .true.) .and. (z .gt. zhi_flash)) then + JH_vode = 0 + else + JH_vode = 1 + endif + if ((flash_he .eqv. .true.) .and. (z .gt. zheii_flash)) then + JHe_vode = 0 + else + JHe_vode = 1 + endif + + if (flash_h ) H_reion_z = zhi_flash + if (flash_he) He_reion_z = zheii_flash ! Note that (lo,hi) define the region of the box containing the grow cells ! 
Do *not* assume this is just the valid region @@ -124,6 +143,15 @@ subroutine integrate_state_fcvode(lo, hi, & T_orig = diag_eos(i,j,k,TEMP_COMP) ne_orig = diag_eos(i,j,k, NE_COMP) + if (inhomogeneous_on) then + H_reion_z = diag_eos(i,j,k,ZHI_COMP) + if (z .gt. H_reion_z) then + JH_vode = 0 + else + JH_vode = 1 + endif + endif + if (e_orig .lt. 0.d0) then print *,'negative e entering strang integration ',z, i,j,k, rho/mean_rhob, e_orig call bl_abort('bad e in strang') @@ -146,12 +174,34 @@ subroutine integrate_state_fcvode(lo, hi, & ! call bl_abort('bad e out of strang') end if + ! Update T and ne (do not use stuff computed in f_rhs, per vode manual) + call nyx_eos_T_given_Re(JH_vode, JHe_vode, T_out, ne_out, rho, e_out, a, species) + + ! Flash heating in reionization: + T_H = 0.0d0 + if (inhomogeneous_on .or. flash_h) then + if ((H_reion_z .lt. z) .and. (H_reion_z .ge. z_end)) T_H = (1.0d0 - species(2))*T_zhi + endif + + T_He = 0.0d0 + if (flash_he) then + if ((He_reion_z .lt. z) .and. (He_reion_z .ge. z_end)) T_He = (1.0d0 - species(5))*T_zheii + endif + + if ((T_H .gt. 0.0d0) .or. (T_He .gt. 0.0d0)) then + T_out = T_orig + T_H + T_He + ne_out = 1.0d0 + YHELIUM + if (T_He .gt. 0.0d0) ne_out = ne_out + YHELIUM + mu = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out) + e_out = T_out / (gamma_minus_1 * mp_over_kB * mu) + call nyx_eos_T_given_Re(JH_vode, JHe_vode, T_out, ne_out, rho, e_out, a, species) + endif + ! Update (rho e) and (rho E) state(i,j,k,UEINT) = state(i,j,k,UEINT) + rho * (e_out-e_orig) state(i,j,k,UEDEN) = state(i,j,k,UEDEN) + rho * (e_out-e_orig) - ! Update T and ne (do not use stuff computed in f_rhs, per vode manual) - call nyx_eos_T_given_Re(T_out, ne_out, rho, e_out, a) + ! 
Update T and ne diag_eos(i,j,k,TEMP_COMP) = T_out diag_eos(i,j,k, NE_COMP) = ne_out diff --git a/Source/HeatCool/integrate_state_fcvode_3d_stubs.f90 b/Source/HeatCool/integrate_state_fcvode_3d_stubs.f90 index c705c3f3..97b6ab76 100644 --- a/Source/HeatCool/integrate_state_fcvode_3d_stubs.f90 +++ b/Source/HeatCool/integrate_state_fcvode_3d_stubs.f90 @@ -6,14 +6,14 @@ subroutine integrate_state_fcvode(lo, hi, & use amrex_error_module, only : amrex_abort use amrex_fort_module, only : rt => amrex_real use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, & - TEMP_COMP, NE_COMP, gamma_minus_1 + NDIAG, TEMP_COMP, NE_COMP, gamma_minus_1 use bl_constants_module, only: M_PI use eos_params_module use network use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT use fundamental_constants_module use comoving_module, only: comoving_h, comoving_OmB - use atomic_rates_module, only: tabulate_rates, interp_to_this_z, YHELIUM + use atomic_rates_module, only: YHELIUM use vode_aux_module , only: z_vode, i_vode, j_vode, k_vode, firstcall implicit none @@ -22,7 +22,7 @@ subroutine integrate_state_fcvode(lo, hi, & integer , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3 integer , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3 real(rt), intent(inout) :: state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR) - real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2) + real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG) real(rt), intent(in) :: a, half_dt integer , intent(inout) :: max_iter, min_iter diff --git a/Source/HeatCool/integrate_state_fcvode_vec_3d.f90 b/Source/HeatCool/integrate_state_fcvode_vec_3d.f90 new file mode 100644 index 00000000..a263c452 --- /dev/null +++ b/Source/HeatCool/integrate_state_fcvode_vec_3d.f90 @@ -0,0 +1,197 @@ +subroutine integrate_state_fcvode_vec(lo, hi, & + state , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, & + diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, & + a, half_dt, min_iter, max_iter) +! +! 
Calculates the sources to be added later on. +! +! Parameters +! ---------- +! lo : double array (3) +! The low corner of the current box. +! hi : double array (3) +! The high corner of the current box. +! state_* : double arrays +! The state vars +! diag_eos_* : double arrays +! Temp and Ne +! src_* : doubles arrays +! The source terms to be added to state (iterative approx.) +! double array (3) +! The low corner of the entire domain +! a : double +! The current a +! half_dt : double +! time step size, in Mpc km^-1 s ~ 10^12 yr. +! +! Returns +! ------- +! state : double array (dims) @todo +! The state vars +! + use amrex_fort_module, only : rt => amrex_real + use amrex_error_module, only : amrex_abort + use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, & + NDIAG, TEMP_COMP, NE_COMP, gamma_minus_1 + use bl_constants_module, only: M_PI + use eos_params_module + use network + use eos_module, only: nyx_eos_T_given_Re, nyx_eos_T_given_Re_vec, nyx_eos_given_RT + use fundamental_constants_module + use comoving_module, only: comoving_h, comoving_OmB + use atomic_rates_module, only: YHELIUM + use vode_aux_module , only: z_vode, i_vode, j_vode, k_vode + use cvode_interface + use fnvector_serial + use fcvode_extras + use misc_params, only: simd_width + use parallel, only : parallel_ioprocessor + use, intrinsic :: iso_c_binding + + implicit none + + integer , intent(in) :: lo(3), hi(3) + integer , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3 + integer , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3 + real(rt), intent(inout) :: state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR) + real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG) + real(rt), intent(in) :: a, half_dt + integer , intent(inout) :: max_iter, min_iter + + integer :: i, j, k, ii + real(rt) :: z + real(rt), dimension(simd_width) :: rho + real(rt), dimension(simd_width) :: T_orig, ne_orig, e_orig + real(rt), dimension(simd_width) :: T_out , ne_out , e_out, mu + real(rt) :: mean_rhob + 
integer(c_int) :: ierr ! error flag from C functions + real(c_double) :: tstart ! initial time + real(c_double) :: rtol + real(c_double), pointer, dimension(:) :: atol + type(c_ptr) :: sunvec_y ! sundials vector + type(c_ptr) :: CVmem ! CVODE memory + type(c_ptr) :: sunvec_atol + integer(c_long) :: neq + real(c_double), pointer :: yvec(:) + character(len=128) :: errmsg + + if (mod(hi(1)-lo(1)+1, simd_width) /= 0) then + if (parallel_ioprocessor()) then + !$omp single + write(errmsg, *) "simd_width does not divide evenly to tile x-length! lo(1) = ", & + lo(1), " hi(1) = ", hi(1), " simd_width = ", simd_width + call amrex_abort(errmsg) + !$omp end single + endif + end if + + neq = int(simd_width, c_long) + + allocate(yvec(neq)) + allocate(atol(neq)) + + z = 1.d0/a - 1.d0 + + z_vode = z + mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst) + + ! Note that (lo,hi) define the region of the box containing the grow cells + ! Do *not* assume this is just the valid region + ! apply heating-cooling to UEDEN and UEINT + + sunvec_y = N_VMake_Serial(NEQ, yvec) + if (.not. c_associated(sunvec_y)) then + call amrex_abort('integrate_state_fcvode_vec: sunvec_y = NULL') + end if + + sunvec_atol = N_VMake_Serial(NEQ, atol) + if (.not. c_associated(sunvec_atol)) then + call amrex_abort('integrate_state_fcvode_vec: sunvec_atol = NULL') + end if + + CVmem = FCVodeCreate(CV_BDF, CV_NEWTON) + if (.not. c_associated(CVmem)) then + call amrex_abort('integrate_state_fcvode_vec: CVmem = NULL') + end if + + tstart = 0.0 + ! CVodeMalloc allocates variables and initialize the solver. We can + ! initialize the solver with junk because once we enter the (i,j,k) loop we will + ! immediately call fcvreinit which reuses the same memory allocated from + ! CVodeMalloc but sets up new initial conditions. + ierr = FCVodeInit(CVmem, c_funloc(RhsFn_vec), tstart, sunvec_y) + if (ierr /= 0) then + call amrex_abort('integrate_state_fcvode_vec: FCVodeInit() failed') + end if + + ! 
Set dummy tolerances. These will be overwritten as soon as we enter the + ! loop and reinitialize the solver. + rtol = 1.0d-5 + atol(:) = 1.0d-10 + ierr = FCVodeSVtolerances(CVmem, rtol, sunvec_atol) + if (ierr /= 0) then + call amrex_abort('integrate_state_fcvode_vec: FCVodeSVtolerances() failed') + end if + + ierr = FCVDiag(CVmem) + if (ierr /= 0) then + call amrex_abort('integrate_state_fcvode_vec: FCVDiag() failed') + end if + + do k = lo(3),hi(3) + do j = lo(2),hi(2) + do i = lo(1),hi(1),simd_width + + ! Original values + rho = state(i:i+simd_width-1,j,k,URHO) + e_orig = state(i:i+simd_width-1,j,k,UEINT) / rho + T_orig = diag_eos(i:i+simd_width-1,j,k,TEMP_COMP) + ne_orig = diag_eos(i:i+simd_width-1,j,k, NE_COMP) + + do ii = 1, simd_width + if (e_orig(ii) .lt. 0.d0) then + print *,'negative e entering strang integration ',z, i+ii-1,j,k, rho(ii)/mean_rhob, e_orig(ii) + call bl_abort('bad e in strang') + end if + end do + + i_vode = i + j_vode = j + k_vode = k + + call fcvode_wrapper_vec(half_dt,rho,T_orig,ne_orig,e_orig,neq,CVmem,sunvec_y,yvec, & + T_out ,ne_out ,e_out) + + do ii = 1, simd_width + if (e_out(ii) .lt. 0.d0) then + print *,'negative e exiting strang integration ',z, i,j,k, rho(ii)/mean_rhob, e_out(ii) + T_out(ii) = 10.0 + ne_out(ii) = 0.0 + mu(ii) = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out(ii)) + e_out(ii) = T_out(ii) / (gamma_minus_1 * mp_over_kB * mu(ii)) + call flush(6) + ! call bl_abort('bad e out of strang') + end if + end do + + ! Update (rho e) and (rho E) + state(i:i+simd_width-1,j,k,UEINT) = state(i:i+simd_width-1,j,k,UEINT) + rho(1:simd_width) * (e_out(1:simd_width)-e_orig(1:simd_width)) + state(i:i+simd_width-1,j,k,UEDEN) = state(i:i+simd_width-1,j,k,UEDEN) + rho(1:simd_width) * (e_out(1:simd_width)-e_orig(1:simd_width)) + + ! 
Update T and ne (do not use stuff computed in f_rhs, per vode manual) + call nyx_eos_T_given_Re_vec(T_out(1:simd_width), ne_out(1:simd_width), rho(1:simd_width), e_out(1:simd_width), a, simd_width) + diag_eos(i:i+simd_width-1,j,k,TEMP_COMP) = T_out(1:simd_width) + diag_eos(i:i+simd_width-1,j,k, NE_COMP) = ne_out(1:simd_width) + + end do ! i + end do ! j + end do ! k + + call N_VDestroy_Serial(sunvec_atol) + call N_VDestroy_Serial(sunvec_y) + call FCVodeFree(cvmem) + + deallocate(yvec) + deallocate(atol) + +end subroutine integrate_state_fcvode_vec diff --git a/Source/HeatCool/integrate_state_fcvode_vec_3d_stubs.f90 b/Source/HeatCool/integrate_state_fcvode_vec_3d_stubs.f90 new file mode 100644 index 00000000..6ce893bf --- /dev/null +++ b/Source/HeatCool/integrate_state_fcvode_vec_3d_stubs.f90 @@ -0,0 +1,36 @@ +subroutine integrate_state_fcvode_vec(lo, hi, & + state , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, & + diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, & + a, half_dt, min_iter, max_iter) +! 
+ use amrex_error_module, only : amrex_abort + use amrex_fort_module, only : rt => amrex_real + use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, & + NDIAG, TEMP_COMP, NE_COMP, gamma_minus_1 + use bl_constants_module, only: M_PI + use eos_params_module + use network + use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT + use fundamental_constants_module + use comoving_module, only: comoving_h, comoving_OmB + use atomic_rates_module, only: YHELIUM + use vode_aux_module , only: z_vode, i_vode, j_vode, k_vode, firstcall + + implicit none + + integer , intent(in) :: lo(3), hi(3) + integer , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3 + integer , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3 + real(rt), intent(inout) :: state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR) + real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG) + real(rt), intent(in) :: a, half_dt + integer , intent(inout) :: max_iter, min_iter + + integer :: i, j, k + real(rt) :: z, rho + real(rt) :: T_orig, ne_orig, e_orig + real(rt) :: T_out , ne_out , e_out, mu, mean_rhob + + call amrex_abort("Cannot call fcvode without compiling with USE_CVODE=TRUE") + +end subroutine integrate_state_fcvode_vec diff --git a/Source/HeatCool/integrate_state_hc_3d.f90 b/Source/HeatCool/integrate_state_hc_3d.f90 deleted file mode 100644 index f2824a78..00000000 --- a/Source/HeatCool/integrate_state_hc_3d.f90 +++ /dev/null @@ -1,233 +0,0 @@ -subroutine integrate_state_hc(lo, hi, & - state , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, & - diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, & - a, half_dt, min_iter, max_iter) -! -! Calculates the sources to be added later on. -! -! Parameters -! ---------- -! lo : double array (3) -! The low corner of the current box. -! hi : double array (3) -! The high corner of the current box. -! state_* : double arrays -! The state vars -! diag_eos_* : double arrays -! Temp and Ne -! src_* : doubles arrays -! 
The source terms to be added to state (iterative approx.) -! double array (3) -! The low corner of the entire domain -! a : double -! The current a -! half_dt : double -! time step size, in Mpc km^-1 s ~ 10^12 yr. -! -! Returns -! ------- -! state : double array (dims) @todo -! The state vars -! - use amrex_fort_module, only : rt => amrex_real - use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, & - TEMP_COMP, NE_COMP, small_pres, gamma_minus_1 - use eos_params_module - use network - use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT - use fundamental_constants_module - use atomic_rates_module, only: tabulate_rates, interp_to_this_z - use heating_cooling_module, only: hc_rates - - implicit none - - integer , intent(in) :: lo(3), hi(3) - integer , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3 - integer , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3 - real(rt), intent(inout) :: state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR) - real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2) - real(rt), intent(in) :: a, half_dt - integer , intent(inout) :: max_iter, min_iter - - integer, parameter :: NITERS = 20 - real(rt), parameter :: xacc = 1.0d-3 - - integer :: i, j, k, n, iter, nsteps, cnt - real(rt) :: z, rho, T, ne - real(rt) :: T_orig, rho_e_orig, ne_orig, e_int_old, De_int - real(rt) :: T_first, ne_first, src_first - real(rt) :: src_old, src_new, delta_re, delta_t, rho_e, e_int, prev_soln - real(rt) :: b_fac - logical :: do_diag, prnt_cell, done_iter - logical :: went_negative, went_negative_at_first - - z = 1.d0/a - 1.d0 - do_diag = .false. - prnt_cell = .false. - - ! Interpolate from the table to this redshift - call interp_to_this_z(z) - - b_fac = 0.0d0 - max_iter = 0 - min_iter = NITERS+1 - - ! Note that (lo,hi) define the region of the box containing the grow cells - ! Do *not* assume this is just the valid region - ! apply heating-cooling to UEDEN and UEINT - - do k = lo(3),hi(3) - do j = lo(2),hi(2) - do i = lo(1),hi(1) - ! 
Original values - rho = state(i,j,k,URHO) - rho_e_orig = state(i,j,k,UEINT) - T_orig = diag_eos(i,j,k,TEMP_COMP) - ne_orig = diag_eos(i,j,k, NE_COMP) - - if (rho_e_orig .lt. 0.d0) then - print *,'(rho e) entering strang integration negative ',i,j,k, rho_e_orig - call bl_abort('bad rho e in strang') - end if - - e_int = rho_e_orig/rho - call hc_rates(z, rho, e_int, T, ne, src_new, prnt_cell) - T_first = T - ne_first = ne - src_first = src_new - - went_negative_at_first = .false. - - prev_soln = HUGE(prev_soln) - do iter = 1, NITERS ! max allowed iterations - - nsteps = 2**(iter-1) - delta_t = half_dt / nsteps - rho_e = rho_e_orig - e_int = rho_e/rho - - delta_re = 0.0d0 - src_old = 0.0d0 - - do n = 1, nsteps - - done_iter = .false. - e_int_old = e_int - e_int = rho_e/rho - - if (n.eq.1) then - T = T_first - ne = ne_first - src_new = src_first - else - call hc_rates(z, rho, e_int, T, ne, src_new, prnt_cell) - end if - - if ( (rho_e+delta_t*src_new/a) .gt. 0.0d0) then - - went_negative = .false. - - src_new = delta_t * src_new / a - rho_e = rho_e + src_new - - if (src_old*src_new .lt. 0.0d0) then ! src=0 in between - if (rho_e .le. 0.0d0) then - De_int = e_int/2.0d0 - e_int = e_int/2.0d0 - else - De_int = abs(e_int_old - e_int)/2.0d0 - e_int = e_int + sign(De_int, src_new) - endif - cnt = 0 - do - cnt = cnt + 1 - call hc_rates(z, rho, e_int, T, ne, src_new, prnt_cell) - if (abs(delta_t*src_new/a)/rho .lt. xacc) EXIT - if (cnt .gt. 40) then - print*, 'BISECTION problem in cell:',i,j,k,iter,n - call flush(6) - call bl_error("Problem in bisection in integrate_hc_3d.f90") - endif - De_int = De_int / 2.0d0 - e_int = e_int + sign(De_int, src_new) - enddo - - rho_e = e_int * rho - delta_re = rho_e - rho_e_orig - done_iter = .true. - b_fac = b_fac+1 ! just for diagnostics - EXIT - endif - - delta_re = delta_re + src_new ! Cumulative update - src_old = src_new - - ! Here we just leave rho_e alone and proceed to the next iter - else ! 
(rho_e + src_new) <= 0 - went_negative = .true. - went_negative_at_first = .true. - ! print *,'WENT NEGATIVE n, nsteps, iter ',n, nsteps, iter - ! print *,' at cell ',i,j,k - ! print *,' rho_e_orig ',rho_e_orig - ! print *,' src ',src_new - ! print *,' dt/a ',delta_t/a - ! print *,' src*dt/a ',src_new*delta_t/a - ! print *,' ' - ! call flush(6) - exit ! Exit the n loop to go to higher nsteps - end if - - enddo ! n loop - - if (.not. went_negative) then - if (abs(1.0d0-rho_e/prev_soln) .lt. xacc .or. done_iter) EXIT - end if - - if (iter .ge. NITERS-2) then - print*, 'INTEGRATE_HC ITERATIONS:', i,j,k, iter, rho_e_orig, rho_e, (1.0d0-rho_e/prev_soln) - call flush(6) - endif - - if (iter .eq. NITERS) then - print*, 'MAXITER too small!', i,j,k, rho, T_orig - call bl_abort('too small MAXITER') - endif - - if (.not. went_negative) & - prev_soln = rho_e - enddo ! iter loop - - ! Update cell quantities - state(i,j,k,UEINT) = state(i,j,k,UEINT) + delta_re - state(i,j,k,UEDEN) = state(i,j,k,UEDEN) + delta_re - diag_eos(i,j,k,TEMP_COMP) = T - diag_eos(i,j,k, NE_COMP) = ne - - if (state(i,j,k,UEINT) .lt. 0.d0) then - print *,'(rho e) exiting strang integration negative ',i,j,k, rho, rho_e_orig/rho, state(i,j,k,UEINT)/rho - call bl_abort('negative rho e exiting strang') - end if - - if (state(i,j,k,UEINT) .lt. small_pres/gamma_minus_1) then - print *,'!!!!! Pressure and (rho e) are too small coming out of integrate !!!!!' - print *,'!!!!! (i,j,k) !!!!! ' ,i,j,k - print *,'!!!!! pressure ',state(i,j,k,UEINT) * gamma_minus_1 - call flush(6) - end if - - if (max_iter .le. iter) max_iter = iter - if (min_iter .ge. iter) min_iter = iter - - ! if (went_negative_at_first) print *,'MAX NSTEPS OF NEG AT ',i,j,k,nsteps - - end do ! i - end do ! j - end do ! 
k - - if (do_diag) then - print*, 'HC_ITERATIONS: ', z, max_iter, min_iter, b_fac/((hi(3)-lo(3))*(hi(2)-lo(2))*(hi(1)-lo(1))) - call flush(6) - endif - -end subroutine integrate_state_hc - diff --git a/Source/HeatCool/integrate_state_vode_3d.f90 b/Source/HeatCool/integrate_state_vode_3d.f90 index 7c9a6b5f..82453734 100644 --- a/Source/HeatCool/integrate_state_vode_3d.f90 +++ b/Source/HeatCool/integrate_state_vode_3d.f90 @@ -31,13 +31,18 @@ subroutine integrate_state_vode(lo, hi, & ! use amrex_fort_module, only : rt => amrex_real use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, & - TEMP_COMP, NE_COMP, gamma_minus_1 + NDIAG, TEMP_COMP, NE_COMP, ZHI_COMP, gamma_minus_1 + use bl_constants_module, only: M_PI use eos_params_module use network - use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT + use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT, iterate_ne use fundamental_constants_module - use atomic_rates_module, only: tabulate_rates, interp_to_this_z - use vode_aux_module , only: z_vode, i_vode, j_vode, k_vode, T_vode + use comoving_module, only: comoving_h, comoving_OmB + use comoving_nd_module, only: fort_integrate_comoving_a + use atomic_rates_module, only: YHELIUM + use vode_aux_module , only: JH_vode, JHe_vode, z_vode, i_vode, j_vode, k_vode + use reion_aux_module , only: zhi_flash, zheii_flash, flash_h, flash_he, & + T_zhi, T_zheii, inhomogeneous_on implicit none @@ -45,21 +50,36 @@ subroutine integrate_state_vode(lo, hi, & integer , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3 integer , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3 real(rt), intent(inout) :: state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR) - real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2) + real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG) real(rt), intent(in) :: a, half_dt integer , intent(inout) :: max_iter, min_iter integer :: i, j, k - real(rt) :: z, rho + real(rt) :: z, z_end, a_end, rho, H_reion_z, He_reion_z 
real(rt) :: T_orig, ne_orig, e_orig - real(rt) :: T_out , ne_out , e_out + real(rt) :: T_out , ne_out , e_out, mu, mean_rhob, T_H, T_He + real(rt) :: species(5) z = 1.d0/a - 1.d0 + call fort_integrate_comoving_a(a, a_end, half_dt) + z_end = 1.0d0/a_end - 1.0d0 - z_vode = z + mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst) - ! Interpolate from the table to this redshift - call interp_to_this_z(z) + ! Flash reionization? + if ((flash_h .eqv. .true.) .and. (z .gt. zhi_flash)) then + JH_vode = 0 + else + JH_vode = 1 + endif + if ((flash_he .eqv. .true.) .and. (z .gt. zheii_flash)) then + JHe_vode = 0 + else + JHe_vode = 1 + endif + + if (flash_h ) H_reion_z = zhi_flash + if (flash_he) He_reion_z = zheii_flash ! Note that (lo,hi) define the region of the box containing the grow cells ! Do *not* assume this is just the valid region @@ -75,8 +95,19 @@ subroutine integrate_state_vode(lo, hi, & T_orig = diag_eos(i,j,k,TEMP_COMP) ne_orig = diag_eos(i,j,k, NE_COMP) + if (inhomogeneous_on) then + H_reion_z = diag_eos(i,j,k,ZHI_COMP) + if (z .gt. H_reion_z) then + JH_vode = 0 + else + JH_vode = 1 + endif + endif + if (e_orig .lt. 0.d0) then - print *,'negative e entering strang integration ',i,j,k, e_orig + print *,'negative e entering strang integration ', z, i,j,k, e_orig + print *, 'state(i,j,k,UEINT) = ', state(i,j,k,UEINT) + print *, 'rho / mean_rhob = ', rho / mean_rhob call bl_abort('bad e in strang') end if @@ -88,16 +119,43 @@ subroutine integrate_state_vode(lo, hi, & T_out ,ne_out ,e_out) if (e_out .lt. 0.d0) then - print *,'negative e entering strang integration ',i,j,k, e_out - call bl_abort('bad e out of strang') + print *,'negative e exiting strang integration ', z, i,j,k, e_out + T_out = 10.0 + ne_out = 0.0 + mu = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out) + e_out = T_out / (gamma_minus_1 * mp_over_kB * mu) + call flush(6) + !call bl_abort('bad e out of strang') end if + ! 
Update T and ne (do not use stuff computed in f_rhs, per vode manual) + call nyx_eos_T_given_Re(JH_vode, JHe_vode, T_out, ne_out, rho, e_out, a, species) + + ! Flash heating in reionization: + T_H = 0.0d0 + if (inhomogeneous_on .or. flash_h) then + if ((H_reion_z .lt. z) .and. (H_reion_z .ge. z_end)) T_H = (1.0d0 - species(2))*T_zhi + endif + + T_He = 0.0d0 + if (flash_he) then + if ((He_reion_z .lt. z) .and. (He_reion_z .ge. z_end)) T_He = (1.0d0 - species(5))*T_zheii + endif + + if ((T_H .gt. 0.0d0) .or. (T_He .gt. 0.0d0)) then + T_out = T_orig + T_H + T_He + ne_out = 1.0d0 + YHELIUM + if (T_He .gt. 0.0d0) ne_out = ne_out + YHELIUM + mu = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out) + e_out = T_out / (gamma_minus_1 * mp_over_kB * mu) + call nyx_eos_T_given_Re(JH_vode, JHe_vode, T_out, ne_out, rho, e_out, a, species) + endif + ! Update (rho e) and (rho E) state(i,j,k,UEINT) = state(i,j,k,UEINT) + rho * (e_out-e_orig) state(i,j,k,UEDEN) = state(i,j,k,UEDEN) + rho * (e_out-e_orig) - ! Update T and ne (do not use stuff computed in f_rhs, per vode manual) - call nyx_eos_T_given_Re(T_out, ne_out, rho, e_out, a) + ! Update T and ne diag_eos(i,j,k,TEMP_COMP) = T_out diag_eos(i,j,k, NE_COMP) = ne_out @@ -107,8 +165,10 @@ subroutine integrate_state_vode(lo, hi, & end subroutine integrate_state_vode + subroutine vode_wrapper(dt, rho_in, T_in, ne_in, e_in, T_out, ne_out, e_out) + use amrex_fort_module, only : rt => amrex_real use vode_aux_module, only: rho_vode, T_vode, ne_vode, & i_vode, j_vode, k_vode @@ -188,7 +248,7 @@ subroutine vode_wrapper(dt, rho_in, T_in, ne_in, e_in, T_out, ne_out, e_out) iwork(:) = 0 ! Set the maximum number of steps allowed (the VODE default is 500) - iwork(6) = 1000 + iwork(6) = 2000 ! 
Initialize the integration time time = 0.d0 diff --git a/Source/HeatCool/vode_aux.f90 b/Source/HeatCool/vode_aux.f90 index 0ac68646..0696c38b 100644 --- a/Source/HeatCool/vode_aux.f90 +++ b/Source/HeatCool/vode_aux.f90 @@ -8,8 +8,9 @@ module vode_aux_module real(rt), save :: z_vode real(rt), save :: rho_vode, T_vode, ne_vode - integer , save :: i_vode, j_vode, k_vode + real(rt), dimension(:), allocatable, save :: rho_vode_vec, T_vode_vec, ne_vode_vec + integer , save :: JH_vode, JHe_vode, i_vode, j_vode, k_vode logical, save :: firstcall - !$OMP THREADPRIVATE (rho_vode, T_vode, ne_vode, i_vode, j_vode, k_vode, firstcall) + !$OMP THREADPRIVATE (rho_vode, rho_vode_vec, T_vode, T_vode_vec, ne_vode, ne_vode_vec, JH_vode, JHe_vode, i_vode, j_vode, k_vode, firstcall) end module vode_aux_module diff --git a/Source/Initialization/Make.package b/Source/Initialization/Make.package index f9337fa1..194cbb2b 100644 --- a/Source/Initialization/Make.package +++ b/Source/Initialization/Make.package @@ -6,3 +6,6 @@ CEXE_sources += Nyx_setup.cpp CEXE_sources += Nyx_initdata.cpp CEXE_sources += Nyx_initcosmo.cpp CEXE_sources += read_plotfile.cpp +ifeq ($(USE_CVODE), TRUE) + f90EXE_sources += cvode_simd.f90 +endif diff --git a/Source/Initialization/Nyx_initcosmo.cpp b/Source/Initialization/Nyx_initcosmo.cpp index 8dc1b802..244771f5 100644 --- a/Source/Initialization/Nyx_initcosmo.cpp +++ b/Source/Initialization/Nyx_initcosmo.cpp @@ -124,8 +124,8 @@ void Nyx::initcosmo() MultiFab& S_new = get_level(level).get_new_data(State_Type); MultiFab& D_new = get_level(level).get_new_data(DiagEOS_Type); - FillCoarsePatch(S_new, 0, 0, State_Type, 0, NUM_STATE); - FillCoarsePatch(D_new, 0, 0, DiagEOS_Type, 0, 2); + FillCoarsePatch(S_new, 0, 0, State_Type, 0, S_new.nComp()); + FillCoarsePatch(D_new, 0, 0, DiagEOS_Type, 0, D_new.nComp()); return; } @@ -378,8 +378,8 @@ void Nyx::initcosmo() //seems to have no effect... 
if (level > 0) { - FillCoarsePatch(S_new, 0, 0, State_Type, 0, NUM_STATE); - FillCoarsePatch(D_new, 0, 0, DiagEOS_Type, 0, 2); + FillCoarsePatch(S_new, 0, 0, State_Type, 0, S_new.nComp()); + FillCoarsePatch(D_new, 0, 0, DiagEOS_Type, 0, D_new.nComp()); } //copy density @@ -421,6 +421,8 @@ void Nyx::initcosmo() D_new.setVal(tempInit, Temp_comp); D_new.setVal(0.0, Ne_comp); + if (inhomo_reion > 0) + D_new.setVal(0.0, Zhi_comp); #ifdef _OPENMP #pragma omp parallel diff --git a/Source/Initialization/Nyx_initdata.cpp b/Source/Initialization/Nyx_initdata.cpp index 3147be56..4761a3d9 100644 --- a/Source/Initialization/Nyx_initdata.cpp +++ b/Source/Initialization/Nyx_initdata.cpp @@ -83,10 +83,11 @@ Nyx::read_init_params () // Input error check if (!binary_particle_file.empty() && (particle_init_type != "BinaryFile" && - particle_init_type != "BinaryMetaFile")) + particle_init_type != "BinaryMetaFile" && + particle_init_type != "BinaryMortonFile")) { if (ParallelDescriptor::IOProcessor()) - std::cerr << "ERROR::particle_init_type is not BinaryFile or BinaryMetaFile but you specified binary_particle_file" << std::endl; + std::cerr << "ERROR::particle_init_type is not BinaryFile, BinaryMetaFile, or BinaryMortonFile but you specified binary_particle_file" << std::endl; amrex::Error(); } @@ -109,6 +110,21 @@ Nyx::read_init_params () amrex::Error(); } #endif + +#ifdef HEATCOOL + Real eos_nr_eps = 1.0e-6; + Real vode_rtol = 1.0e-4; + Real vode_atol_scaled = 1.0e-4; + + // Tolerance for Newton-Raphson iteration of iterate_ne() in the EOS + pp.query("eos_nr_eps", eos_nr_eps); + // Relative tolerance of VODE integration + pp.query("vode_rtol", vode_rtol); + // Absolute tolerance of VODE integration (scaled by initial value of ODE) + pp.query("vode_atol_scaled", vode_atol_scaled); + + fort_setup_eos_params(&eos_nr_eps, &vode_rtol, &vode_atol_scaled); +#endif } void diff --git a/Source/Initialization/Nyx_setup.cpp b/Source/Initialization/Nyx_setup.cpp index 555b0011..e689ac60 100644 
--- a/Source/Initialization/Nyx_setup.cpp +++ b/Source/Initialization/Nyx_setup.cpp @@ -197,8 +197,16 @@ Nyx::hydro_setup() cnt += NumAdv; } + int NDIAG_C; Temp_comp = 0; Ne_comp = 1; + if (inhomo_reion > 0) + { + NDIAG_C = 3; + Zhi_comp = 2; + } else { + NDIAG_C = 2; + } int dm = BL_SPACEDIM; @@ -228,12 +236,18 @@ Nyx::hydro_setup() // Define NUM_GROW from the f90 module. fort_get_method_params(&NUM_GROW); + // Note that we must set NDIAG_C before we call set_method_params because + // we use the C++ value to set the Fortran value fort_set_method_params - (dm, NumAdv, do_hydro, ppm_type, ppm_reference, + (dm, NumAdv, NDIAG_C, do_hydro, ppm_type, ppm_reference, ppm_flatten_before_integrals, use_colglaz, use_flattening, corner_coupling, version_2, use_const_species, gamma, normalize_species, - heat_cool_type, ParallelDescriptor::Communicator()); + heat_cool_type, inhomo_reion); + +#ifdef HEATCOOL + fort_tabulate_rates(); +#endif if (use_const_species == 1) fort_set_eos_params(h_species, he_species); @@ -258,7 +272,7 @@ Nyx::hydro_setup() // This has two components: Temperature and Ne desc_lst.addDescriptor(DiagEOS_Type, IndexType::TheCellType(), - StateDescriptor::Point, 1, 2, interp, + StateDescriptor::Point, 1, NDIAG_C, interp, state_data_extrap, store_in_checkpoint); #ifdef GRAVITY @@ -442,17 +456,6 @@ Nyx::hydro_setup() //derive_lst.addComponent("rhog",desc_lst,Gravity_Type,0,BL_SPACEDIM); #endif - // - // Entropy (S) - // - derive_lst.add("entropy", IndexType::TheCellType(), 1, - BL_FORT_PROC_CALL(DERENTROPY, derentropy), - the_same_box); - // We add exactly (Density,Xmom,Ymom,Zmom,Eden,Eint) from State and - // (Temp ,Ne) from Diag_EOS - derive_lst.addComponent("entropy", desc_lst, State_Type, Density, 6); - derive_lst.addComponent("entropy", desc_lst, DiagEOS_Type, 0, 2); - // // Div(u) // @@ -652,15 +655,19 @@ Nyx::no_hydro_setup() Density = 0; NUM_STATE = 1; + int NDIAG_C = -1; + // Define NUM_GROW from the f90 module. 
fort_get_method_params(&NUM_GROW); fort_set_method_params - (dm, NumAdv, do_hydro, ppm_type, ppm_reference, + (dm, NumAdv, NDIAG_C, do_hydro, ppm_type, ppm_reference, ppm_flatten_before_integrals, use_colglaz, use_flattening, corner_coupling, version_2, use_const_species, gamma, normalize_species, - heat_cool_type, ParallelDescriptor::Communicator()); + heat_cool_type, inhomo_reion); + + fort_tabulate_rates(); int coord_type = Geometry::Coord(); fort_set_problem_params(dm, phys_bc.lo(), phys_bc.hi(), Outflow, Symmetry, coord_type); @@ -766,3 +773,22 @@ Nyx::no_hydro_setup() } #endif +#ifdef USE_CVODE +void +Nyx::set_simd_width(const int simd_width) +{ + set_simd(&simd_width); +} + +void +Nyx::alloc_simd_vec() +{ + fort_alloc_simd_vec(); +} + +void +Nyx::dealloc_simd_vec() +{ + fort_dealloc_simd_vec(); +} +#endif diff --git a/Source/Initialization/check_initial_species_3d.f90 b/Source/Initialization/check_initial_species_3d.f90 index 70469ff0..997865d3 100644 --- a/Source/Initialization/check_initial_species_3d.f90 +++ b/Source/Initialization/check_initial_species_3d.f90 @@ -7,6 +7,7 @@ subroutine fort_check_initial_species(lo,hi,& use eos_params_module use amrex_fort_module, only : rt => amrex_real + use amrex_error_module, only : amrex_abort implicit none integer :: lo(3), hi(3) @@ -16,6 +17,7 @@ subroutine fort_check_initial_species(lo,hi,& ! Local variables integer :: i,j,k,n real(rt) :: sum + character(len=256) :: errmsg_pt1, errmsg_pt2 if (UFS .gt. 0) then @@ -30,13 +32,11 @@ subroutine fort_check_initial_species(lo,hi,& end do if (abs(state(i,j,k,URHO)-sum).gt. 1.d-8 * state(i,j,k,URHO)) then - ! - ! A critical region since we usually can't write from threads. - ! 
- !$OMP CRITICAL - print *,'Sum of (rho X)_i vs rho at (i,j,k): ',i,j,k,sum,state(i,j,k,URHO) - call bl_error("Error:: Failed check of initial species summing to 1") - !$OMP END CRITICAL + write(errmsg_pt1, *) 'Sum of (rho X)_i vs rho at (i,j,k): ', & + i,j,k,sum,state(i,j,k,URHO) + write(errmsg_pt2, *) trim(errmsg_pt1) // new_line('a') // & + 'Failed check of initial species summing to 1' + call amrex_abort(errmsg_pt2) end if enddo diff --git a/Source/Initialization/cvode_simd.f90 b/Source/Initialization/cvode_simd.f90 new file mode 100644 index 00000000..fc5578b4 --- /dev/null +++ b/Source/Initialization/cvode_simd.f90 @@ -0,0 +1,44 @@ +subroutine set_simd (simd_width_in) bind(C, name='set_simd') + + use misc_params, only: simd_width + implicit none + + integer, intent(in) :: simd_width_in + + simd_width = simd_width_in + +end subroutine set_simd + +subroutine fort_alloc_simd_vec() bind(C, name='fort_alloc_simd_vec') + use misc_params, only: simd_width + use vode_aux_module, only: T_vode_vec, ne_vode_vec, rho_vode_vec + use amrex_error_module, only: amrex_abort + implicit none + + !$omp parallel + if (allocated(T_vode_vec) .or. allocated(ne_vode_vec) .or. allocated(rho_vode_vec)) then + !$omp single + call amrex_abort("Why are VODE SIMD vectors already allocated??") + !$omp end single + end if + + allocate(T_vode_vec(simd_width), ne_vode_vec(simd_width), rho_vode_vec(simd_width)) + !$omp end parallel +end subroutine fort_alloc_simd_vec + + +subroutine fort_dealloc_simd_vec() bind(C, name='fort_dealloc_simd_vec') + use vode_aux_module, only: T_vode_vec, ne_vode_vec, rho_vode_vec + use amrex_error_module, only: amrex_abort + implicit none + + !$omp parallel + if (.not. (allocated(T_vode_vec) .and. allocated(ne_vode_vec) .and. 
allocated(rho_vode_vec))) then + !$omp single + call amrex_abort("Why are VODE SIMD vectors already deallocated??") + !$omp end single + end if + + deallocate(T_vode_vec, ne_vode_vec, rho_vode_vec) + !$omp end parallel +end subroutine fort_dealloc_simd_vec diff --git a/Source/Make.package b/Source/Make.package index 2d818e42..22c32a48 100644 --- a/Source/Make.package +++ b/Source/Make.package @@ -35,6 +35,7 @@ f90EXE_headers += dm_F.H f90EXE_sources += Nyx_nd.f90 f90EXE_sources += eos_params.f90 f90EXE_sources += meth_params.f90 +f90EXE_sources += misc_params.f90 f90EXE_sources += prob_params.f90 f90EXE_sources += comoving_params.f90 f90EXE_sources += comoving_nd.f90 diff --git a/Source/Nyx.H b/Source/Nyx.H index d7f0c8a2..976c6c56 100644 --- a/Source/Nyx.H +++ b/Source/Nyx.H @@ -476,7 +476,7 @@ public: void compute_new_temp(); - void compute_rho_temp(amrex::Real& rho_T_avg, amrex::Real& T_avg, amrex::Real& T_meanrho); + void compute_rho_temp(amrex::Real& rho_T_avg, amrex::Real& T_avg, amrex::Real& Tinv_avg, amrex::Real& T_meanrho); void get_old_source(amrex::Real old_time, amrex::Real dt, amrex::MultiFab& Rhs); void get_new_source(amrex::Real old_time, amrex::Real new_time, amrex::Real dt, amrex::MultiFab& Rhs); @@ -520,7 +520,8 @@ public: static int NUM_STATE; static int Density, Xmom, Ymom, Zmom, Eden, Eint; - static int Temp_comp, Ne_comp; + + static int Temp_comp, Ne_comp, Zhi_comp; static int FirstSpec, FirstAux, FirstAdv; static int NumSpec, NumAux, NumAdv; @@ -552,6 +553,10 @@ public: static void InitErrorList(); static void InitDeriveList(); + static void set_simd_width(const int simd_width); + static void alloc_simd_vec(); + static void dealloc_simd_vec(); + protected: // @@ -648,6 +653,7 @@ protected: static long particle_initrandom_count_per_box; static amrex::Real particle_initrandom_mass; static int particle_initrandom_iseed; + static int particle_skip_factor; static amrex::IntVect Nrep; // how many times the initial conditions are replicated in each 
direction @@ -668,6 +674,9 @@ protected: // specifies the heating/cooling source term static int heat_cool_type; + // specifies inhomogeneous reionization type + static int inhomo_reion; + // permits forcing to be switched on and off static int do_forcing; @@ -711,7 +720,6 @@ protected: static amrex::Real getCPUTime(); - }; // time step interval for finding halos diff --git a/Source/Nyx.cpp b/Source/Nyx.cpp index ccf5057c..a94cfc35 100644 --- a/Source/Nyx.cpp +++ b/Source/Nyx.cpp @@ -21,6 +21,7 @@ using std::string; #include #include #include +#include #if BL_USE_MPI #include "MemInfo.H" @@ -63,6 +64,8 @@ static Real fixed_dt = -1.0; static Real initial_dt = -1.0; static Real dt_cutoff = 0; +int simd_width = 1; + int Nyx::strict_subcycling = 0; Real Nyx::old_a = -1.0; @@ -83,7 +86,7 @@ Real Nyx::change_max = 1.1; BCRec Nyx::phys_bc; int Nyx::do_reflux = 1; int Nyx::NUM_STATE = -1; -int Nyx::NUM_GROW = -1; +int Nyx::NUM_GROW = -1; int Nyx::nsteps_from_plotfile = -1; @@ -98,6 +101,7 @@ int Nyx::Zmom = -1; int Nyx::Temp_comp = -1; int Nyx:: Ne_comp = -1; +int Nyx:: Zhi_comp = -1; int Nyx::NumSpec = 0; int Nyx::NumAux = 0; @@ -118,6 +122,7 @@ Real Nyx::comoving_h; int Nyx::do_hydro = -1; int Nyx::add_ext_src = 0; int Nyx::heat_cool_type = 0; +int Nyx::inhomo_reion = 0; int Nyx::strang_split = 0; Real Nyx::average_gas_density = 0; @@ -253,6 +258,16 @@ Nyx::read_params () pp.query("strict_subcycling",strict_subcycling); +#ifdef USE_CVODE + pp.query("simd_width", simd_width); + if (simd_width < 1) amrex::Abort("simd_width must be a positive integer"); + set_simd_width(simd_width); + + if (verbose > 1) amrex::Print() + << "SIMD width (# zones) for heating/cooling integration: " + << simd_width << std::endl; +#endif + // Get boundary conditions Array lo_bc(BL_SPACEDIM), hi_bc(BL_SPACEDIM); pp.getarr("lo_bc", lo_bc, 0, BL_SPACEDIM); @@ -354,14 +369,32 @@ Nyx::read_params () #endif pp.query("heat_cool_type", heat_cool_type); + if (heat_cool_type == 7) + { + amrex::Print() << 
"----- WARNING WARNING WARNING WARNING WARNING -----" << std::endl; + amrex::Print() << " " << std::endl; + amrex::Print() << " SIMD CVODE is currently EXPERIMENTAL. " << std::endl; + amrex::Print() << " Use at your own risk. " << std::endl; + amrex::Print() << " " << std::endl; + amrex::Print() << "----- WARNING WARNING WARNING WARNING WARNING -----" << std::endl; + Array n_cell(BL_SPACEDIM); + ParmParse pp("amr"); + pp.getarr("n_cell", n_cell, 0, BL_SPACEDIM); + if (n_cell[0] % simd_width) { + const std::string errmsg = "Currently the SIMD CVODE solver requires that n_cell[0] % simd_width = 0"; + amrex::Abort(errmsg); + } + } pp.query("use_exact_gravity", use_exact_gravity); + pp.query("inhomo_reion", inhomo_reion); + #ifdef HEATCOOL if (heat_cool_type > 0 && add_ext_src == 0) amrex::Error("Nyx::must set add_ext_src to 1 if heat_cool_type > 0"); - if (heat_cool_type != 1 && heat_cool_type != 3 && heat_cool_type != 5) - amrex::Error("Nyx:: nonzero heat_cool_type must equal 1 or 3 or 5"); + if (heat_cool_type != 1 && heat_cool_type != 3 && heat_cool_type != 5 && heat_cool_type != 7) + amrex::Error("Nyx:: nonzero heat_cool_type must equal 1 or 3 or 5 or 7"); if (heat_cool_type == 0) amrex::Error("Nyx::contradiction -- HEATCOOL is defined but heat_cool_type == 0"); @@ -377,18 +410,23 @@ Nyx::read_params () case 5: std::cout << "CVODE"; break; + case 7: + std::cout << "SIMD CVODE"; + break; } std::cout << std::endl; } #ifndef USE_CVODE - if (heat_cool_type == 5) - amrex::Error("Nyx:: cannot set heat_cool_type = 5 unless USE_CVODE=TRUE"); + if (heat_cool_type == 5 || heat_cool_type == 7) + amrex::Error("Nyx:: cannot set heat_cool_type = 5 or 7 unless USE_CVODE=TRUE"); #endif #else if (heat_cool_type > 0) amrex::Error("Nyx::you set heat_cool_type > 0 but forgot to set USE_HEATCOOL = TRUE"); + if (inhomo_reion > 0) + amrex::Error("Nyx::you set inhomo_reion > 0 but forgot to set USE_HEATCOOL = TRUE"); #endif pp.query("allow_untagging", allow_untagging); @@ -563,9 +601,11 
@@ Nyx::Nyx (Amr& papa, new_a = old_a; } +#ifdef HEATCOOL // Initialize "this_z" in the atomic_rates_module - if (heat_cool_type == 1 || heat_cool_type == 3 || heat_cool_type == 5) - fort_init_this_z(&old_a); + if (heat_cool_type == 1 || heat_cool_type == 3 || heat_cool_type == 5 || heat_cool_type == 7) + fort_interp_to_this_z(&initial_z); +#endif #ifdef AGN // Initialize the uniform(0,1) random number generator. @@ -685,7 +725,7 @@ Nyx::init (AmrLevel& old) for (FillPatchIterator fpi(old, S_new, 0, cur_time, State_Type, 0, NUM_STATE), - dfpi(old, D_new, 0, cur_time, DiagEOS_Type, 0, 2); + dfpi(old, D_new, 0, cur_time, DiagEOS_Type, 0, D_new.nComp()); fpi.isValid() && dfpi.isValid(); ++fpi,++dfpi) { @@ -2070,6 +2110,13 @@ Nyx::compute_new_temp () Real a = get_comoving_a(cur_time); +#ifdef HEATCOOL + if (heat_cool_type == 1 || heat_cool_type == 3 || heat_cool_type == 5 || heat_cool_type == 7) { + const Real z = 1.0/a - 1.0; + fort_interp_to_this_z(&z); + } +#endif + #ifdef _OPENMP #pragma omp parallel #endif @@ -2077,11 +2124,19 @@ Nyx::compute_new_temp () { const Box& bx = mfi.tilebox(); - fort_compute_temp - (bx.loVect(), bx.hiVect(), - BL_TO_FORTRAN(S_new[mfi]), - BL_TO_FORTRAN(D_new[mfi]), &a, - &print_fortran_warnings); + if (heat_cool_type == 7) { + fort_compute_temp_vec + (bx.loVect(), bx.hiVect(), + BL_TO_FORTRAN(S_new[mfi]), + BL_TO_FORTRAN(D_new[mfi]), &a, + &print_fortran_warnings); + } else { + fort_compute_temp + (bx.loVect(), bx.hiVect(), + BL_TO_FORTRAN(S_new[mfi]), + BL_TO_FORTRAN(D_new[mfi]), &a, + &print_fortran_warnings); + } } // Compute the maximum temperature @@ -2123,17 +2178,17 @@ Nyx::compute_new_temp () #ifndef NO_HYDRO void -Nyx::compute_rho_temp (Real& rho_T_avg, Real& T_avg, Real& T_meanrho) +Nyx::compute_rho_temp (Real& rho_T_avg, Real& T_avg, Real& Tinv_avg, Real& T_meanrho) { BL_PROFILE("Nyx::compute_rho_temp()"); MultiFab& S_new = get_new_data(State_Type); MultiFab& D_new = get_new_data(DiagEOS_Type); - Real rho_T_sum=0.0, T_sum=0.0, 
T_meanrho_sum=0.0; + Real rho_T_sum=0.0, T_sum=0.0, Tinv_sum=0.0, T_meanrho_sum=0.0; Real rho_sum=0.0, vol_sum=0.0, vol_mn_sum=0.0; #ifdef _OPENMP -#pragma omp parallel reduction(+:rho_T_sum, rho_sum, T_sum, T_meanrho_sum, vol_sum, vol_mn_sum) +#pragma omp parallel reduction(+:rho_T_sum, rho_sum, T_sum, Tinv_sum, T_meanrho_sum, vol_sum, vol_mn_sum) #endif for (MFIter mfi(S_new,true); mfi.isValid(); ++mfi) { @@ -2143,15 +2198,16 @@ Nyx::compute_rho_temp (Real& rho_T_avg, Real& T_avg, Real& T_meanrho) (bx.loVect(), bx.hiVect(), geom.CellSize(), BL_TO_FORTRAN(S_new[mfi]), BL_TO_FORTRAN(D_new[mfi]), &average_gas_density, - &rho_T_sum, &T_sum, &T_meanrho_sum, &rho_sum, &vol_sum, &vol_mn_sum); + &rho_T_sum, &T_sum, &Tinv_sum, &T_meanrho_sum, &rho_sum, &vol_sum, &vol_mn_sum); } - Real sums[6] = {rho_T_sum, rho_sum, T_sum, T_meanrho_sum, vol_sum, vol_mn_sum}; - ParallelDescriptor::ReduceRealSum(sums,6); + Real sums[7] = {rho_T_sum, rho_sum, T_sum, Tinv_sum, T_meanrho_sum, vol_sum, vol_mn_sum}; + ParallelDescriptor::ReduceRealSum(sums,7); rho_T_avg = sums[0] / sums[1]; // density weighted T - T_avg = sums[2] / sums[4]; // volume weighted T - if (sums[5] > 0) { - T_meanrho = sums[3] / sums[5]; // T at mean density + T_avg = sums[2] / sums[5]; // volume weighted T + Tinv_avg = sums[3] / sums[1]; // 21cm T + if (sums[6] > 0) { + T_meanrho = sums[4] / sums[6]; // T at mean density T_meanrho = pow(10.0, T_meanrho); } } @@ -2227,6 +2283,7 @@ Nyx::AddProcsToComp(Amr *aptr, int nSidecarProcs, int prevSidecarProcs, allInts.push_back(Eint); allInts.push_back(Temp_comp); allInts.push_back(Ne_comp); + allInts.push_back(Zhi_comp); allInts.push_back(FirstSpec); allInts.push_back(FirstAux); allInts.push_back(FirstAdv); @@ -2256,6 +2313,7 @@ Nyx::AddProcsToComp(Amr *aptr, int nSidecarProcs, int prevSidecarProcs, allInts.push_back(do_grav); allInts.push_back(add_ext_src); allInts.push_back(heat_cool_type); + allInts.push_back(inhomo_reion); allInts.push_back(strang_split); 
allInts.push_back(reeber_int); allInts.push_back(gimlet_int); @@ -2282,6 +2340,7 @@ Nyx::AddProcsToComp(Amr *aptr, int nSidecarProcs, int prevSidecarProcs, Eint = allInts[count++]; Temp_comp = allInts[count++]; Ne_comp = allInts[count++]; + Zhi_comp = allInts[count++]; FirstSpec = allInts[count++]; FirstAux = allInts[count++]; FirstAdv = allInts[count++]; @@ -2311,6 +2370,7 @@ Nyx::AddProcsToComp(Amr *aptr, int nSidecarProcs, int prevSidecarProcs, do_grav = allInts[count++]; add_ext_src = allInts[count++]; heat_cool_type = allInts[count++]; + inhomo_reion = allInts[count++]; strang_split = allInts[count++]; reeber_int = allInts[count++]; gimlet_int = allInts[count++]; diff --git a/Source/NyxParticleContainer.H b/Source/NyxParticleContainer.H index 640581c0..68369c30 100644 --- a/Source/NyxParticleContainer.H +++ b/Source/NyxParticleContainer.H @@ -9,6 +9,10 @@ class NyxParticleContainerBase { public: + + using MyParIter = amrex::ParIter<1+BL_SPACEDIM>; + using MyConstParIter = amrex::ParConstIter<1+BL_SPACEDIM>; + virtual ~NyxParticleContainerBase() {} virtual void moveKickDrift (amrex::MultiFab& acceleration, int level, amrex::Real timestep, @@ -71,9 +75,9 @@ public: amrex::Real estTimestep (amrex::MultiFab& acceleration, amrex::Real a, int level, amrex::Real cfl) const; virtual int finestLevel() const override - { - return amrex::AmrParticleContainer::finestLevel(); - } + { + return amrex::AmrParticleContainer::finestLevel(); + } virtual void Redistribute (int lev_min = 0, int lev_max =-1, @@ -188,8 +192,8 @@ NyxParticleContainer::SetParticleVelocities (amrex::Array void -NyxParticleContainer::sumParticleMomentum (int lev, - amrex::Real* mom) const +NyxParticleContainer::sumParticleMomentum (int lev, + amrex::Real* mom) const { BL_PROFILE("NyxParticleContainer::sumParticleMomentum()"); BL_ASSERT(NSR >= BL_SPACEDIM+1); @@ -231,8 +235,8 @@ NyxParticleContainer::sumParticleMomentum (int lev, template amrex::Real NyxParticleContainer::estTimestep (amrex::MultiFab& 
acceleration, - int lev, - amrex::Real cfl) const + int lev, + amrex::Real cfl) const { return estTimestep(acceleration,1.0,lev,cfl); } @@ -240,9 +244,9 @@ NyxParticleContainer::estTimestep (amrex::MultiFab& accel template amrex::Real NyxParticleContainer::estTimestep (amrex::MultiFab& acceleration, - amrex::Real a, - int lev, - amrex::Real cfl) const + amrex::Real a, + int lev, + amrex::Real cfl) const { BL_PROFILE("NyxParticleContainer::estTimestep(lev)"); amrex::Real dt = 1e50; @@ -282,21 +286,17 @@ NyxParticleContainer::estTimestep (amrex::MultiFab& accel ac_pointer->FillBoundary(geom.periodicity()); // DO WE NEED GHOST CELLS FILLED ??? } - for (typename ParticleLevel::const_iterator pmap_it = pmap.begin(), pmapEnd = pmap.end(); pmap_it != pmapEnd; ++pmap_it) - { - const int grid = pmap_it->first.first; - const AoS& pbox = pmap_it->second.GetArrayOfStructs(); +#ifdef _OPENMP +#pragma omp parallel +#endif + for (MyConstParIter pti(*this, lev); pti.isValid(); ++pti) { + const int grid = pti.index(); + const AoS& pbox = pti.GetArrayOfStructs(); const int n = pbox.size(); const amrex::FArrayBox& gfab = (ac_pointer) ? 
(*ac_pointer)[grid] : acceleration[grid]; num_particles_at_level += n; - -#ifdef _OPENMP -#pragma omp parallel for -#endif - - for (int i = 0; i < n; i++) - { + for (int i = 0; i < n; i++) { const ParticleType& p = pbox[i]; if (p.id() <= 0) continue; diff --git a/Source/NyxParticles.cpp b/Source/NyxParticles.cpp index 3d73594d..30a9993d 100644 --- a/Source/NyxParticles.cpp +++ b/Source/NyxParticles.cpp @@ -112,6 +112,7 @@ namespace bool Nyx::do_dm_particles = false; int Nyx::num_particle_ghosts = 1; +int Nyx::particle_skip_factor = 1; std::string Nyx::particle_init_type = ""; std::string Nyx::particle_move_type = ""; @@ -256,7 +257,7 @@ Nyx::read_particle_params () pp.query("particle_initrandom_count_per_box", particle_initrandom_count_per_box); pp.query("particle_initrandom_mass", particle_initrandom_mass); pp.query("particle_initrandom_iseed", particle_initrandom_iseed); - + pp.query("particle_skip_factor", particle_skip_factor); pp.query("ascii_particle_file", ascii_particle_file); // Input error check @@ -289,10 +290,11 @@ Nyx::read_particle_params () // Input error check if (!binary_particle_file.empty() && (particle_init_type != "BinaryFile" && - particle_init_type != "BinaryMetaFile")) + particle_init_type != "BinaryMetaFile" && + particle_init_type != "BinaryMortonFile")) { if (ParallelDescriptor::IOProcessor()) - std::cerr << "ERROR::particle_init_type is not BinaryFile or BinaryMetaFile but you specified binary_particle_file" << std::endl; + std::cerr << "ERROR::particle_init_type is not BinaryFile, BinaryMetaFile, or BinaryMortonFile but you specified binary_particle_file" << std::endl; amrex::Error(); } @@ -482,6 +484,24 @@ Nyx::init_particles () if (init_with_sph_particles == 1) SPHPC->InitFromBinaryMetaFile(sph_particle_file, BL_SPACEDIM + 1); } + else if (particle_init_type == "BinaryMortonFile") + { + if (verbose) + { + amrex::Print() << "\nInitializing DM particles from morton-ordered binary file\"" + << binary_particle_file << "\" ...\n\n"; + if 
(init_with_sph_particles == 1) + amrex::Error("Morton-ordered input is not supported for sph particles."); + } + // + // The second argument is how many Reals we read into `m_data[]` + // after reading in `m_pos[]` in each of the binary particle files. + // Here we're reading in the particle mass and velocity. + // + DMPC->InitFromBinaryMortonFile(binary_particle_file, + BL_SPACEDIM + 1, + particle_skip_factor); + } else { amrex::Error("not a valid input for nyx.particle_init_type"); diff --git a/Source/Nyx_F.H b/Source/Nyx_F.H index 6049f962..f01c1cd8 100644 --- a/Source/Nyx_F.H +++ b/Source/Nyx_F.H @@ -8,12 +8,18 @@ extern "C" { #endif - void fort_integrate_comoving_a + + void fort_alloc_simd_vec(); + void fort_dealloc_simd_vec(); + + void fort_integrate_comoving_a (amrex::Real* old_a, amrex::Real* new_a, amrex::Real* dt); void fort_integrate_comoving_a_to_z (amrex::Real* old_a, amrex::Real* z_value, amrex::Real* dt); + void set_simd(const int *simd_width); + // void fort_get_omm (amrex::Real* omm); // void fort_get_omb (amrex::Real* frac); // void fort_get_hubble (amrex::Real* hubble); @@ -34,14 +40,16 @@ extern "C" void fort_get_method_params(int* HYP_GROW); void fort_set_method_params - (const int& dm, const int& NumAdv, const int& do_hydro, + (const int& dm, const int& NumAdv, const int& Ndiag, const int& do_hydro, const int& ppm_type, const int& ppm_ref, const int& ppm_flatten_before_integrals, const int& use_colglaz, const int& use_flattening, const int& corner_coupling, const int& version_2, const int& use_const_species, const amrex::Real& gamma_in, const int& normalize_species, - const int& heat_cool_type, const MPI_Comm& comm); + const int& heat_cool_type, const int& inhomo_reion); + + void fort_tabulate_rates(); void filcc (const amrex::Real * q, ARLIM_P(q_lo), ARLIM_P(q_hi), @@ -246,8 +254,20 @@ extern "C" amrex::Real* comoving_a, const int* print_fortran_warnings); - void fort_init_this_z - (amrex::Real* comoving_a); + void fort_compute_temp_vec + 
(const int lo[], const int hi[], + const BL_FORT_FAB_ARG(state), + const BL_FORT_FAB_ARG(diag_eos), + amrex::Real* comoving_a, + const int* print_fortran_warnings); + + void fort_interp_to_this_z + (const amrex::Real* z); + + void fort_setup_eos_params + (amrex::Real* eos_nr_eps, + amrex::Real* vode_rtol, + amrex::Real* vode_atol_scaled); void fort_compute_max_temp_loc (const int lo[], const int hi[], @@ -261,7 +281,7 @@ extern "C" const BL_FORT_FAB_ARG(state), const BL_FORT_FAB_ARG(diag_eos), amrex::Real* rho_ave, amrex::Real* rho_T_sum, amrex::Real* T_sum, - amrex::Real* T_meanrho_sum, amrex::Real* rho_sum, + amrex::Real* Tinv_sum, amrex::Real* T_meanrho_sum, amrex::Real* rho_sum, amrex::Real* vol_sum, amrex::Real* vol_mn_sum); #ifdef AUX_UPDATE @@ -281,4 +301,5 @@ extern "C" #ifdef __cplusplus } #endif + #endif diff --git a/Source/Nyx_halos.cpp b/Source/Nyx_halos.cpp index 57cb7524..66c5b925 100644 --- a/Source/Nyx_halos.cpp +++ b/Source/Nyx_halos.cpp @@ -170,9 +170,6 @@ Nyx::halo_find (Real dt) for (BoxIterator bit(vertBox); bit.ok(); ++bit) { IntVect vert = bit(); - int i = vert[0]; - int j = vert[1]; - int k = vert[2]; IntVect iv(D_DECL(vertices[vert[0]][0], vertices[vert[1]][1], vertices[vert[2]][2])); @@ -190,6 +187,27 @@ Nyx::halo_find (Real dt) std::cout << " " << std::endl; std::cout << " *************************************** " << std::endl; + // agn_density_old will hold the density from depositing the + // mass of existing particles. + MultiFab agn_density_old(simBA, simDM, ncomp1, nghost1); + agn_density_old.setVal(0.0); + + // Deposit the mass now in the particles onto agn_density_old, on grid. + // (No change to mass of particles.) + Nyx::theAPC()->AssignDensitySingleLevel(agn_density_old, level); + + // Make sure the density put into ghost cells is added to valid regions + agn_density_old.SumBoundary(geom.periodicity()); + + // Convert new_state to primitive variables: rho, velocity, energy/rho. 
+ conserved_to_primitive(new_state); + + // Add agn_density_old to new_state, which holds primitive variables. + // This is from depositing mass of existing particles. + // Later, we'll subtract the deposited mass of all particles, old & new. + amrex::MultiFab::Add(new_state, agn_density_old, + comp0, Density, ncomp1, nghost0); + #ifdef REEBER for (const Halo& h : reeber_halos) { @@ -243,10 +261,17 @@ Nyx::halo_find (Real dt) // Call Redistribute so that the new particles get their cell, grid and process defined Nyx::theAPC()->Redistribute(lev_min, lev_max, ngrow); + // Fill the "ghosts" vector with particles in ghost cells of each grid Nyx::theAPC()->fillNeighbors(level); + + // ComputeOverlap sets the ID of a particle to -1 if it is less than "cutoff" away from another + // particle and if it is newer than that particle Nyx::theAPC()->ComputeOverlap(level); + + // Clear the Neighbor Particle data structure Nyx::theAPC()->clearNeighbors(level); + // This Redistribute is used to remove particles whose ID's have been set to -1 in ComputeOverlap Nyx::theAPC()->Redistribute(lev_min, lev_max, ngrow); // agn_density will hold the density we're going to remove from the grid. @@ -260,21 +285,26 @@ Nyx::halo_find (Real dt) // Make sure the density put into ghost cells is added to valid regions agn_density.SumBoundary(geom.periodicity()); - // Take away the density from the gas that was added to the AGN particle. + // Take away the density from the gas that was added to the AGN particle: + // density is in new_state, which holds primitive variables. amrex::MultiFab::Subtract(new_state, agn_density, comp0, Density, ncomp1, nghost0); - cout << "Going into ComputeParticleVelocity (no energy), number of AGN particles on this proc is " - << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl; + // Convert new_state to conserved variables: rho, momentum, energy. 
+ primitive_to_conserved(new_state); + + pout() << "Going into ComputeParticleVelocity (no energy), number of AGN particles on this proc is " + << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl; // Re-set the particle velocity (but not energy) after accretion, - // using change of momentum density in state. + // using change of momentum density from orig_state to new_state, + // which hold conserved variables. // No change to state, other than filling ghost cells. int add_energy = 0; Nyx::theAPC()->ComputeParticleVelocity(level, orig_state, new_state, add_energy); - cout << "Going into ReleaseEnergy, number of AGN particles on this proc is " - << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl; + pout() << "Going into ReleaseEnergy, number of AGN particles on this proc is " + << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl; // AGN particles: may zero out energy. // new_state: may increase internal and total energy. MultiFab& D_new = get_new_data(DiagEOS_Type); @@ -361,7 +391,7 @@ Nyx::halo_accrete (Real dt) const DistributionMapping& simDM = new_state.DistributionMap(); int ncomp = new_state.nComp(); - // First copy the existing state into orig_state. + // First copy the existing state (new_state) into orig_state. MultiFab orig_state(simBA, simDM, ncomp, nghost1); MultiFab::Copy(orig_state, new_state, comp0, comp0, ncomp, nghost1); @@ -373,8 +403,8 @@ Nyx::halo_accrete (Real dt) MultiFab agn_density_lost(simBA, simDM, ncomp1, nghost1); agn_density_lost.setVal(0.0); - cout << "Going into AccreteMass, number of AGN particles on this proc is " - << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl; + pout() << "Going into AccreteMass, number of AGN particles on this proc is " + << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl; // AGN particles: increase mass and energy. // new_state: no change, other than filling in ghost cells. // agn_density_lost: gets filled in. 
@@ -394,8 +424,8 @@ Nyx::halo_accrete (Real dt) // using change of momentum density in state. // No change to state, other than filling ghost cells. int add_energy = 1; - cout << "Going into ComputeParticleVelocity (and energy), number of AGN particles on this proc is " - << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl; + pout() << "Going into ComputeParticleVelocity (and energy), number of AGN particles on this proc is " + << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl; Nyx::theAPC()->ComputeParticleVelocity(level, orig_state, new_state, add_energy); // Now new_state = get_new_data(State_Type) has been updated. } diff --git a/Source/Nyx_hydro.cpp b/Source/Nyx_hydro.cpp index 604ac4a4..c28b1d3b 100644 --- a/Source/Nyx_hydro.cpp +++ b/Source/Nyx_hydro.cpp @@ -122,14 +122,15 @@ Nyx::just_the_hydro (Real time, // Create FAB for extended grid values (including boundaries) and fill. MultiFab S_old_tmp(S_old.boxArray(), S_old.DistributionMap(), NUM_STATE, NUM_GROW); FillPatch(*this, S_old_tmp, NUM_GROW, time, State_Type, 0, NUM_STATE); - MultiFab D_old_tmp(D_old.boxArray(), D_old.DistributionMap(), 2, NUM_GROW); - FillPatch(*this, D_old_tmp, NUM_GROW, time, DiagEOS_Type, 0, 2); + + MultiFab D_old_tmp(D_old.boxArray(), D_old.DistributionMap(), D_old.nComp(), NUM_GROW); + FillPatch(*this, D_old_tmp, NUM_GROW, time, DiagEOS_Type, 0, D_old.nComp()); if (add_ext_src && strang_split) strang_first_step(time,dt,S_old_tmp,D_old_tmp); #ifdef _OPENMP -#pragma omp parallel reduction(max:courno) +#pragma omp parallel reduction(max:courno) reduction(+:e_added,ke_added) #endif { FArrayBox flux[BL_SPACEDIM], u_gdnv[BL_SPACEDIM]; diff --git a/Source/Nyx_nd.f90 b/Source/Nyx_nd.f90 index f70929eb..9c48bc4d 100644 --- a/Source/Nyx_nd.f90 +++ b/Source/Nyx_nd.f90 @@ -186,11 +186,12 @@ end subroutine fort_set_small_values ! 
::: subroutine fort_set_method_params( & - dm, numadv, do_hydro, ppm_type_in, ppm_ref_in, & + dm, numadv, ndiag_in, do_hydro, ppm_type_in, ppm_ref_in, & ppm_flatten_before_integrals_in, & use_colglaz_in, use_flattening_in, & corner_coupling_in, version_2_in, & - use_const_species_in, gamma_in, normalize_species_in, heat_cool_in, comm) & + use_const_species_in, gamma_in, normalize_species_in, & + heat_cool_in, inhomo_reion_in) & bind(C, name = "fort_set_method_params") ! Passing data from C++ into f90 @@ -202,12 +203,12 @@ subroutine fort_set_method_params( & use comoving_module, only : comoving_type use network, only : nspec, naux use eos_module - use parallel implicit none integer, intent(in) :: dm integer, intent(in) :: numadv + integer, intent(in) :: ndiag_in integer, intent(in) :: do_hydro integer, intent(in) :: ppm_type_in integer, intent(in) :: ppm_ref_in @@ -220,19 +221,13 @@ subroutine fort_set_method_params( & integer, intent(in) :: use_const_species_in integer, intent(in) :: normalize_species_in integer, intent(in) :: heat_cool_in - integer, intent(in), optional :: comm + integer, intent(in) :: inhomo_reion_in integer :: QNEXT integer :: UNEXT integer :: iadv, ispec - if (present(comm)) then - call parallel_initialize(comm=comm) - else - call parallel_initialize() - end if - use_const_species = use_const_species_in iorder = 2 @@ -242,6 +237,8 @@ subroutine fort_set_method_params( & comoving_type = 1 + NDIAG = ndiag_in + if (do_hydro .eq. 0) then NVAR = 1 @@ -256,11 +253,17 @@ subroutine fort_set_method_params( & TEMP_COMP = -1 NE_COMP = -1 + ZHI_COMP = -1 else TEMP_COMP = 1 NE_COMP = 2 + if (inhomo_reion_in .gt. 0) then + ZHI_COMP = 3 + else + ZHI_COMP = -1 + endif !--------------------------------------------------------------------- ! conserved state components @@ -362,13 +365,10 @@ subroutine fort_set_method_params( & normalize_species = normalize_species_in heat_cool_type = heat_cool_in + inhomo_reion = inhomo_reion_in end if - if (heat_cool_type .eq. 
1 .or. heat_cool_type .eq. 3 .or. heat_cool_type .eq. 5) then - call tabulate_rates() - end if - ! Easy indexing for the passively advected quantities. ! This lets us loop over all four groups (advected, species, aux) ! in a single loop. diff --git a/Source/SourceTerms/Nyx_sources.cpp b/Source/SourceTerms/Nyx_sources.cpp index dd75d435..fe64c711 100644 --- a/Source/SourceTerms/Nyx_sources.cpp +++ b/Source/SourceTerms/Nyx_sources.cpp @@ -27,12 +27,14 @@ Nyx::get_old_source (Real old_time, Dborder.define(grids, D_old.DistributionMap(), D_old.nComp(), 4); FillPatch(*this, Sborder, 4, old_time, State_Type, Density, Sborder.nComp()); - FillPatch(*this, Dborder, 4, old_time, DiagEOS_Type, 0, 2); + FillPatch(*this, Dborder, 4, old_time, DiagEOS_Type, 0, D_old.nComp()); + + fort_interp_to_this_z(&z); #ifdef _OPENMP #pragma omp parallel #endif - for (MFIter mfi(S_old,true); mfi.isValid(); ++mfi) + for (MFIter mfi(S_old, MFItInfo().SetDynamic(true).EnableTiling()); mfi.isValid(); ++mfi) { // We explicitly want to fill the ghost regions of the ext_src array const Box& bx = mfi.growntilebox(ext_src.nGrow()); @@ -84,13 +86,15 @@ Nyx::get_new_source (Real old_time, FillPatch(*this, Sborder_old, 4, old_time, State_Type , Density, Sborder_old.nComp()); FillPatch(*this, Sborder_new, 4, new_time, State_Type , Density, Sborder_new.nComp()); - FillPatch(*this, Dborder_old, 4, old_time, DiagEOS_Type, 0 , 2); - FillPatch(*this, Dborder_new, 4, new_time, DiagEOS_Type, 0 , 2); + FillPatch(*this, Dborder_old, 4, old_time, DiagEOS_Type, 0 , Dborder_old.nComp()); + FillPatch(*this, Dborder_new, 4, new_time, DiagEOS_Type, 0 , Dborder_new.nComp()); + + fort_interp_to_this_z(&z); #ifdef _OPENMP #pragma omp parallel #endif - for (MFIter mfi(S_old,true); mfi.isValid(); ++mfi) + for (MFIter mfi(S_old, MFItInfo().SetDynamic(true).EnableTiling()); mfi.isValid(); ++mfi) { // We explicitly only want to fill the valid region const Box& bx = mfi.tilebox(); diff --git a/Source/Src_3d/compute_temp_3d.f90 
b/Source/Src_3d/compute_temp_3d.f90 index ae301faa..4fa99ffb 100644 --- a/Source/Src_3d/compute_temp_3d.f90 +++ b/Source/Src_3d/compute_temp_3d.f90 @@ -9,9 +9,12 @@ subroutine fort_compute_temp(lo,hi, & use amrex_fort_module, only : rt => amrex_real use eos_module - use atomic_rates_module, only: this_z, interp_to_this_z + use atomic_rates_module, only: this_z use meth_params_module, only : NVAR, URHO, UMX, UMY, UMZ, UEINT, UEDEN, & - TEMP_COMP, NE_COMP, small_temp, heat_cool_type + NDIAG, TEMP_COMP, NE_COMP, ZHI_COMP, & + small_temp, heat_cool_type + use reion_aux_module, only: zhi_flash, zheii_flash, flash_h, flash_he, & + inhomogeneous_on use eos_params_module implicit none @@ -20,20 +23,27 @@ subroutine fort_compute_temp(lo,hi, & integer , intent(in ) :: d_l1,d_l2,d_l3,d_h1,d_h2,d_h3 integer , intent(in ) :: print_fortran_warnings real(rt), intent(inout) :: state(s_l1:s_h1,s_l2:s_h2,s_l3:s_h3,NVAR) - real(rt), intent(inout) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,2) + real(rt), intent(inout) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,NDIAG) real(rt), intent(in ) :: comoving_a - integer :: i,j,k + integer :: i,j,k, JH, JHe real(rt) :: rhoInv,eint real(rt) :: ke,dummy_pres real(rt) :: z z = 1.d0/comoving_a - 1.d0 - if (heat_cool_type.gt.0) then - if (z .ne. this_z) & - call interp_to_this_z(z) - end if + ! Flash reionization? + if ((flash_h .eqv. .true.) .and. (z .gt. zhi_flash)) then + JH = 0 + else + JH = 1 + endif + if ((flash_he .eqv. .true.) .and. (z .gt. zheii_flash)) then + JHe = 0 + else + JHe = 1 + endif do k = lo(3),hi(3) do j = lo(2),hi(2) @@ -59,7 +69,13 @@ subroutine fort_compute_temp(lo,hi, & eint = state(i,j,k,UEINT) * rhoInv - call nyx_eos_T_given_Re(diag_eos(i,j,k,TEMP_COMP), diag_eos(i,j,k,NE_COMP), & + if ((inhomogeneous_on) .and. (z .gt. 
diag_eos(i,j,k,ZHI_COMP))) then + JH = 0 + else + JH = 1 + endif + + call nyx_eos_T_given_Re(JH, JHe, diag_eos(i,j,k,TEMP_COMP), diag_eos(i,j,k,NE_COMP), & state(i,j,k,URHO), eint, comoving_a) else @@ -85,14 +101,130 @@ subroutine fort_compute_temp(lo,hi, & end subroutine fort_compute_temp + subroutine fort_compute_temp_vec(lo,hi, & + state ,s_l1,s_l2,s_l3, s_h1,s_h2,s_h3, & + diag_eos,d_l1,d_l2,d_l3, d_h1,d_h2,d_h3, & + comoving_a, print_fortran_warnings) & + bind(C, name = "fort_compute_temp_vec") + + use amrex_fort_module, only : rt => amrex_real + use eos_module + use atomic_rates_module, only: this_z + use meth_params_module, only : NVAR, URHO, UMX, UMY, UMZ, UEINT, UEDEN, & + NDIAG, TEMP_COMP, NE_COMP, small_temp, heat_cool_type + use eos_params_module + + implicit none + integer , intent(in ) :: lo(3),hi(3) + integer , intent(in ) :: s_l1,s_l2,s_l3,s_h1,s_h2,s_h3 + integer , intent(in ) :: d_l1,d_l2,d_l3,d_h1,d_h2,d_h3 + integer , intent(in ) :: print_fortran_warnings + real(rt), intent(inout) :: state(s_l1:s_h1,s_l2:s_h2,s_l3:s_h3,NVAR) + real(rt), intent(inout) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,NDIAG) + real(rt), intent(in ) :: comoving_a + + integer :: i,j,k + real(rt) :: rhoInv,eint + real(rt), dimension(hi(1)-lo(1)+1) :: ke,dummy_pres,small_temp_vec + real(rt) :: z + real(rt), dimension(hi(1)-lo(1)+1,4) :: eos_inputs_pos_ueint, eos_inputs_neg_ueint + integer :: orig_indices(hi(1)-lo(1)+1,3) + integer :: pos_eos_count, neg_eos_count + + z = 1.d0/comoving_a - 1.d0 + + do k = lo(3),hi(3) + do j = lo(2),hi(2) + do i = lo(1),hi(1) + if (state(i,j,k,URHO) <= 0.d0) then + print *,' ' + print *,'>>> Error: compute_temp ',i,j,k + print *,'>>> ... 
negative density ',state(i,j,k,URHO) + print *,' ' + call bl_error("Error:: compute_temp_3d.f90 :: compute_temp") + end if + enddo + enddo + enddo + + do k = lo(3),hi(3) + do j = lo(2),hi(2) + + pos_eos_count = 0 + neg_eos_count = 0 + + do i = lo(1),hi(1) + rhoInv = 1.d0 / state(i,j,k,URHO) + + if (state(i,j,k,UEINT) > 0.d0) then + + pos_eos_count = pos_eos_count + 1 + + eos_inputs_pos_ueint(pos_eos_count,1) = diag_eos(i,j,k,TEMP_COMP) + eos_inputs_pos_ueint(pos_eos_count,2) = diag_eos(i,j,k,NE_COMP) + eos_inputs_pos_ueint(pos_eos_count,3) = state(i,j,k,URHO) + eos_inputs_pos_ueint(pos_eos_count,4) = state(i,j,k,UEINT)*rhoInv + + orig_indices(pos_eos_count,1) = i + orig_indices(pos_eos_count,2) = j + orig_indices(pos_eos_count,3) = k + + else + + neg_eos_count = neg_eos_count + 1 + + eos_inputs_neg_ueint(neg_eos_count,1) = diag_eos(i,j,k,TEMP_COMP) ! DON'T NEED THIS; GET RID OF IT + eos_inputs_neg_ueint(neg_eos_count,2) = diag_eos(i,j,k,NE_COMP) + eos_inputs_neg_ueint(neg_eos_count,3) = state(i,j,k,URHO) + eos_inputs_neg_ueint(neg_eos_count,4) = state(i,j,k,UEINT) + + orig_indices(neg_eos_count,1) = i + orig_indices(neg_eos_count,2) = j + orig_indices(neg_eos_count,3) = k + + end if + end do + + ! For cells with positive E_int + call nyx_eos_T_given_Re_vec(eos_inputs_pos_ueint(1:pos_eos_count,1), & + eos_inputs_pos_ueint(1:pos_eos_count,2), & + eos_inputs_pos_ueint(1:pos_eos_count,3), & + eos_inputs_pos_ueint(1:pos_eos_count,4), & + comoving_a, & + pos_eos_count) + diag_eos(orig_indices(1:pos_eos_count,1),j,k,TEMP_COMP) = eos_inputs_pos_ueint(1:pos_eos_count,1) + diag_eos(orig_indices(1:pos_eos_count,1),j,k,NE_COMP) = eos_inputs_pos_ueint(1:pos_eos_count,2) + + ! 
For cells with negative E_int + call nyx_eos_given_RT_vec(eos_inputs_neg_ueint(1:neg_eos_count,4), & + dummy_pres(1:neg_eos_count), & + eos_inputs_neg_ueint(1:neg_eos_count,3), & + small_temp_vec(1:neg_eos_count), & + eos_inputs_neg_ueint(1:neg_eos_count,2), & + comoving_a, & + neg_eos_count) + + ke(1:neg_eos_count) = 0.5d0 * (state(orig_indices(1:neg_eos_count,1),j,k,UMX)*state(orig_indices(1:neg_eos_count,1),j,k,UMX) + & + state(orig_indices(1:neg_eos_count,1),j,k,UMY)*state(orig_indices(1:neg_eos_count,1),j,k,UMY) + & + state(orig_indices(1:neg_eos_count,1),j,k,UMZ)*state(orig_indices(1:neg_eos_count,1),j,k,UMZ)) * rhoInv + + diag_eos(orig_indices(1:neg_eos_count,1),j,k,TEMP_COMP) = small_temp_vec(1:neg_eos_count) + state(orig_indices(1:neg_eos_count,1),j,k,UEINT) = eos_inputs_neg_ueint(1:neg_eos_count,3) * eos_inputs_neg_ueint(1:neg_eos_count,4) + state(orig_indices(1:neg_eos_count,1),j,k,UEDEN) = eos_inputs_neg_ueint(1:neg_eos_count,4) + ke(1:neg_eos_count) + + enddo + enddo + + end subroutine fort_compute_temp_vec + subroutine fort_compute_rho_temp(lo,hi,dx, & state,s_l1,s_l2,s_l3,s_h1,s_h2,s_h3, & diag_eos,d_l1,d_l2,d_l3,d_h1,d_h2,d_h3, & rho_ave,rho_T_sum, & - T_sum,T_meanrho_sum,rho_sum,vol_sum,vol_mn_sum) & + T_sum,Tinv_sum,T_meanrho_sum,rho_sum,vol_sum,vol_mn_sum) & bind(C, name = "fort_compute_rho_temp") - use meth_params_module, only : NVAR, URHO, TEMP_COMP + use meth_params_module, only : NVAR, URHO, NDIAG, TEMP_COMP use amrex_fort_module, only : rt => amrex_real implicit none @@ -102,8 +234,8 @@ subroutine fort_compute_rho_temp(lo,hi,dx, & real(rt), intent(in ) :: dx(3) real(rt), intent(in ) :: rho_ave real(rt), intent(in ) :: state(s_l1:s_h1,s_l2:s_h2,s_l3:s_h3,NVAR) - real(rt), intent(in ) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,2) - real(rt), intent(inout) :: rho_T_sum, rho_sum, T_sum, T_meanrho_sum + real(rt), intent(inout) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,NDIAG) + real(rt), intent(inout) :: rho_T_sum, rho_sum, T_sum, Tinv_sum, 
T_meanrho_sum real(rt), intent(inout) :: vol_sum, vol_mn_sum integer :: i,j,k @@ -116,6 +248,7 @@ subroutine fort_compute_rho_temp(lo,hi,dx, & do j = lo(2),hi(2) do i = lo(1),hi(1) T_sum = T_sum + vol*diag_eos(i,j,k,TEMP_COMP) + Tinv_sum = Tinv_sum + state(i,j,k,URHO)/diag_eos(i,j,k,TEMP_COMP) rho_T_sum = rho_T_sum + state(i,j,k,URHO)*diag_eos(i,j,k,TEMP_COMP) rho_sum = rho_sum + state(i,j,k,URHO) if ( (state(i,j,k,URHO) .lt. rho_hi) .and. & @@ -137,7 +270,7 @@ subroutine fort_compute_max_temp_loc(lo,hi, & max_temp, den_maxt, imax, jmax, kmax) & bind(C, name = "fort_compute_max_temp_loc") - use meth_params_module, only : TEMP_COMP, NVAR, URHO + use meth_params_module, only : TEMP_COMP, NVAR, URHO, NDIAG use amrex_fort_module, only : rt => amrex_real implicit none @@ -145,7 +278,7 @@ subroutine fort_compute_max_temp_loc(lo,hi, & integer , intent(in ) :: s_l1,s_l2,s_l3,s_h1,s_h2,s_h3 integer , intent(in ) :: d_l1,d_l2,d_l3,d_h1,d_h2,d_h3 real(rt), intent(inout) :: state(s_l1:s_h1,s_l2:s_h2,s_l3:s_h3,NVAR) - real(rt), intent(inout) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,2) + real(rt), intent(inout) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,NDIAG) real(rt), intent(in ) :: max_temp real(rt), intent( out) :: den_maxt integer , intent(inout) :: imax,jmax,kmax diff --git a/Source/comoving.cpp b/Source/comoving.cpp index 28e450ca..42f233a7 100644 --- a/Source/comoving.cpp +++ b/Source/comoving.cpp @@ -230,9 +230,13 @@ Nyx::comoving_a_post_restart (const std::string& restart_file) std::cout << "...setting old_a_time to " << old_a_time << std::endl; } +#ifdef HEATCOOL // Initialize "this_z" in the atomic_rates_module - if (heat_cool_type == 1 || heat_cool_type == 3 || heat_cool_type == 5) - fort_init_this_z(&old_a); + if (heat_cool_type == 1 || heat_cool_type == 3 || heat_cool_type == 5 || heat_cool_type == 7) { + Real old_z = 1.0/old_a - 1.0; + fort_interp_to_this_z(&old_z); + } +#endif } void diff --git a/Source/main.cpp b/Source/main.cpp index 03766423..e7062a37 100644 
--- a/Source/main.cpp +++ b/Source/main.cpp @@ -411,6 +411,10 @@ main (int argc, char* argv[]) #endif const Real time_before_main_loop = ParallelDescriptor::second(); +#ifdef USE_CVODE + Nyx::alloc_simd_vec(); +#endif + bool finished(false); while ( ! finished) { @@ -512,6 +516,10 @@ main (int argc, char* argv[]) #endif } // ---- end while( ! finished) +#ifdef USE_CVODE + Nyx::dealloc_simd_vec(); +#endif + const Real time_without_init = ParallelDescriptor::second() - time_before_main_loop; if (ParallelDescriptor::IOProcessor()) std::cout << "Time w/o init: " << time_without_init << std::endl; diff --git a/Source/meth_params.f90 b/Source/meth_params.f90 index 51fd612f..89874091 100644 --- a/Source/meth_params.f90 +++ b/Source/meth_params.f90 @@ -17,9 +17,9 @@ module meth_params_module integer, parameter :: MAXADV = 5 ! NTHERM: number of thermodynamic variables - integer , save :: NTHERM, NVAR + integer , save :: NTHERM, NVAR, NDIAG integer , save :: URHO, UMX, UMY, UMZ, UEDEN, UEINT, UFA, UFS, UFX - integer , save :: TEMP_COMP, NE_COMP + integer , save :: TEMP_COMP, NE_COMP, ZHI_COMP ! 
QTHERM: number of primitive variables integer , save :: QTHERM, QVAR @@ -39,6 +39,7 @@ module meth_params_module integer , save :: use_const_species integer , save :: normalize_species integer , save :: heat_cool_type + integer , save :: inhomo_reion integer , save :: grav_source_type integer, save :: npassive diff --git a/Source/misc_params.f90 b/Source/misc_params.f90 new file mode 100644 index 00000000..fdaac1d7 --- /dev/null +++ b/Source/misc_params.f90 @@ -0,0 +1,7 @@ +module misc_params + + implicit none + + integer :: simd_width + +end module misc_params diff --git a/Source/strang_splitting.cpp b/Source/strang_splitting.cpp index 32b79a42..581b5437 100644 --- a/Source/strang_splitting.cpp +++ b/Source/strang_splitting.cpp @@ -14,6 +14,11 @@ Nyx::strang_first_step (Real time, Real dt, MultiFab& S_old, MultiFab& D_old) const Real a = get_comoving_a(time); const Real* dx = geom.CellSize(); + { + const Real z = 1.0/a - 1.0; + fort_interp_to_this_z(&z); + } + #ifdef _OPENMP #pragma omp parallel #endif @@ -50,13 +55,19 @@ Nyx::strang_second_step (Real time, Real dt, MultiFab& S_new, MultiFab& D_new) int min_iter_grid; int max_iter_grid; - const Real a = get_comoving_a(time); + // Set a at the half of the time step in the second strang + const Real a = get_comoving_a(time-half_dt); const Real* dx = geom.CellSize(); compute_new_temp(); + { + const Real z = 1.0/a - 1.0; + fort_interp_to_this_z(&z); + } + #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel private(min_iter_grid,max_iter_grid) reduction(min:min_iter) reduction(max:max_iter) #endif for (MFIter mfi(S_new,true); mfi.isValid(); ++mfi) { diff --git a/Source/write_info.cpp b/Source/write_info.cpp index 74ee74b9..1c1a06df 100644 --- a/Source/write_info.cpp +++ b/Source/write_info.cpp @@ -14,12 +14,12 @@ Nyx::write_info () MultiFab& D_new = get_new_data(DiagEOS_Type); Real max_t = 0; - Real rho_T_avg=0.0, T_avg=0.0, T_meanrho=0.0; + Real rho_T_avg=0.0, T_avg=0.0, Tinv_avg=0.0, T_meanrho=0.0; if 
(do_hydro) { compute_new_temp(); max_t = D_new.norm0(Temp_comp); - compute_rho_temp(rho_T_avg, T_avg, T_meanrho); + compute_rho_temp(rho_T_avg, T_avg, Tinv_avg, T_meanrho); } #endif @@ -37,18 +37,20 @@ Nyx::write_info () if (time == 0.0) { - data_loga << std::setw( 8) << " nstep"; + data_loga << std::setw( 8) << "# nstep"; data_loga << std::setw(14) << " time "; - data_loga << std::setw(14) << " dt "; - data_loga << std::setw(14) << " redshift "; - data_loga << std::setw(14) << " a "; + data_loga << std::setw(14) << " dt "; + data_loga << std::setw(14) << " z "; + data_loga << std::setw(14) << " a "; #ifndef NO_HYDRO if (do_hydro == 1) { - data_loga << std::setw(14) << " max temp "; - data_loga << std::setw(14) << "rho-wgted temp "; - data_loga << std::setw(14) << " V-wgted temp "; - data_loga << std::setw(14) << " T @ "; + data_loga << std::setw(14) << " T_max "; + data_loga << std::setw(14) << " _rho "; + data_loga << std::setw(14) << " _V "; + data_loga << std::setw(14) << "T @ "; + data_loga << std::setw(14) << "T(21cm) "; + data_loga << std::setw(14) << "adiab. 
"; } #endif data_loga << '\n'; @@ -66,6 +68,8 @@ Nyx::write_info () data_loga << std::setw(14) << std::setprecision(6) << rho_T_avg; data_loga << std::setw(14) << std::setprecision(6) << T_avg; data_loga << std::setw(14) << std::setprecision(6) << T_meanrho; + data_loga << std::setw(14) << std::setprecision(6) << 1.0/Tinv_avg; + data_loga << std::setw(14) << std::setprecision(6) << 0.021*(1.0+old_z)*(1.0+old_z); } #endif data_loga << '\n'; @@ -85,6 +89,8 @@ Nyx::write_info () data_loga << std::setw(14) << std::setprecision(6) << rho_T_avg; data_loga << std::setw(14) << std::setprecision(6) << T_avg; data_loga << std::setw(14) << std::setprecision(6) << T_meanrho; + data_loga << std::setw(14) << std::setprecision(6) << 1.0/Tinv_avg; + data_loga << std::setw(14) << std::setprecision(6) << 0.021*(1.0+new_z)*(1.0+new_z); } #endif data_loga << std::endl; diff --git a/UsersGuide/HeatCool/NyxHeatCool.tex b/UsersGuide/HeatCool/NyxHeatCool.tex new file mode 100644 index 00000000..90fd7b77 --- /dev/null +++ b/UsersGuide/HeatCool/NyxHeatCool.tex @@ -0,0 +1,37 @@ +\label{chap:HeatCool} + +\nyx\ provides the capability to compute local heating and cooling effects due to radiation. +The motivation and algorithm for the heating and cooling components is documented in \cite{lukic15}, and the relevant code is located in the \texttt{Source/HeatCool} subdirectory. +The code is activated through the \texttt{USE\_HEATCOOL=TRUE} option in the \texttt{GNUmakefile}. +Mathematically, the heating and cooling can be described by a single ODE in each cell, to be integrated per time step $\Delta t$. +This ODE exhibits a sensitive relationship to quantities such as temperature and free electron density, and consequently it often requires sophisticated integration techniques to compute correctly. + +\nyx\ provides a few different techniques for solving this ODE, which are selected via the \texttt{nyx.heat\_cool\_type} input parameter. 
+One method is to use the VODE ODE solver (selected with \texttt{nyx.heat\_cool\_type=3}). +The source code for VODE is included in the \texttt{Util/VODE} subdirectory and is compiled automatically with the rest of \nyx. +However, while VODE is sufficient for computing this ODE correctly, it is an old Fortran code which is no longer maintained, and consequently will not easily be adapted to future high-performance computing architectures. + +VODE's successor is CVODE, which is a translation of the original VODE solver from Fortran to C. +CVODE is actively developed and maintained, and is more likely to be adapted to future architectures. +To use CVODE in \nyx, one may use the \texttt{nyx.heat\_cool\_type=5} input parameter. +Currently the performance of VODE is slightly better because CVODE evaluates the ODE RHS one more time than VODE per coarse time step integration. +Users should note that, while the VODE solver is compiled automatically in \nyx, CVODE must be compiled as a separate library; instructions for compiling CVODE are provided in the \amrex\ User Guide. +To link the external CVODE solver into \nyx, one must set \texttt{USE\_HEATCOOL=TRUE} as well as \texttt{USE\_CVODE=TRUE} in the \texttt{GNUmakefile}. + +Finally, a third ODE integration option (which is new and \emph{\textbf{highly experimental}}) consists of using CVODE while treating groups of ODEs in different cells as a single system of coupled ODEs. +This option can be selected with the \texttt{nyx.heat\_cool\_type=7} option. +The purpose of this approach is to enable the evaluation of multiple RHSs simultaneously, using SIMD instructions. +SIMD parallelism comprises a large fraction of compute performance on modern HPC architectures, and consequently, this approach can lead to a significant performance gain in the ODE integration (which is the most expensive computational kernel in \nyx). 
+The number of ODEs (cells) which are computed simultaneously is chosen through the input parameter \texttt{nyx.simd\_width}. +On Intel Xeon Phi, with 512 bit-wide SIMD instructions, an appropriate value for this parameter might be 8 or 16, or perhaps larger; the value which yields the highest performance will vary by architecture. +However, users are cautioned that this mode remains \emph{\textbf{experimental}} and its results have not been subjected to the same level of verification as the other solver methods. +In particular, there are three numerical tolerances, available as input parameters, which affect the convergence of the scalar vs SIMD ODE integration: + +\begin{itemize} + \item \texttt{nyx.eos\_nr\_eps}: this is the convergence criterion for the Newton-Raphson iteration which is used to evaluate the ODE RHS + \item \texttt{nyx.vode\_rtol}: this is the relative tolerance required for the ODE integration in VODE or CVODE + \item \texttt{nyx.vode\_atol\_scaled}: this is the absolute tolerance required for the ODE integration in VODE or CVODE, scaled by the initial value of the independent variable in the ODE +\end{itemize} + +These variables, in particular \texttt{nyx.vode\_rtol}, have different effects depending on whether one is integrating a single ODE at a time, or a system of ODEs simultaneously. +One should be mindful of the numerical differences which arise from these, which can be observed with the \texttt{fcompare} tool in \amrex. diff --git a/UsersGuide/HeatCool/heatcool.bib b/UsersGuide/HeatCool/heatcool.bib new file mode 100644 index 00000000..6ba96207 --- /dev/null +++ b/UsersGuide/HeatCool/heatcool.bib @@ -0,0 +1,12 @@ +@article{lukic15, +author = {Luki\'c, Zarija and Stark, Casey W. and Nugent, Peter and White, Martin and Meiksin, Avery A. 
and Almgren, Ann}, +title = "{The Lyman-$\alpha$ forest in optically thin hydrodynamical simulations}", +journal = {Monthly Notices of the Royal Astronomical Society}, +volume = {446}, +number = {4}, +pages = {3697-3724}, +year = {2015}, +doi = {10.1093/mnras/stu2377}, +URL = { + http://dx.doi.org/10.1093/mnras/stu2377}, +eprint = {/oup/backfile/content_public/journal/mnras/446/4/10.1093_mnras_stu2377/2/stu2377.pdf} +} diff --git a/UsersGuide/NyxUserGuide.tex b/UsersGuide/NyxUserGuide.tex index c1be1110..795870b7 100644 --- a/UsersGuide/NyxUserGuide.tex +++ b/UsersGuide/NyxUserGuide.tex @@ -193,6 +193,9 @@ \chapter{Gravity} \chapter{Dark Matter Particles} \input Particles/Particles.tex +\chapter{Radiative Heating/Cooling} +\input HeatCool/NyxHeatCool.tex + \chapter{Active Galactic Nuclei} \input AGN/AGN.tex @@ -205,6 +208,6 @@ \chapter{Post-processing} \renewcommand\bibname{References} \addcontentsline{toc}{chapter}{References} \bibliographystyle{plain} -\bibliography{Gravity/gr,ComovingHydro/sgs,Forcing/force} +\bibliography{Gravity/gr,ComovingHydro/sgs,Forcing/force,HeatCool/heatcool} \end{document} diff --git a/Util/hpgmg/LICENSE b/Util/hpgmg/LICENSE new file mode 100644 index 00000000..d8898f24 --- /dev/null +++ b/Util/hpgmg/LICENSE @@ -0,0 +1,23 @@ +Copyright (c) 2014, The Regents of the University of California, through +Lawrence Berkeley National Laboratory and UChicago Argonne, LLC. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Util/hpgmg/README b/Util/hpgmg/README new file mode 100644 index 00000000..a26627f8 --- /dev/null +++ b/Util/hpgmg/README @@ -0,0 +1,2 @@ +From https://bitbucket.org/friesen/hpgmg.git +branch make_hpgmg_lib diff --git a/Util/hpgmg/finite-volume/README b/Util/hpgmg/finite-volume/README new file mode 100644 index 00000000..7b365530 --- /dev/null +++ b/Util/hpgmg/finite-volume/README @@ -0,0 +1,31 @@ +*** Copyright Notice *** + +HPGMG, Copyright (c) 2014, The Regents of the University of +California, through Lawrence Berkeley National Laboratory (subject to +receipt of any required approvals from the U.S. Dept. of Energy). All +rights reserved. + +If you have questions about your rights to use or distribute this +software, please contact Berkeley Lab's Technology Transfer Department +at TTD@lbl.gov. + +NOTICE. This software is owned by the U.S. Department of Energy. As +such, the U.S. Government has been granted for itself and others +acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide +license in the Software to reproduce, prepare derivative works, and +perform publicly and display publicly. 
Beginning five (5) years after +the date permission to assert copyright is obtained from the U.S. +Department of Energy, and subject to any subsequent five (5) year +renewals, the U.S. Government is granted for itself and others acting +on its behalf a paid-up, nonexclusive, irrevocable, worldwide license +in the Software to reproduce, prepare derivative works, distribute +copies to the public, perform publicly and display publicly, and to +permit others to do so. +**************************** + +This directory contains the current HPGMG finite-volume benchmark. + +Please see ./source/README for details on how to compiler, run, +optimize, and examine the output of the hpgmg finite-volume benchmark. + +Example job scripts are in the ./example_jobs directory diff --git a/Util/hpgmg/finite-volume/local.mk b/Util/hpgmg/finite-volume/local.mk new file mode 100644 index 00000000..67f20dc0 --- /dev/null +++ b/Util/hpgmg/finite-volume/local.mk @@ -0,0 +1 @@ +include $(call incsubdirs,source) diff --git a/Util/hpgmg/finite-volume/source/Make.package b/Util/hpgmg/finite-volume/source/Make.package new file mode 100644 index 00000000..d29d499f --- /dev/null +++ b/Util/hpgmg/finite-volume/source/Make.package @@ -0,0 +1,5 @@ +cEXE_sources += timers.c +cEXE_sources += level.c +cEXE_sources += operators.7pt.c +cEXE_sources += mg_hpgmg.c +cEXE_sources += solvers.c diff --git a/Util/hpgmg/finite-volume/source/Makefile b/Util/hpgmg/finite-volume/source/Makefile new file mode 100644 index 00000000..260a5556 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/Makefile @@ -0,0 +1,48 @@ +#CC = cc +#CFLAGS = -g -O2 -I. -openmp -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1 -DUSE_MPI=1 -DUNLIMIT_FMG_VCYCLES=1 +#LFLAGS = -g -O2 -openmp + +#CC = cc +#CFLAGS = -g -O2 -I. -fopenmp -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1 -DUSE_CHEBY=1 -DUSE_MPI=1 -DUSE_FCYCLES=1 -DUNLIMIT_FMG_VCYCLES=1 +#LFLAGS = -g -O2 -fopenmp + +CC = cc +CFLAGS = -g -O0 -I. 
-fopenmp -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1 -DUSE_MPI=1 -DUNLIMIT_FMG_VCYCLES=1 +LFLAGS = -g -O0 -fopenmp + +#CC = cc +#CFLAGS = -g -O0 -I. -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1 -DUSE_MPI=1 -DUNLIMIT_FMG_VCYCLES=1 +#LFLAGS = -g -O0 + +#CC = cc +#CFLAGS = -g -O2 -I. -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1 -DUSE_MPI=1 +#LFLAGS = -g -O2 + +#CC = icc +#CFLAGS = -g -O2 -I. -DUSE_BICGSTAB=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1 +#LFLAGS = -g -O2 + +#CC = mpiicc +#CFLAGS = -g -O2 -ip -xHost -I. -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1 -DUSE_MPI=1 +#LFLAGS = -g -O2 -ip -xHost + +#CC = mpiicc +#CFLAGS = -g -O0 -I. -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1 -DUSE_MPI=1 -DUSE_PERIODIC_BC=1 -DUNLIMIT_FMG_VCYCLES=1 +#LFLAGS = -g -O0 + +OBJ = timers.o level.o operators.7pt.o mg.o solvers.o hpgmg_setup.o +MAIN = call_hpgmg_setup.o + +%.o: %.c $(DEPS) + $(CC) -c $(CFLAGS) -o $@ $< + +libhpgmg_test.a: $(OBJ) + ar rcs $@ $^ + +clean: + $(RM) *.o *.a + +lib: libhpgmg_test.a + +all: $(OBJ) $(MAIN) + $(CC) -o hpgmg $^ $(LFLAGS) diff --git a/Util/hpgmg/finite-volume/source/README b/Util/hpgmg/finite-volume/source/README new file mode 100644 index 00000000..405f88a7 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/README @@ -0,0 +1,250 @@ +*** Copyright Notice *** + +HPGMG, Copyright (c) 2014, The Regents of the University of +California, through Lawrence Berkeley National Laboratory (subject to +receipt of any required approvals from the U.S. Dept. of Energy). All +rights reserved. + +If you have questions about your rights to use or distribute this +software, please contact Berkeley Lab's Technology Transfer Department +at TTD@lbl.gov. + +NOTICE. This software is owned by the U.S. Department of Energy. As +such, the U.S. 
Government has been granted for itself and others +acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide +license in the Software to reproduce, prepare derivative works, and +perform publicly and display publicly. Beginning five (5) years after +the date permission to assert copyright is obtained from the U.S. +Department of Energy, and subject to any subsequent five (5) year +renewals, the U.S. Government is granted for itself and others acting +on its behalf a paid-up, nonexclusive, irrevocable, worldwide license +in the Software to reproduce, prepare derivative works, distribute +copies to the public, perform publicly and display publicly, and to +permit others to do so. +**************************** + + +About +===== +HPGMG is a compact benchmark designed to proxy the geometric MG solves found in +applications built from AMR MG frameworks like CHOMBO or BoxLib. At a high +level, the benchmark solves Au=f where u and f are cell-centered (finite volume) +3D structured grids. The operator A is a fourth order finite volume discretization +of the helmholtz operator (a*alpha[]*u[] - b* div beta[] grad u[]), where a and +b are scalar constants and alpha[] and beta[] are spatially varying coefficients. +HPGMG supports both periodic and homogeneous dirichlet boundary conditions. +The benchmark generates a u_exact[] for a large cubical 3D grid partitioned into +subdomains (boxes) which are distributed across the supercomputer. It then +manually differentiates u[] to form f[], then uses a multigrid solver to +calculate a u[]. It may then use u_exact[] to test correctness and order. +By default, HPGMG solves a poisson equation (a==0) with homogeneous dirichlet +boundary conditions. + +The basic relaxation operator is Gauss-Seidel Red-Black (GSRB) which is applied +twice (red/black/red/black) at every level up and down the v-cycle. HPGMG +also includes Jacobi, L1Jacobi, Symmetric Gauss-Seidel, and Chebyshev Polynomial +smoothers. 
HPGMG implements both a truncated v-cycle (u-cycle) in which boxes +are restricted locally and a true distributed v-cycle in which restriction and +interpolation (prolongation) become true distributed operations. The latter +allows the global problem to be restricted to as little as one cell. At the +bottom of the v-cycle, HPGMG switches to one of a few bottom solvers (BiCGStab +is often used in codes like BoxLib). Once a sufficiently accurate solution +is obtained on this coarse grid, it is interpolated back up the v-cycle. + +HPGMG also includes the option to use Full Multigrid (FMG) in which one executes +an f-cycle that should hit the discretization error in one pass (instead of 10 +v-cycles). This can provide a substantial performance boost, but presents a +number of performance optimization challenges. + + +Compilation +=========== +Although no make file currently exists, compilation is straightforward. + +There are a few basic arguments that should be used. Most are self-explanatory. + +-DUSE_MPI // compiles the distributed (MPI) version +-DUSE_CG // use CG as a bottom (coarse grid) solver +-DUSE_BICGSTAB // use BiCGStab as a bottom (coarse grid) solver +-DUSE_CABICGSTAB // use CABiCGStab as a bottom (coarse grid) solver (makes more sense with U-Cycles) +-DUSE_SUBCOMM // build a subcommunicator for each level in the MG v-cycle to minimize the scope of MPI_AllReduce() + +-DUSE_FCYCLES // use the Full Multigrid (FMG) solver... HPGMG benchmark should include this option + // note, the choice of FMG is orthogonal from U-Cycles and V-Cycles +-DUSE_VCYCLES // use true distributed V-Cycles in the multigrid solver... this is the suggested option +-DUSE_UCYCLES // use truncated V-Cycles (U-Cycles) in the multigrid solver... 
a legacy option for understanding the performance implications + +-DUSE_CHEBY // use a Chebyshev Polynomial smoother (degree is specified with CHEBYSHEV_DEGREE) +-DUSE_GSRB // use the GSRB smoother (the number of pre/posts smooths is specified by NUM_SMOOTHS) +-DUSE_JACOBI // use a weighted Jacobi smoother with a weight of 2/3 +-DUSE_L1JACOBI // use a L1 Jacobi smoother (each row's weight is the L1 norm of that row) + +-DBLOCKCOPY_TILE_I=### // parallelism for all ghost zone, restriction, interpolation, and (now) operators (and eventually BC's) is organized the (cache/thread) block concept. +-DBLOCKCOPY_TILE_J=### // That is, boxes are decomposed into tiles of size BLOCKCOPY_TILE_I x BLOCKCOPY_TILE_J x BLOCKCOPY_TILE_J. Users may tune to find the optimal block size. +-DBLOCKCOPY_TILE_K=### // Smaller blocks fit in cache and express more TLP (good for MIC/BGQ/GPUs/...). However, the unit stride for small blocks is reduced (bad for CPUs which rely on prefetchers) + // If these are ommited, the code relies on its defaults. + +-DBOX_ALIGN_JSTRIDE=### // Data allocation is now performed on a level-by-level basis (rather than block-by-block). +-DBOX_ALIGN_KSTRIDE=### // In order to guarantee SIMD alignment, you can pad the unit-stride to a nice round number (e.g. 2, 4, or 8) so that j+/-1 is SIMD-aligned. +-DBOX_ALIGN_VOLUME=### // Similarly, you can pad the kStride (or volume) so that k+/-1 (or vector+/-1) is SIMD-aligned + // If these are ommited, the code relies on its defaults. + +-DMAX_COARSE_DIM=### // provides a means of constraining the maximum coarse dimension. By default, the maximum is 11 (i.e. maximum coarse grid is 11^3) + + +Let us consider an example for Edison, the Cray XC30 at NERSC where the MPI compiler uses icc and is invoked as 'cc'. +cc -Ofast -xAVX -fopenmp level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI -DUSE_SUBCOMM -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB -o run.edison +Conversely, true flat MPI should omit the -fopenmp flag. 
+cc -Ofast -xAVX level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI -DUSE_SUBCOMM -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB -o run.edison.flat + + +On Mira (an IBM BlueGene/Q), one can use the following... +soft add +mpiwrapper-xl +mpixlc_r -O5 -qsmp=omp:noauto level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB -o run.bgq +-or- +mpixlc_r -O5 -qsmp=omp:noauto level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB -o run.bgq +In order to compile with IBM's HPM counters... +mpixlc_r -O5 -qsmp=omp:noauto level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB -o run.bgq \ + -DUSE_HPM -I/bgsys/drivers/ppcfloor/bgpm/include -L/soft/perftools/hpctw/lib -L/bgsys/drivers/ppcfloor/bgpm/lib /bgsys/drivers/ppcfloor/bgpm/lib/libbgpm.a -lbgpm -lmpihpm_smp -lmpitrace + +On Babbage (Xeon Phi cluster at NERSC), one can use the following for native mode compilation... +mpiicc -mmic -Ofast -fopenmp level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI -DUSE_SUBCOMM -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB -o run.babbage +-or- +mpiicc -mmic -Ofast -fopenmp level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI -DUSE_SUBCOMM -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB -o run.babbage + + + +4th order HPGMG-FV +================== +Included in this release is the 4th order Finite Volume Full Multigrid Implementation. Unlike a 2nd order implementation, the 4th order version requires a different operator file (operators.fv4.c instead of operators.7pt.c). Currently, it is highly recommended one use GSRB over the Chebyshev or Jacobi smoothers. By default, most smoothers make 6 passes through the data (instead of 4) in order to provide sufficient error and residual. This has the effect of increasing the time spent in smoothing by 50%. 
+ +Nominally, compared to the 2nd order method, the 4th order smoother performs ... +- 4x the flops +- 4x the MPI messages +- 2x the MPI data movement +- 1x the DRAM data movement +- provides 4 more bits of accuracy for every 8x increase in the problem size (instead of 2 bits) + +In order to compile the older 2nd order version on Edison, one may use the following command line... +cc -Ofast -xAVX -fopenmp level.c operators.fv2.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI -DUSE_SUBCOMM -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB -o run.edison + + + +Running the benchmark +===================== +The benchmark exploits OpenMP and/or MPI for parallelism. +You must thus set OMP_NUM_THREADS correctly. For a machine like Edison at NERSC, this is simply +% export OMP_NUM_THREADS=12 +Moreover, on multisocket architectures or when using MPI, you must set affinity correctly. + +The benchmark takes 2 arguments. +./run.hpgmg [log2BoxSize] [Target # of boxes per process] +- log2BoxSize is the log base 2 of the dimension of each box on the finest grid (e.g. 6 is a good proxy for real applications) +- the target number of boxes per process is a loose bound on memory per process +Given these constraints, the benchmark will then calculate the largest cubical domain it can run. + +The code supports nested OpenMP parallelism which can be enabled by setting +OMP_NESTED=true. At each multigrid level, the code will try and determine +the best balance between coarse and fine-grained parallelism. + +On edison, (the Cray XC30 at nersc), one uses aprun to invoke mpi jobs. A job script may include the following... +#PBS -l mppwidth=96 +export OMP_NUM_THREADS=12 +aprun -n 8 -d 12 -N 2 -S 1 -ss -cc numa_node ./run.hpgmg 6 8 +This will launch 8 mpi processes (-n 8) of 12 threads (-d 12 == OMP_NUM_THREADS) +with 2 processes per node (-N 2), 1 process per NUMA node (-S 1) with the +appropriate NUMA controls (-ss -cc numa_node). 
Moreover, the dimension of each +box is 2^6 on a side (64^3) and there is a target of 8 boxes per process on the +finest grid. The resultant problem is thus 256^3 on the finest grid. + +On Mira (the IBM Blue Gene/Q at Argonne), one may use qsub to directly queue +the benchmark. For example... +qsub -t 00:10:00 -n 64 --proccount 64 --mode c1 -A [ALLOCATION] --env BG_SHAREDMEMSIZE=32MB:PAMID_VERBOSE=1:BG_COREDUMPDISABLED=1:BG_SMP_FAST_WAKEUP=YES:BG_THREADLAYOUT=1:OMP_PROC_BIND=TRUE:OMP_NUM_THREADS=64:OMP_WAIT_POLICY=active:OMP_NESTED=true ./run.bgq 6 8 +Will run the benchmark on 64 processes spread over 64 nodes with 1 process per +node (c1) and 64 threads per process. Each process is allocated 8 64^3 boxes. +At one point, the additional environment variables listed were found to +accelerate performance. + +On Babbage (the Xeon Phi Cluster at NERSC) +mpirun.mic -n 8 -hostfile micfile.$PBS_JOBID -ppn 1 -env OMP_NUM_THREADS 120 -env KMP_AFFINITY balanced ./run.babbage 7 1 + + +Understanding the Results +========================= +During execution, the benchmark will output some debug information for understanding convergence and performance. +The following is an example and examines a key subset of this information. ++ aprun -n 512 -d 12 -N 2 -S 1 -ss -cc numa_node ./run.hpgmg.edison 7 8 + +Requested MPI_THREAD_FUNNELED, got MPI_THREAD_FUNNELED +512 MPI Tasks of 12 threads +truncating the v-cycle at 2^3 subdomains +creating domain... done + 128 x 128 x 128 (per subdomain) + 256 x 256 x 256 (per process) + 2048 x 2048 x 2048 (overall) + 1-deep ghost zones + allocated 1865 MB + +This initial output details how MPI and OpenMP were initialized. +Moreover, it notes how deep the v-cycle is (down to 2^3 boxes) +It then shows the progress as it creates the structured grids noting their respective sizes and the total memory explicitly allocated with malloc(). +Thus, the 2K^3 overall problem represents 8 billion degrees of freedom. + + +MGSolve... 
+v-cycle= 1, norm=0.00002091903646017090 (2.091904e-05) +v-cycle= 2, norm=0.00000079708396334668 (7.970840e-07) +v-cycle= 3, norm=0.00000007951502395414 (7.951502e-08) +v-cycle= 4, norm=0.00000000581619537788 (5.816195e-09) +v-cycle= 5, norm=0.00000000048970464287 (4.897046e-10) +v-cycle= 6, norm=0.00000000003900568126 (3.900568e-11) +v-cycle= 7, norm=0.00000000000318039461 (3.180395e-12) +v-cycle= 8, norm=0.00000000000025703104 (2.570310e-13) +v-cycle= 9, norm=0.00000000000002088201 (2.088201e-14) +v-cycle=10, norm=0.00000000000000170463 (1.704634e-15) +v-cycle=11, norm=0.00000000000000014284 (1.428395e-16) +done + +As the multigrid solver progresses, the max (inf) norm of the residual is reported after each v-cycle. +One expects to reduce the norm by one digit on each v-cycle. +Thus to attain a norm less than 1e-15, we required 11 v-cycles. + + + 0 1 2 3 4 5 6 + 128^3 64^3 32^3 16^3 8^3 4^3 2^3 total +smooth 2.244879 0.288221 0.020186 0.003279 0.000672 0.000267 0.000000 2.557504 +residual 0.569046 0.035340 0.001833 0.000328 0.000077 0.000036 0.000030 0.606691 +restriction 0.041538 0.003994 0.000310 0.000072 0.000032 0.000028 0.000000 0.045975 +interpolation 0.076533 0.006586 0.000567 0.000105 0.000038 0.000032 0.000000 0.083860 +applyOp 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.001715 0.001715 +BLAS1 0.157396 0.004949 0.000776 0.000184 0.000055 0.000027 0.014614 0.178002 +BLAS3 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 +communication 0.314615 0.069810 0.024858 0.017584 0.009740 0.005763 0.318338 0.760707 + local exchange 0.047781 0.008262 0.001819 0.000730 0.000368 0.000233 0.001743 0.060936 + pack MPI buffers 0.047688 0.007722 0.001089 0.000569 0.000294 0.000215 0.001630 0.059207 + unpack MPI buffers 0.022835 0.004058 0.001226 0.000530 0.000349 0.000231 0.001712 0.030940 + MPI_Isend 0.002422 0.002161 0.000856 0.000659 0.000779 0.000374 0.002755 0.010005 + MPI_Irecv 0.000456 0.000402 0.000152 0.000205 0.000119 0.000079 
0.000677 0.002089 + MPI_Waitall 0.169658 0.047091 0.019666 0.014850 0.007801 0.004603 0.022721 0.286390 + MPI_collectives 0.023637 0.000000 0.000000 0.000000 0.000000 0.000000 0.286850 0.310487 +-------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Total by level 3.386964 0.404774 0.047960 0.021436 0.010595 0.006159 0.334945 4.212834 + + Total time in MGBuild 0.081082 + Total time in MGSolve 4.235200 + " v-cycles 4.213100 + number of v-cycles 11 +Bottom solver iterations 397 + +Finally, we see a timing report. Vertically are the key operations within the v-cycle (communication is further broken down into its constituient operations). Horizontally is a breakdown of time (in seconds) by level in the v-cycle. Thus, one can see the difference in time spent in each operation at each level. These times are totaled by level and by function. Finally, the total time required to build the solver (note geometric multigrid solves can be built extremely quickly), the time spent in the solver, the number of v-cycles, and the total number of bottom solver (e.g. BiCGStab) iterations summed across all v-cycles is reported. + +We thus observe that this 8 billion DOF problem was solved in 4.23 seconds on 512 processes (6144 cores). It required 397 BiCGStab iterations on the coarse grid spread over 11 vcycles (approx 36 terations per v-cycle). + +As the time spent smoothing the fine grid was non-trivial (2.244 seconds), one might be motivated to analyze it. +Each box has (128+2)^3 cells including ghost zones. +There are 8 boxes (2 2 2) per process +Each call to smooth moves 64 bytes of data per cell per stencil sweep. +There are 4 calls to smooth and 2 stencil sweeps per smooth in the v-cycle. +There are 11 v-cycles. +Thus, smooth requires one move *at least* 8 * 130^3 * 64 * 8 * 11 bytes of data = 98.99e9 bytes. +Moving this data in 2.244 seconds suggests each process attained an average DRAM bandwidth of 44 GB/s. 
This is quite good given this was vanilla OpenMP code without optimization and one could never hope for better than 54GB/s on this machine. diff --git a/Util/hpgmg/finite-volume/source/TODO b/Util/hpgmg/finite-volume/source/TODO new file mode 100644 index 00000000..589d1185 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/TODO @@ -0,0 +1,6 @@ +- cubical problem size -> rectahedral problem size ... init problem, restriction rules, etc... +- rectahedral problem size -> arbitrary problem shape... +- more efficient ghost zone exchange (box intersection algebra) when communicating edges and corners +- overlap BC with exchange +- add a VECTOR_INTERNAL +- iterate on F's (faster for 2nd order, slower for 4th order)? diff --git a/Util/hpgmg/finite-volume/source/call_hpgmg_setup.c b/Util/hpgmg/finite-volume/source/call_hpgmg_setup.c new file mode 100644 index 00000000..06d9852e --- /dev/null +++ b/Util/hpgmg/finite-volume/source/call_hpgmg_setup.c @@ -0,0 +1,60 @@ +#ifdef USE_MPI +#include +#endif + +void hpgmg_setup (const int log2_box_dim, + const int target_boxes_per_rank, + const int OMP_Threads, + const int OMP_Nested, + const int requested_threading_model, + const int actual_threading_model); + +int +main (int argc, char *argv[]) +{ + + const int log2_box_dim = 6; + const int target_boxes_per_rank = 1; + + int OMP_Threads = 1; + int OMP_Nested = 0; + +#ifdef _OPENMP +#pragma omp parallel + { +#pragma omp master + { + OMP_Threads = omp_get_num_threads (); + OMP_Nested = omp_get_nested (); + } + } +#endif + +#ifdef USE_MPI + int actual_threading_model = -1; + int requested_threading_model = -1; + requested_threading_model = MPI_THREAD_SINGLE; + //requested_threading_model = MPI_THREAD_FUNNELED; + //requested_threading_model = MPI_THREAD_SERIALIZED; + //requested_threading_model = MPI_THREAD_MULTIPLE; + //MPI_Init(&argc, &argv); +#ifdef _OPENMP + requested_threading_model = MPI_THREAD_FUNNELED; + //requested_threading_model = MPI_THREAD_SERIALIZED; + 
//requested_threading_model = MPI_THREAD_MULTIPLE; + //MPI_Init_thread(&argc, &argv, requested_threading_model, &actual_threading_model); +#endif + MPI_Init_thread (&argc, &argv, requested_threading_model, + &actual_threading_model); +#ifdef USE_HPM // IBM HPM counters for BGQ... + HPM_Init (); +#endif +#endif + + hpgmg_setup (log2_box_dim, + target_boxes_per_rank, + OMP_Threads, + OMP_Nested, requested_threading_model, actual_threading_model); + + return 0; +} diff --git a/Util/hpgmg/finite-volume/source/compile b/Util/hpgmg/finite-volume/source/compile new file mode 100644 index 00000000..ce96c1df --- /dev/null +++ b/Util/hpgmg/finite-volume/source/compile @@ -0,0 +1,16 @@ + + +#======================================================================================================================= +# mira +#======================================================================================================================= +soft add +mpiwrapper-xl +qsub -t 00:10:00 -n 64 --proccount 64 --mode c1 -A PEACEndStation --env BG_SHAREDMEMSIZE=32MB:PAMID_VERBOSE=1:BG_COREDUMPDISABLED=1:BG_SMP_FAST_WAKEUP=YES:BG_THREADLAYOUT=2:OMP_PROC_BIND=TRUE:OMP_NUM_THREADS=64:OMP_WAIT_POLICY=active ./run.bgq 7 1 +qsub -t 00:10:00 -n 64 --proccount 64 --mode c1 -A PEACEndStation --env BG_SHAREDMEMSIZE=32MB:PAMID_VERBOSE=1:BG_COREDUMPDISABLED=1:BG_SMP_FAST_WAKEUP=YES:BG_THREADLAYOUT=2:OMP_PROC_BIND=TRUE:OMP_NUM_THREADS=64:OMP_WAIT_POLICY=active:OMP_NESTED=true ./run.bgq 6 8 + + +mpixlc_r -O5 -qsmp=omp:noauto level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB -DBLOCKCOPY_TILE_K=1 -DBLOCKCOPY_TILE_J=32 -o run.bgq.1x32 -DUSE_HPM -L/soft/perftools/hpctw/lib -L/soft/perftools/bgpm/lib -lmpihpm_smp -lbgpm + + +mpirun.mic -n 8 -ppn 8 -hostfile micfile.$PBS_JOBID -env OMP_NUM_THREADS=30 -env KMP_AFFINITY=compact -env I_MPI_FABRICS=shm -env I_MPI_PIN_DOMAIN=30 ./run.babbage.baseline 7 1 +mpirun.mic -n 8 -ppn 8 -hostfile 
micfile.$PBS_JOBID -env OMP_NUM_THREADS=30 -env KMP_AFFINITY=compact -env I_MPI_FABRICS=shm:ofa -env I_MPI_PIN_DOMAIN=30 ./run.babbage.baseline 7 1 +mpirun.mic -n 8 -ppn 8 -hostfile micfile.$PBS_JOBID -env OMP_NUM_THREADS=30 -env KMP_AFFINITY=compact -env I_MPI_FABRICS=shm:dapl -env I_MPI_PIN_DOMAIN=30 ./run.babbage.baseline 7 1 diff --git a/Util/hpgmg/finite-volume/source/defines.h b/Util/hpgmg/finite-volume/source/defines.h new file mode 100644 index 00000000..0d283514 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/defines.h @@ -0,0 +1,27 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +// Lu = a*alpha[]*u[] - b*divergence( beta[]*gradient(u[]) ) +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef DEFINES_H +#define DEFINES_H +//------------------------------------------------------------------------------------------------------------------------------ +#define VECTOR_TEMP 0 // +#define VECTOR_E 1 // error used in residual correction FMG +#define VECTOR_F_MINUS_AV 2 // cell centered residual (f-Av) +//------------------------------------------------------------------------------------------------------------------------------ +#define VECTOR_F 3 // original right-hand side (Au=f), cell centered +#define VECTOR_U 4 // numerical solution +#define VECTOR_ALPHA 5 // cell centered coefficient +#define VECTOR_BETA_I 6 // face centered coefficient (n.b. element 0 is the left face of the ghost zone element) +#define VECTOR_BETA_J 7 // face centered coefficient (n.b. element 0 is the back face of the ghost zone element) +#define VECTOR_BETA_K 8 // face centered coefficient (n.b. 
element 0 is the bottom face of the ghost zone element) +//------------------------------------------------------------------------------------------------------------------ +#define VECTOR_DINV 9 // cell centered relaxation parameter (e.g. inverse of the diagonal) +#define VECTOR_L1INV 10 // cell centered relaxation parameter (e.g. inverse of the L1 norm of each row) +//------------------------------------------------------------------------------------------------------------------ +#define VECTORS_RESERVED 11 // total number of vectors and the starting location for any auxillary bottom solver vectors +//------------------------------------------------------------------------------------------------------------------------------ +#endif diff --git a/Util/hpgmg/finite-volume/source/hpgmg-fv.c b/Util/hpgmg/finite-volume/source/hpgmg-fv.c new file mode 100644 index 00000000..ff53f471 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/hpgmg-fv.c @@ -0,0 +1,382 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Copyright Notice +//------------------------------------------------------------------------------------------------------------------------------ +// HPGMG, Copyright (c) 2014, The Regents of the University of +// California, through Lawrence Berkeley National Laboratory (subject to +// receipt of any required approvals from the U.S. Dept. of Energy). All +// rights reserved. +// +// If you have questions about your rights to use or distribute this +// software, please contact Berkeley Lab's Technology Transfer Department +// at TTD@lbl.gov. +// +// NOTICE. This software is owned by the U.S. Department of Energy. As +// such, the U.S. Government has been granted for itself and others +// acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide +// license in the Software to reproduce, prepare derivative works, and +// perform publicly and display publicly. 
Beginning five (5) years after +// the date permission to assert copyright is obtained from the U.S. +// Department of Energy, and subject to any subsequent five (5) year +// renewals, the U.S. Government is granted for itself and others acting +// on its behalf a paid-up, nonexclusive, irrevocable, worldwide license +// in the Software to reproduce, prepare derivative works, distribute +// copies to the public, perform publicly and display publicly, and to +// permit others to do so. +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef USE_MPI +#include +#endif +#ifdef _OPENMP +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "timers.h" +#include "defines.h" +#include "level.h" +#include "mg.h" +#include "operators.h" +#include "solvers.h" +//------------------------------------------------------------------------------------------------------------------------------ +void bench_hpgmg(mg_type *all_grids, int onLevel, double a, double b, double dtol, double rtol){ + int doTiming; + int minSolves = 10; // do at least minSolves MGSolves + double timePerSolve = 0; + + for(doTiming=0;doTiming<=1;doTiming++){ // first pass warms up, second pass times + + #ifdef USE_HPM // IBM performance counters for BGQ... 
+ if( (doTiming==1) && (onLevel==0) )HPM_Start("FMGSolve()"); + #endif + + #ifdef USE_MPI + double minTime = 60.0; // minimum time in seconds that the benchmark should run + double startTime = MPI_Wtime(); + if(doTiming==1){ + if((minTime/timePerSolve)>minSolves)minSolves=(minTime/timePerSolve); // if one needs to do more than minSolves to run for minTime, change minSolves + } + #endif + + if(all_grids->levels[onLevel]->my_rank==0){ + if(doTiming==0){fprintf(stdout,"\n\n===== Warming up by running %d solves ==========================================\n",minSolves);} + else{fprintf(stdout,"\n\n===== Running %d solves ========================================================\n",minSolves);} + fflush(stdout); + } + + int numSolves = 0; // solves completed + MGResetTimers(all_grids); + while( (numSolveslevels[onLevel],VECTOR_U); + #ifdef USE_FCYCLES + FMGSolve(all_grids,onLevel,VECTOR_U,VECTOR_F,a,b,dtol,rtol); + #else + MGSolve(all_grids,onLevel,VECTOR_U,VECTOR_F,a,b,dtol,rtol); + #endif + numSolves++; + } + + #ifdef USE_MPI + if(doTiming==0){ + double endTime = MPI_Wtime(); + timePerSolve = (endTime-startTime)/numSolves; + MPI_Bcast(&timePerSolve,1,MPI_DOUBLE,0,MPI_COMM_WORLD); // after warmup, process 0 broadcasts the average time per solve (consensus) + } + #endif + + #ifdef USE_HPM // IBM performance counters for BGQ... 
+ if( (doTiming==1) && (onLevel==0) )HPM_Stop("FMGSolve()"); + #endif + } +} + + +//------------------------------------------------------------------------------------------------------------------------------ +int main(int argc, char **argv){ + int my_rank=0; + int num_tasks=1; + int OMP_Threads = 1; + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef _OPENMP + #pragma omp parallel + { + #pragma omp master + { + OMP_Threads = omp_get_num_threads(); + } + } + #endif + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // initialize MPI and HPM + #ifdef USE_MPI + int actual_threading_model = -1; + int requested_threading_model = -1; + requested_threading_model = MPI_THREAD_SINGLE; + //requested_threading_model = MPI_THREAD_FUNNELED; + //requested_threading_model = MPI_THREAD_SERIALIZED; + //requested_threading_model = MPI_THREAD_MULTIPLE; + #ifdef _OPENMP + requested_threading_model = MPI_THREAD_FUNNELED; + //requested_threading_model = MPI_THREAD_SERIALIZED; + //requested_threading_model = MPI_THREAD_MULTIPLE; + #endif + MPI_Init_thread(&argc, &argv, requested_threading_model, &actual_threading_model); + MPI_Comm_size(MPI_COMM_WORLD, &num_tasks); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + #ifdef USE_HPM // IBM HPM counters for BGQ... + HPM_Init(); + #endif + #endif // USE_MPI + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // parse the arguments... 
+ int log2_box_dim = 6; // 64^3 + int target_boxes_per_rank = 1; +//int64_t target_memory_per_rank = -1; // not specified + int64_t box_dim = -1; + int64_t boxes_in_i = -1; + int64_t target_boxes = -1; + + if(argc==3){ + log2_box_dim=atoi(argv[1]); + target_boxes_per_rank=atoi(argv[2]); + + if(log2_box_dim>9){ + // NOTE, in order to use 32b int's for array indexing, box volumes must be less than 2^31 doubles + if(my_rank==0){fprintf(stderr,"log2_box_dim must be less than 10\n");} + #ifdef USE_MPI + MPI_Finalize(); + #endif + exit(0); + } + + if(log2_box_dim<4){ + if(my_rank==0){fprintf(stderr,"log2_box_dim must be at least 4\n");} + #ifdef USE_MPI + MPI_Finalize(); + #endif + exit(0); + } + + if(target_boxes_per_rank<1){ + if(my_rank==0){fprintf(stderr,"target_boxes_per_rank must be at least 1\n");} + #ifdef USE_MPI + MPI_Finalize(); + #endif + exit(0); + } + + #ifndef MAX_COARSE_DIM + #define MAX_COARSE_DIM 11 + #endif + box_dim=1<0)restriction(MG_h.levels[l],VECTOR_F,MG_h.levels[l-1],VECTOR_F,RESTRICT_CELL); + bench_hpgmg(&MG_h,l,a,b,dtol,rtol); + AverageSolveTime[l] = (double)MG_h.timers.MGSolve / (double)MG_h.MGSolves_performed; + if(my_rank==0){fprintf(stdout,"\n\n===== Timing Breakdown =========================================================\n");} + MGPrintTiming(&MG_h,l); + } + + if(my_rank==0){ + #ifdef CALIBRATE_TIMER + double _timeStart=getTime();sleep(1);double _timeEnd=getTime(); + double SecondsPerCycle = (double)1.0/(double)(_timeEnd-_timeStart); + #else + double SecondsPerCycle = 1.0; + #endif + fprintf(stdout,"\n\n===== Performance Summary ======================================================\n"); + for(l=0;l<3;l++){ + double DOF = (double)MG_h.levels[l]->dim.i*(double)MG_h.levels[l]->dim.j*(double)MG_h.levels[l]->dim.k; + double seconds = SecondsPerCycle*(double)AverageSolveTime[l]; + double DOFs = DOF / seconds; + fprintf(stdout," h=%0.15e DOF=%0.15e time=%0.6f DOF/s=%0.3e MPI=%d 
OMP=%d\n",MG_h.levels[l]->h,DOF,seconds,DOFs,num_tasks,OMP_Threads); + } + } + #endif + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(my_rank==0){fprintf(stdout,"\n\n===== Richardson error analysis ================================================\n");} + // solve A^h u^h = f^h + // solve A^2h u^2h = f^2h + // solve A^4h u^4h = f^4h + // error analysis... + MGResetTimers(&MG_h); + for(l=0;l<3;l++){ + if(l>0)restriction(MG_h.levels[l],VECTOR_F,MG_h.levels[l-1],VECTOR_F,RESTRICT_CELL); + zero_vector(MG_h.levels[l],VECTOR_U); + #ifdef USE_FCYCLES + FMGSolve(&MG_h,l,VECTOR_U,VECTOR_F,a,b,dtol,rtol); + #else + MGSolve(&MG_h,l,VECTOR_U,VECTOR_F,a,b,dtol,rtol); + #endif + } + richardson_error(&MG_h,0,VECTOR_U); + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(my_rank==0){fprintf(stdout,"\n\n===== Deallocating memory ======================================================\n");} + MGDestroy(&MG_h); + destroy_level(&level_h); + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(my_rank==0){fprintf(stdout,"\n\n===== Done =====================================================================\n");} + + #ifdef USE_MPI + #ifdef USE_HPM // IBM performance counters for BGQ... 
+ HPM_Print(); + #endif + MPI_Finalize(); + #endif + return(0); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +} diff --git a/Util/hpgmg/finite-volume/source/hpgmg_setup.c b/Util/hpgmg/finite-volume/source/hpgmg_setup.c new file mode 100644 index 00000000..f5882cf3 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/hpgmg_setup.c @@ -0,0 +1,235 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Copyright Notice +//------------------------------------------------------------------------------------------------------------------------------ +// HPGMG, Copyright (c) 2014, The Regents of the University of +// California, through Lawrence Berkeley National Laboratory (subject to +// receipt of any required approvals from the U.S. Dept. of Energy). All +// rights reserved. +// +// If you have questions about your rights to use or distribute this +// software, please contact Berkeley Lab's Technology Transfer Department +// at TTD@lbl.gov. +// +// NOTICE. This software is owned by the U.S. Department of Energy. As +// such, the U.S. Government has been granted for itself and others +// acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide +// license in the Software to reproduce, prepare derivative works, and +// perform publicly and display publicly. Beginning five (5) years after +// the date permission to assert copyright is obtained from the U.S. +// Department of Energy, and subject to any subsequent five (5) year +// renewals, the U.S. Government is granted for itself and others acting +// on its behalf a paid-up, nonexclusive, irrevocable, worldwide license +// in the Software to reproduce, prepare derivative works, distribute +// copies to the public, perform publicly and display publicly, and to +// permit others to do so. 
+//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef USE_MPI +#include +#endif +#ifdef _OPENMP +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "defines.h" +#include "level.h" +#include "mg.h" +#include "operators.h" +#include "solvers.h" +//------------------------------------------------------------------------------------------------------------------------------ +void hpgmg_setup(const int log2_box_dim, + const int target_boxes_per_rank, + const int OMP_Threads, + const int OMP_Nested, + const int requested_threading_model, + const int actual_threading_model) { + int my_rank=0; + int num_tasks=1; + + #ifdef USE_MPI + MPI_Comm_size(MPI_COMM_WORLD, &num_tasks); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); +//if(actual_threading_model>requested_threading_model)actual_threading_model=requested_threading_model; + if(my_rank==0){ + if(requested_threading_model == MPI_THREAD_MULTIPLE )printf("Requested MPI_THREAD_MULTIPLE, "); + else if(requested_threading_model == MPI_THREAD_SINGLE )printf("Requested MPI_THREAD_SINGLE, "); + else if(requested_threading_model == MPI_THREAD_FUNNELED )printf("Requested MPI_THREAD_FUNNELED, "); + else if(requested_threading_model == MPI_THREAD_SERIALIZED)printf("Requested MPI_THREAD_SERIALIZED, "); + else if(requested_threading_model == MPI_THREAD_MULTIPLE )printf("Requested MPI_THREAD_MULTIPLE, "); + else printf("Requested Unknown MPI Threading Model (%d), 
",requested_threading_model); + if(actual_threading_model == MPI_THREAD_MULTIPLE )printf("got MPI_THREAD_MULTIPLE\n"); + else if(actual_threading_model == MPI_THREAD_SINGLE )printf("got MPI_THREAD_SINGLE\n"); + else if(actual_threading_model == MPI_THREAD_FUNNELED )printf("got MPI_THREAD_FUNNELED\n"); + else if(actual_threading_model == MPI_THREAD_SERIALIZED)printf("got MPI_THREAD_SERIALIZED\n"); + else if(actual_threading_model == MPI_THREAD_MULTIPLE )printf("got MPI_THREAD_MULTIPLE\n"); + else printf("got Unknown MPI Threading Model (%d)\n",actual_threading_model); + } + #endif + + + if(log2_box_dim<4){ + if(my_rank==0){printf("log2_box_dim must be at least 4\n");} + #ifdef USE_MPI + MPI_Finalize(); + #endif + exit(0); + } + + if(target_boxes_per_rank<1){ + if(my_rank==0){printf("target_boxes_per_rank must be at least 1\n");} + #ifdef USE_MPI + MPI_Finalize(); + #endif + exit(0); + } + + if(my_rank==0){ + if(OMP_Nested)fprintf(stdout,"%d MPI Tasks of %d threads (OMP_NESTED=TRUE)\n\n" ,num_tasks,OMP_Threads); + else fprintf(stdout,"%d MPI Tasks of %d threads (OMP_NESTED=FALSE)\n\n",num_tasks,OMP_Threads); + } + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // calculate the problem size... 
+ #ifndef MAX_COARSE_DIM + #define MAX_COARSE_DIM 11 + #endif + int64_t box_dim=1<minSolves)minSolves=(minTime/timePerSolve); // if one needs to do more than minSolves to run for minTime, change minSolves + } + #endif + + if(my_rank==0){ + if(doTiming==0){fprintf(stdout,"\n\n===== warming up by running %d solves ===============================\n",minSolves);} + else{fprintf(stdout,"\n\n===== running %d solves =============================================\n",minSolves);} + fflush(stdout); + } + + int numSolves = 0; // solves completed + MGResetTimers(&all_grids); + while( (numSolves +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef USE_MPI +#include +#endif +#ifdef _OPENMP +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "timers.h" +#include "defines.h" +#include "level.h" +#include "operators.h" +//------------------------------------------------------------------------------------------------------------------------------ +void print_communicator(int printSendRecv, int rank, int level, communicator_type *comm){ + int i; + printf("rank=%2d level=%d ",rank,level); + if(printSendRecv & 0x1){ + printf("num_sends=%2d ",comm->num_sends); + printf("send_ranks=[ ");for(i=0;inum_sends;i++)printf("%2d ",comm->send_ranks[i]);printf("] "); + printf("send_sizes=[ ");for(i=0;inum_sends;i++)printf("%2d ",comm->send_sizes[i]);printf("] "); + printf("send_buffers=[ ");for(i=0;inum_sends;i++)printf("%08lx ",(uint64_t)comm->send_buffers[i]);printf("] "); + for(i=0;inum_blocks[0];i++)printf("[ %dx%dx%d from %d %d %d %d %d to %d %d %d %d %d ] 
",comm->blocks[0][i].dim.i,comm->blocks[0][i].dim.j,comm->blocks[0][i].dim.k,comm->blocks[0][i].read.i,comm->blocks[0][i].read.j,comm->blocks[0][i].read.k,comm->blocks[0][i].read.jStride,comm->blocks[0][i].read.kStride,comm->blocks[0][i].write.i,comm->blocks[0][i].write.j,comm->blocks[0][i].write.k,comm->blocks[0][i].write.jStride,comm->blocks[0][i].write.kStride); + printf("\n"); + } + if(printSendRecv & 0x2){ + for(i=0;inum_blocks[1];i++)printf("[ %dx%dx%d from %d %d %d %d %d to %d %d %d %d %d ] ",comm->blocks[1][i].dim.i,comm->blocks[1][i].dim.j,comm->blocks[1][i].dim.k,comm->blocks[1][i].read.i,comm->blocks[1][i].read.j,comm->blocks[1][i].read.k,comm->blocks[1][i].read.jStride,comm->blocks[1][i].read.kStride,comm->blocks[1][i].write.i,comm->blocks[1][i].write.j,comm->blocks[1][i].write.k,comm->blocks[1][i].write.jStride,comm->blocks[1][i].write.kStride); + printf("\n"); + } + if(printSendRecv & 0x4){ + printf("num_recvs=%2d ",comm->num_recvs); + printf("recv_ranks=[ ");for(i=0;inum_recvs;i++)printf("%2d ",comm->recv_ranks[i]);printf("] "); + printf("recv_sizes=[ ");for(i=0;inum_recvs;i++)printf("%2d ",comm->recv_sizes[i]);printf("] "); + printf("recv_buffers=[ ");for(i=0;inum_recvs;i++)printf("%08lx ",(uint64_t)comm->recv_buffers[i]);printf("] "); + for(i=0;inum_blocks[2];i++)printf("[ %dx%dx%d from %d %d %d %d %d to %d %d %d %d %d ] ",comm->blocks[2][i].dim.i,comm->blocks[2][i].dim.j,comm->blocks[2][i].dim.k,comm->blocks[2][i].read.i,comm->blocks[2][i].read.j,comm->blocks[2][i].read.k,comm->blocks[2][i].read.jStride,comm->blocks[2][i].read.kStride,comm->blocks[2][i].write.i,comm->blocks[2][i].write.j,comm->blocks[2][i].write.k,comm->blocks[2][i].write.jStride,comm->blocks[2][i].write.kStride); + printf("\n"); + } + fflush(stdout); +} +//------------------------------------------------------------------------------------------------------------------------------ +typedef struct { + int sendRank; + int sendBoxID; + int sendBox; + int sendDir; + int recvRank; + 
int recvBoxID; + int recvBox; +} GZ_type; + + +int qsortGZ(const void *a, const void*b){ + GZ_type *gza = (GZ_type*)a; + GZ_type *gzb = (GZ_type*)b; + // by convention, MPI buffers are first sorted by sendRank + if(gza->sendRank < gzb->sendRank)return(-1); + if(gza->sendRank > gzb->sendRank)return( 1); + // then by sendBoxID + if(gza->sendBoxID < gzb->sendBoxID)return(-1); + if(gza->sendBoxID > gzb->sendBoxID)return( 1); + // and finally by the direction sent + if(gza->sendDir < gzb->sendDir)return(-1); + if(gza->sendDir > gzb->sendDir)return( 1); + return(0); +} + + +int qsortInt(const void *a, const void *b){ + int *ia = (int*)a; + int *ib = (int*)b; + if(*ia < *ib)return(-1); + if(*ia > *ib)return( 1); + return( 0); +} + +int qsortBlock(const void *a, const void *b){ + blockCopy_type *ba = (blockCopy_type*)a; + blockCopy_type *bb = (blockCopy_type*)b; + + if(ba->write.box >= 0){ + // sort by box... + if(ba->write.box < bb->write.box)return(-1); + if(ba->write.box > bb->write.box)return( 1); + // now sort by k + if(ba->write.k < bb->write.k )return(-1); + if(ba->write.k > bb->write.k )return( 1); + // now sort by j + if(ba->write.j < bb->write.j )return(-1); + if(ba->write.j > bb->write.j )return( 1); + // now sort by i + if(ba->write.i < bb->write.i )return(-1); + if(ba->write.i > bb->write.i )return( 1); + }else if(ba->read.box >= 0){ + // sort by box... 
+ if(ba->read.box < bb->read.box )return(-1); + if(ba->read.box > bb->read.box )return( 1); + // now sort by k + if(ba->read.k < bb->read.k )return(-1); + if(ba->read.k > bb->read.k )return( 1); + // now sort by j + if(ba->read.j < bb->read.j )return(-1); + if(ba->read.j > bb->read.j )return( 1); + // now sort by i + if(ba->read.i < bb->read.i )return(-1); + if(ba->read.i > bb->read.i )return( 1); + } + return( 0); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void decompose_level_lex(int *rank_of_box, int idim, int jdim, int kdim, int ranks){ + // simple lexicographical decomposition of the domain (i-j-k ordering) + // load balancing is easily realized + // unfortunately, each process will likely receive one or two long pencils of boxes. + // as such, the resultant surface:volum ratio will likely be poor + int boxes = idim*jdim*kdim; + int i,j,k; + for(k=0;k=idim)&&(kdim>=jdim) ){if( (kdim%f==0) && (ranks%f==0) ){for(ff=0;ff=idim)&&(jdim>=kdim) ){if( (jdim%f==0) && (ranks%f==0) ){for(ff=0;ff=jdim)&&(idim>=kdim) ){if( (idim%f==0) && (ranks%f==0) ){for(ff=0;ff=jdim)&&(idim>=kdim) ){ + int dim0 = (int)(0.5*(double)idim + 0.50); + int dim1 = idim-dim0; + int r0 = (int)( 0.5 + (double)ranks*(double)dim0/(double)idim ); + int r1 = ranks-r0; + decompose_level_bisection_special(rank_of_box,jStride,kStride,ilo ,jlo,klo,dim0,jdim,kdim,rank_lo ,r0); // lo + decompose_level_bisection_special(rank_of_box,jStride,kStride,ilo+dim0,jlo,klo,dim1,jdim,kdim,rank_lo+r0,r1); // hi + return; + } + // try and bisect the domain in the j-dimension + if( (jdim>=idim)&&(jdim>=kdim) ){ + int dim0 = (int)(0.5*(double)jdim + 0.50); + int dim1 = jdim-dim0; + int r0 = (int)( 0.5 + (double)ranks*(double)dim0/(double)jdim ); + int r1 = ranks-r0; + decompose_level_bisection_special(rank_of_box,jStride,kStride,ilo,jlo ,klo,idim,dim0,kdim,rank_lo ,r0); // lo + 
decompose_level_bisection_special(rank_of_box,jStride,kStride,ilo,jlo+dim0,klo,idim,dim1,kdim,rank_lo+r0,r1); // hi + return; + } + // try and bisect the domain in the k-dimension + if( (kdim>=idim)&&(kdim>=jdim) ){ + int dim0 = (int)(0.5*(double)kdim + 0.50); + int dim1 = kdim-dim0; + int r0 = (int)( 0.5 + (double)ranks*(double)dim0/(double)kdim ); + int r1 = ranks-r0; + decompose_level_bisection_special(rank_of_box,jStride,kStride,ilo,jlo,klo ,idim,jdim,dim0,rank_lo ,r0); // lo + decompose_level_bisection_special(rank_of_box,jStride,kStride,ilo,jlo,klo+dim0,idim,jdim,dim1,rank_lo+r0,r1); // hi + return; + } + fprintf(stderr,"decompose_level_bisection_special failed !!!\n");exit(0); +} + + +//--------------------------------------------------------------------------------------------------------------------------------------------------- +void decompose_level_bisection(int *rank_of_box, int jStride, int kStride, int ilo, int jlo, int klo, int idim, int jdim, int kdim, int ranks, int sfc_offset, int sfc_max_length){ + + // base case... 
+ if( (idim==1) && (jdim==1) && (kdim==1) ){ + int b = ilo + jlo*jStride + klo*kStride; + rank_of_box[b] = ((uint64_t)ranks*(uint64_t)sfc_offset)/(uint64_t)sfc_max_length; // sfc_max_length is the precomputed maximum length + return; + } + + // try and bisect the domain in the i-dimension + if( (idim>=jdim)&&(idim>=kdim) ){ + int dim0 = (int)(0.5*(double)idim + 0.50); + int dim1 = idim-dim0; + int sfc_delta = dim0*jdim*kdim; + decompose_level_bisection(rank_of_box,jStride,kStride,ilo ,jlo,klo,dim0,jdim,kdim,ranks,sfc_offset ,sfc_max_length); // lo + decompose_level_bisection(rank_of_box,jStride,kStride,ilo+dim0,jlo,klo,dim1,jdim,kdim,ranks,sfc_offset+sfc_delta,sfc_max_length); // hi + return; + } + + // try and bisect the domain in the j-dimension + if( (jdim>=idim)&&(jdim>=kdim) ){ + int dim0 = (int)(0.5*(double)jdim + 0.50); + int dim1 = jdim-dim0; + int sfc_delta = idim*dim0*kdim; + decompose_level_bisection(rank_of_box,jStride,kStride,ilo,jlo ,klo,idim,dim0,kdim,ranks,sfc_offset ,sfc_max_length); // lo + decompose_level_bisection(rank_of_box,jStride,kStride,ilo,jlo+dim0,klo,idim,dim1,kdim,ranks,sfc_offset+sfc_delta,sfc_max_length); // hi + return; + } + + // try and bisect the domain in the k-dimension + if( (kdim>=idim)&&(kdim>=jdim) ){ + int dim0 = (int)(0.5*(double)kdim + 0.50); + int dim1 = kdim-dim0; + int sfc_delta = idim*jdim*dim0; + decompose_level_bisection(rank_of_box,jStride,kStride,ilo,jlo,klo ,idim,jdim,dim0,ranks,sfc_offset ,sfc_max_length); // lo + decompose_level_bisection(rank_of_box,jStride,kStride,ilo,jlo,klo+dim0,idim,jdim,dim1,ranks,sfc_offset+sfc_delta,sfc_max_length); // hi + return; + } + + // failure... 
+ fprintf(stderr,"decompose_level_bisection failed !!!\n");exit(0); +} + + +//--------------------------------------------------------------------------------------------------------------------------------------------------- +// Given a bounding box (idim,jdim,kdim) use a Z-morton Space Filling Curve (SFC) to assign the boxes within the (boxes_in_i,boxes_in_j,boxes_in_k) valid region domain +// sfc_offset is the current offset within the space filling curve (starts with 0) +// this function returns the new offset based on how many actual boxes it found within (ilo,jlo,klo) + (idim,jdim,kdim) +// sfc_max_length is the maximum length of the SFC. Note, if this length exceeds boxes_in_i*boxes_in_j*boxes_in_k, then some processes with receive no work +int decompose_level_zmort(int *rank_of_box, int boxes_in_i, int boxes_in_j, int boxes_in_k, int ilo, int jlo, int klo, int idim, int jdim, int kdim, int ranks, int sfc_offset, int sfc_max_length){ + + // invalid cases... + if(idim<1)return(sfc_offset); + if(jdim<1)return(sfc_offset); + if(kdim<1)return(sfc_offset); + if(ilo <0)return(sfc_offset); + if(jlo <0)return(sfc_offset); + if(klo <0)return(sfc_offset); + + // base case... + if( (idim==1) && (jdim==1) && (kdim==1) ){ + if( (ilomy_rank!=0)return; + printf("\n"); + int i,j,k; + int jStride = level->boxes_in.i; + int kStride = level->boxes_in.i*level->boxes_in.j; + for(k=level->boxes_in.k-1;k>=0;k--){ // (i,j,k)=(0,0,0) is bottom left corner + for(j=level->boxes_in.j-1;j>=0;j--){ // (i,j)=(0,0) is bottom left corner + for(i=0;iboxes_in.i;i++){ + int b = i + j*jStride + k*kStride; + printf("%4d ",level->rank_of_box[b]); + }printf("\n"); + }printf("\n\n"); + } + fflush(stdout); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// append the specified block (logical region) to the current list of blocks +// each block may be tiled to... 
+// - create more parallelism across the list of blocks +// - limit parallelism within a block +// - limit the memory requirements for each block +#ifndef BLOCK_LIST_MIN_SIZE +#define BLOCK_LIST_MIN_SIZE 1000 +#endif +void append_block_to_list(blockCopy_type ** blocks, int *allocated_blocks, int *num_blocks, + int dim_i, int dim_j, int dim_k, + int read_box, double* read_ptr, int read_i, int read_j, int read_k, int read_jStride, int read_kStride, int read_scale, + int write_box, double* write_ptr, int write_i, int write_j, int write_k, int write_jStride, int write_kStride, int write_scale, + int blockcopy_tile_i, int blockcopy_tile_j, int blockcopy_tile_k, + int subtype + ){ + // Take a dim_j x dim_k iteration space and tile it into smaller faces of size blockcopy_tile_j x blockcopy_tile_k + // This increases the number of blockCopies in the ghost zone exchange and thereby increases the thread-level parallelism + + #if 0 + // use recursive (z-mort) ordering of tiles in order to improve locality on deep memory hierarchies... 
+ int doRecursion=0; + if(dim_i > blockcopy_tile_i)doRecursion=1; + if(dim_j > blockcopy_tile_j)doRecursion=1; + if(dim_k > blockcopy_tile_k)doRecursion=1; + if( read_scale != 1)doRecursion=0; // disable recursion for restriction + if(write_scale != 1)doRecursion=0; // disable recursion for interpolation + if(doRecursion){ + int mid_i = (dim_i + 1)/2; + int mid_j = (dim_j + 1)/2; + int mid_k = (dim_k + 1)/2; + mid_i = blockcopy_tile_i*( (mid_i+blockcopy_tile_i-1)/blockcopy_tile_i); + mid_j = blockcopy_tile_j*( (mid_j+blockcopy_tile_j-1)/blockcopy_tile_j); + mid_k = blockcopy_tile_k*( (mid_k+blockcopy_tile_k-1)/blockcopy_tile_k); + if(mid_i>dim_i)mid_i=dim_i; + if(mid_j>dim_j)mid_j=dim_j; + if(mid_k>dim_k)mid_k=dim_k; + append_block_to_list(blocks,allocated_blocks,num_blocks, mid_i, mid_j, mid_k, + read_box, read_ptr, read_i , read_j , read_k , read_jStride, read_kStride, read_scale, + write_box,write_ptr,write_i ,write_j ,write_k ,write_jStride,write_kStride,write_scale, + blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype); + append_block_to_list(blocks,allocated_blocks,num_blocks,dim_i-mid_i, mid_j, mid_k, + read_box, read_ptr, read_i+mid_i, read_j , read_k , read_jStride, read_kStride, read_scale, + write_box,write_ptr,write_i+mid_i,write_j ,write_k ,write_jStride,write_kStride,write_scale, + blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype); + append_block_to_list(blocks,allocated_blocks,num_blocks, mid_i,dim_j-mid_j, mid_k, + read_box, read_ptr, read_i , read_j+mid_j, read_k , read_jStride, read_kStride, read_scale, + write_box,write_ptr,write_i ,write_j+mid_j,write_k ,write_jStride,write_kStride,write_scale, + blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype); + append_block_to_list(blocks,allocated_blocks,num_blocks,dim_i-mid_i,dim_j-mid_j, mid_k, + read_box, read_ptr, read_i+mid_i, read_j+mid_j, read_k , read_jStride, read_kStride, read_scale, + write_box,write_ptr,write_i+mid_i,write_j+mid_j,write_k 
,write_jStride,write_kStride,write_scale, + blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype); + append_block_to_list(blocks,allocated_blocks,num_blocks, mid_i, mid_j,dim_k-mid_k, + read_box, read_ptr, read_i , read_j , read_k+mid_k, read_jStride, read_kStride, read_scale, + write_box,write_ptr,write_i ,write_j ,write_k+mid_k,write_jStride,write_kStride,write_scale, + blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype); + append_block_to_list(blocks,allocated_blocks,num_blocks,dim_i-mid_i, mid_j,dim_k-mid_k, + read_box, read_ptr, read_i+mid_i, read_j , read_k+mid_k, read_jStride, read_kStride, read_scale, + write_box,write_ptr,write_i+mid_i,write_j ,write_k+mid_k,write_jStride,write_kStride,write_scale, + blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype); + append_block_to_list(blocks,allocated_blocks,num_blocks, mid_i,dim_j-mid_j,dim_k-mid_k, + read_box, read_ptr, read_i , read_j+mid_j, read_k+mid_k, read_jStride, read_kStride, read_scale, + write_box,write_ptr,write_i ,write_j+mid_j,write_k+mid_k,write_jStride,write_kStride,write_scale, + blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype); + append_block_to_list(blocks,allocated_blocks,num_blocks,dim_i-mid_i,dim_j-mid_j,dim_k-mid_k, + read_box, read_ptr, read_i+mid_i, read_j+mid_j, read_k+mid_k, read_jStride, read_kStride, read_scale, + write_box,write_ptr,write_i+mid_i,write_j+mid_j,write_k+mid_k,write_jStride,write_kStride,write_scale, + blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype); + return; + } + #endif + // read_/write_scale are used to stride appropriately when read and write loop iterations spaces are different + // ghostZone: read_scale=1, write_scale=1 + // interpolation: read_scale=1, write_scale=2 + // restriction: read_scale=2, write_scale=1 + // FIX... 
dim_i,j,k -> read_dim_i,j,k, write_dim_i,j,k + int ii,jj,kk; + for(kk=0;kkblockcopy_tile_k)dim_k_mod=blockcopy_tile_k; + int dim_j_mod = dim_j-jj;if(dim_j_mod>blockcopy_tile_j)dim_j_mod=blockcopy_tile_j; + int dim_i_mod = dim_i-ii;if(dim_i_mod>blockcopy_tile_i)dim_i_mod=blockcopy_tile_i; + if(*num_blocks >= *allocated_blocks){ + int oldSize = *allocated_blocks; + if(*allocated_blocks == 0){*allocated_blocks=BLOCK_LIST_MIN_SIZE;*blocks=(blockCopy_type*) malloc( (*allocated_blocks)*sizeof(blockCopy_type));} + else{*allocated_blocks*=2; *blocks=(blockCopy_type*)realloc((void*)(*blocks),(*allocated_blocks)*sizeof(blockCopy_type));} + if(*blocks == NULL){fprintf(stderr,"realloc failed - append_block_to_list (%d -> %d)\n",oldSize,*allocated_blocks);exit(0);} + } + (*blocks)[*num_blocks].subtype = subtype; + (*blocks)[*num_blocks].dim.i = dim_i_mod; + (*blocks)[*num_blocks].dim.j = dim_j_mod; + (*blocks)[*num_blocks].dim.k = dim_k_mod; + (*blocks)[*num_blocks].read.box = read_box; + (*blocks)[*num_blocks].read.ptr = read_ptr; + (*blocks)[*num_blocks].read.i = read_i + read_scale*ii; + (*blocks)[*num_blocks].read.j = read_j + read_scale*jj; + (*blocks)[*num_blocks].read.k = read_k + read_scale*kk; + (*blocks)[*num_blocks].read.jStride = read_jStride; + (*blocks)[*num_blocks].read.kStride = read_kStride; + (*blocks)[*num_blocks].write.box = write_box; + (*blocks)[*num_blocks].write.ptr = write_ptr; + (*blocks)[*num_blocks].write.i = write_i + write_scale*ii; + (*blocks)[*num_blocks].write.j = write_j + write_scale*jj; + (*blocks)[*num_blocks].write.k = write_k + write_scale*kk; + (*blocks)[*num_blocks].write.jStride = write_jStride; + (*blocks)[*num_blocks].write.kStride = write_kStride; + (*num_blocks)++; + }}} +} + + +//---------------------------------------------------------------------------------------------------------------------------------------------------- +// create a mini program that traverses the domain boundary intersecting with this process's boxes +// 
This includes faces, corners, and edges +void build_boundary_conditions(level_type *level, int shape){ + level->boundary_condition.blocks[shape] = NULL; // default for periodic (i.e. no BC's) + level->boundary_condition.num_blocks[shape] = 0; // default for periodic (i.e. no BC's) + level->boundary_condition.allocated_blocks[shape] = 0; // default for periodic (i.e. no BC's) + if(level->boundary_condition.type == BC_PERIODIC)return; + +//int faces[27] = {0,0,0,0,1,0,0,0,0, 0,1,0,1,0,1,0,1,0, 0,0,0,0,1,0,0,0,0}; + int edges[27] = {0,1,0,1,0,1,0,1,0, 1,0,1,0,0,0,1,0,1, 0,1,0,1,0,1,0,1,0}; + int corners[27] = {1,0,1,0,0,0,1,0,1, 0,0,0,0,0,0,0,0,0, 1,0,1,0,0,0,1,0,1}; + + int box, di,dj,dk; + for(box=0;boxnum_my_boxes;box++){ // traverse my list of boxes... + for(dk=-1;dk<=1;dk++){ // for each box, examine its 26 neighbors... + for(dj=-1;dj<=1;dj++){ + for(di=-1;di<=1;di++){ + int dir = 13+di+3*dj+9*dk; // face/edge/corner of *THIS* box (not the domain) + + // determine if this region (box's di,dj,dk ghost zone) is outside of the domain + int regionIsOutside=0; + int normal = 13; // normal effectively defines the normal vector to the *DOMAIN* for this region... 
+ // this addition is necessary for linearly interpolated BC's as a box's corner is not necessarily a domain's corner + int myBox_i = level->my_boxes[box].low.i / level->box_dim; + int myBox_j = level->my_boxes[box].low.j / level->box_dim; + int myBox_k = level->my_boxes[box].low.k / level->box_dim; + int neighborBox_i = ( myBox_i + di ); + int neighborBox_j = ( myBox_j + dj ); + int neighborBox_k = ( myBox_k + dk ); + if( neighborBox_i < 0 ){regionIsOutside=1;normal-=1;} + if( neighborBox_j < 0 ){regionIsOutside=1;normal-=3;} + if( neighborBox_k < 0 ){regionIsOutside=1;normal-=9;} + if( neighborBox_i >=level->boxes_in.i ){regionIsOutside=1;normal+=1;} + if( neighborBox_j >=level->boxes_in.j ){regionIsOutside=1;normal+=3;} + if( neighborBox_k >=level->boxes_in.k ){regionIsOutside=1;normal+=9;} + + // calculate ghost zone region size and coordinates relative to the first non-ghost zone element (0,0,0) + int block_i=-1,block_j=-1,block_k=-1; + int dim_i=-1, dim_j=-1, dim_k=-1; + switch(di){ + case -1:dim_i=level->box_ghosts;block_i=0-level->box_ghosts;break; + case 0:dim_i=level->box_dim; block_i=0; break; + case 1:dim_i=level->box_ghosts;block_i=0+level->box_dim; break; + } + switch(dj){ + case -1:dim_j=level->box_ghosts;block_j=0-level->box_ghosts;break; + case 0:dim_j=level->box_dim; block_j=0; break; + case 1:dim_j=level->box_ghosts;block_j=0+level->box_dim; break; + } + switch(dk){ + case -1:dim_k=level->box_ghosts;block_k=0-level->box_ghosts;break; + case 0:dim_k=level->box_dim; block_k=0; break; + case 1:dim_k=level->box_ghosts;block_k=0+level->box_dim; break; + } + + // use regionIsOutside to short circuit logic and cull unnecessary regions... 
+ switch(shape){ + case STENCIL_SHAPE_STAR: if(edges[dir]||corners[dir])regionIsOutside=0;break; // star-shaped stencils don't need BC's enforced on corners or edges + case STENCIL_SHAPE_NO_CORNERS:if( corners[dir])regionIsOutside=0;break; // these stencils don't need BC's enforced on edges + } + + // default tile sizes... + // NOTE, BC's may never tile smaller than the ghost zone depth + int blockcopy_i = (BLOCKCOPY_TILE_I < level->box_ghosts) ? level->box_ghosts : BLOCKCOPY_TILE_I; + int blockcopy_j = (BLOCKCOPY_TILE_J < level->box_ghosts) ? level->box_ghosts : BLOCKCOPY_TILE_J; + int blockcopy_k = (BLOCKCOPY_TILE_K < level->box_ghosts) ? level->box_ghosts : BLOCKCOPY_TILE_K; + + #if 0 + // 2D tiling of faces + // 1D tiling of edges + // corners use defaults + switch(dir){ + case 1:blockcopy_i= 8;blockcopy_j=10000;blockcopy_k=10000;break; // i edge + case 3:blockcopy_i=10000;blockcopy_j= 8;blockcopy_k=10000;break; // j edge + case 4:blockcopy_i= 8;blockcopy_j= 8;blockcopy_k=10000;break; // ij face + case 5:blockcopy_i=10000;blockcopy_j= 8;blockcopy_k=10000;break; // j edge + case 7:blockcopy_i= 8;blockcopy_j=10000;blockcopy_k=10000;break; // i edge + + case 9:blockcopy_i=10000;blockcopy_j=10000;blockcopy_k= 8;break; // k edge + case 10:blockcopy_i= 8;blockcopy_j=10000;blockcopy_k= 8;break; // ik face + case 11:blockcopy_i=10000;blockcopy_j=10000;blockcopy_k= 8;break; // k edge + case 12:blockcopy_i=10000;blockcopy_j= 8;blockcopy_k= 8;break; // jk face + + case 14:blockcopy_i=10000;blockcopy_j= 8;blockcopy_k= 8;break; // jk face + case 15:blockcopy_i=10000;blockcopy_j=10000;blockcopy_k= 8;break; // k edge + case 16:blockcopy_i= 8;blockcopy_j=10000;blockcopy_k= 8;break; // ik face + case 17:blockcopy_i=10000;blockcopy_j=10000;blockcopy_k= 8;break; // k edge + + case 19:blockcopy_i= 8;blockcopy_j=10000;blockcopy_k=10000;break; // i edge + case 21:blockcopy_i=10000;blockcopy_j= 8;blockcopy_k=10000;break; // j edge + case 22:blockcopy_i= 8;blockcopy_j= 
8;blockcopy_k=10000;break; // ij face + case 23:blockcopy_i=10000;blockcopy_j= 8;blockcopy_k=10000;break; // j edge + case 25:blockcopy_i= 8;blockcopy_j=10000;blockcopy_k=10000;break; // i edge + } + #endif + + if(regionIsOutside){ + append_block_to_list(&(level->boundary_condition.blocks[shape]),&(level->boundary_condition.allocated_blocks[shape]),&(level->boundary_condition.num_blocks[shape]), + /* dim.i = */ dim_i, + /* dim.j = */ dim_j, + /* dim.k = */ dim_k, + /* read.box = */ box, + /* read.ptr = */ NULL, + /* read.i = */ block_i, + /* read.j = */ block_j, + /* read.k = */ block_k, + /* read.jStride = */ level->my_boxes[box].jStride, + /* read.kStride = */ level->my_boxes[box].kStride, + /* read.scale = */ 1, + /* write.box = */ box, + /* write.ptr = */ NULL, + /* write.i = */ block_i, + /* write.j = */ block_j, + /* write.k = */ block_k, + /* write.jStride = */ level->my_boxes[box].jStride, + /* write.kStride = */ level->my_boxes[box].kStride, + /* write.scale = */ 1, + /* blockcopy_i = */ blockcopy_i, + /* blockcopy_j = */ blockcopy_j, + /* blockcopy_k = */ blockcopy_k, + /* subtype = */ normal + ); + }}}}} + + #ifdef BLOCK_SPATIAL_SORT + // sort all the resultant blocks by box,k,j,i (good locality) + qsort(level->boundary_condition.blocks[shape],level->boundary_condition.num_blocks[shape],sizeof(blockCopy_type),qsortBlock); + #endif +} + +//---------------------------------------------------------------------------------------------------------------------------------------------------- +// create a mini program that packs data into MPI recv buffers, exchanges local data, and unpacks the MPI send buffers +// broadly speaking... +// 1. traverse my list of Boxes and create a list of ghosts that must be sent +// 2. create a list of neighbors to send to +// 3. allocate and populate the pack list and allocate the send buffers +// 4. allocate and populate the local exchange list +// 5. 
traverse my list of Boxes and create a list of ghosts that must be received +// 6. create a list of neighbors to receive from +// 7. allocate and populate the unpack list and allocate the recv buffers +// +// thus a ghost zone exchange is +// 1. prepost a Irecv for each MPI recv buffer (1 per neighbor) +// 2. traverse the pack list +// 3. post the Isends for each MPI send buffer (1 per neighbor) +// 4. traverse the local copy list +// 5. waitall +// 6. traverse the unpack list +// +// / 24 25 26 / +// / 21 22 23 / (k+1) +// / 18 19 20 / +// +// / 15 16 17 / +// / 12 13 14 / (k) +// / 9 10 11 / +// +// / 6 7 8 / +// / 3 4 5 / (k-1) +// / 0 1 2 / +// +void build_exchange_ghosts(level_type *level, int shape){ + int faces[27] = {0,0,0,0,1,0,0,0,0, 0,1,0,1,0,1,0,1,0, 0,0,0,0,1,0,0,0,0}; + int edges[27] = {0,1,0,1,0,1,0,1,0, 1,0,1,0,0,0,1,0,1, 0,1,0,1,0,1,0,1,0}; + int corners[27] = {1,0,1,0,0,0,1,0,1, 0,0,0,0,0,0,0,0,0, 1,0,1,0,0,0,1,0,1}; + + // initialize to defaults... + level->exchange_ghosts[shape].num_recvs = 0; + level->exchange_ghosts[shape].num_sends = 0; + level->exchange_ghosts[shape].recv_ranks = NULL; + level->exchange_ghosts[shape].send_ranks = NULL; + level->exchange_ghosts[shape].recv_sizes = NULL; + level->exchange_ghosts[shape].send_sizes = NULL; + level->exchange_ghosts[shape].recv_buffers = NULL; + level->exchange_ghosts[shape].send_buffers = NULL; + level->exchange_ghosts[shape].blocks[0] = NULL; + level->exchange_ghosts[shape].blocks[1] = NULL; + level->exchange_ghosts[shape].blocks[2] = NULL; + level->exchange_ghosts[shape].num_blocks[0] = 0; + level->exchange_ghosts[shape].num_blocks[1] = 0; + level->exchange_ghosts[shape].num_blocks[2] = 0; + level->exchange_ghosts[shape].allocated_blocks[0] = 0; + level->exchange_ghosts[shape].allocated_blocks[1] = 0; + level->exchange_ghosts[shape].allocated_blocks[2] = 0; + #ifdef USE_MPI + level->exchange_ghosts[shape].requests = NULL; + level->exchange_ghosts[shape].status = NULL; + #endif + + int 
n,CommunicateThisDir[27];for(n=0;n<27;n++)CommunicateThisDir[n] = faces[n] + edges[n] + corners[n];// to be safe, communicate everything + switch(shape){ + case STENCIL_SHAPE_BOX: for(n=0;n<27;n++)CommunicateThisDir[n] = faces[n] + edges[n] + corners[n];break; + case STENCIL_SHAPE_STAR: for(n=0;n<27;n++)CommunicateThisDir[n] = faces[n] ;break; + case STENCIL_SHAPE_NO_CORNERS:for(n=0;n<27;n++)CommunicateThisDir[n] = faces[n] + edges[n] ;break; + } + + int sendBox,recvBox; + int stage; + int _rank; + int ghost,numGhosts,numGhostsRemote; + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // traverse my list of boxes and create a lists of neighboring boxes and neighboring ranks + GZ_type *ghostsToSend = (GZ_type*)malloc(26*level->num_my_boxes*sizeof(GZ_type)); // There are at most 26 neighbors per box. + int *sendRanks = ( int*)malloc(26*level->num_my_boxes*sizeof( int)); // There are at most 26 neighbors per box. 
+ if(level->num_my_boxes>0){ + if(ghostsToSend == NULL){fprintf(stderr,"malloc failed - build_exchange_ghosts/ghostsToSend\n");exit(0);} + if(sendRanks == NULL){fprintf(stderr,"malloc failed - build_exchange_ghosts/sendRanks \n");exit(0);} + } + numGhosts = 0; + numGhostsRemote = 0; + for(sendBox=0;sendBoxnum_my_boxes;sendBox++){ + int di,dj,dk; + for(dk=-1;dk<=1;dk++){ + for(dj=-1;dj<=1;dj++){ + for(di=-1;di<=1;di++){ + int dir = 13+di+3*dj+9*dk;if(CommunicateThisDir[dir]){ + int myBoxID = level->my_boxes[sendBox].global_box_id; + int myBox_i = level->my_boxes[sendBox].low.i / level->box_dim; + int myBox_j = level->my_boxes[sendBox].low.j / level->box_dim; + int myBox_k = level->my_boxes[sendBox].low.k / level->box_dim; + int neighborBoxID = -1; + if(level->boundary_condition.type == BC_PERIODIC){ + int neighborBox_i = ( myBox_i + di + level->boxes_in.i) % level->boxes_in.i; + int neighborBox_j = ( myBox_j + dj + level->boxes_in.j) % level->boxes_in.j; + int neighborBox_k = ( myBox_k + dk + level->boxes_in.k) % level->boxes_in.k; + neighborBoxID = neighborBox_i + neighborBox_j*level->boxes_in.i + neighborBox_k*level->boxes_in.i*level->boxes_in.j; + }else{ + int neighborBox_i = ( myBox_i + di ); + int neighborBox_j = ( myBox_j + dj ); + int neighborBox_k = ( myBox_k + dk ); + if( (neighborBox_i>=0) && (neighborBox_iboxes_in.i) && + (neighborBox_j>=0) && (neighborBox_jboxes_in.j) && + (neighborBox_k>=0) && (neighborBox_kboxes_in.k) ){ // i.e. 
the neighbor is a valid box + neighborBoxID = neighborBox_i + neighborBox_j*level->boxes_in.i + neighborBox_k*level->boxes_in.i*level->boxes_in.j; + } + } + if(neighborBoxID>=0){ + if( level->rank_of_box[neighborBoxID] != -1 ){ + ghostsToSend[numGhosts].sendRank = level->my_rank; + ghostsToSend[numGhosts].sendBoxID = myBoxID; + ghostsToSend[numGhosts].sendBox = sendBox; + ghostsToSend[numGhosts].sendDir = dir; + ghostsToSend[numGhosts].recvRank = level->rank_of_box[neighborBoxID]; + ghostsToSend[numGhosts].recvBoxID = neighborBoxID; + ghostsToSend[numGhosts].recvBox = -1; + if( level->rank_of_box[neighborBoxID] != level->my_rank ){ + sendRanks[numGhostsRemote++] = level->rank_of_box[neighborBoxID]; + }else{ + int recvBox=0;while(level->my_boxes[recvBox].global_box_id!=neighborBoxID)recvBox++; // search my list of boxes for the appropriate recvBox index + ghostsToSend[numGhosts].recvBox = recvBox; + } + numGhosts++; + }} + }}}} + } + // sort boxes by sendRank(==my rank) then by sendBoxID... ensures the sends and receive buffers are always sorted by sendBoxID... + qsort(ghostsToSend,numGhosts ,sizeof(GZ_type),qsortGZ ); + // sort the lists of neighboring ranks and remove duplicates... 
+ qsort(sendRanks ,numGhostsRemote,sizeof( int),qsortInt); + int numSendRanks=0;_rank=-1;for(ghost=0;ghostexchange_ghosts[shape].num_sends = numSendRanks; + level->exchange_ghosts[shape].send_ranks = (int*)malloc(numSendRanks*sizeof(int)); + level->exchange_ghosts[shape].send_sizes = (int*)malloc(numSendRanks*sizeof(int)); + level->exchange_ghosts[shape].send_buffers = (double**)malloc(numSendRanks*sizeof(double*)); + if(numSendRanks>0){ + if(level->exchange_ghosts[shape].send_ranks ==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].send_ranks\n",shape);exit(0);} + if(level->exchange_ghosts[shape].send_sizes ==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].send_sizes\n",shape);exit(0);} + if(level->exchange_ghosts[shape].send_buffers==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].send_buffers\n",shape);exit(0);} + } + level->exchange_ghosts[shape].blocks[0] = NULL; + level->exchange_ghosts[shape].blocks[1] = NULL; + level->exchange_ghosts[shape].num_blocks[0] = 0; + level->exchange_ghosts[shape].num_blocks[1] = 0; + level->exchange_ghosts[shape].allocated_blocks[0] = 0; + level->exchange_ghosts[shape].allocated_blocks[1] = 0; + for(stage=0;stage<=1;stage++){ + // stage=0... traverse the list and calculate the buffer sizes + // stage=1... allocate MPI send buffers, traverse the list, and populate the unpack/local lists... 
+ int neighbor; + for(neighbor=0;neighborexchange_ghosts[shape].send_buffers[neighbor] = (double*)malloc(level->exchange_ghosts[shape].send_sizes[neighbor]*sizeof(double)); + if(level->exchange_ghosts[shape].send_sizes[neighbor]>0) + if(level->exchange_ghosts[shape].send_buffers[neighbor]==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].send_buffers[neighbor]\n",shape);exit(0);} + memset(level->exchange_ghosts[shape].send_buffers[neighbor], 0,level->exchange_ghosts[shape].send_sizes[neighbor]*sizeof(double)); + } + level->exchange_ghosts[shape].send_ranks[neighbor]=sendRanks[neighbor]; + level->exchange_ghosts[shape].send_sizes[neighbor]=0; + } + for(ghost=0;ghostbox_ghosts;recv_i= level->box_dim; break; + case 0:send_i=0; dim_i=level->box_dim; recv_i=0; break; + case 1:send_i=level->box_dim-level->box_ghosts;dim_i=level->box_ghosts;recv_i=0-level->box_ghosts;break; + } + switch(dj){ // direction relative to sender + case -1:send_j=0; dim_j=level->box_ghosts;recv_j= level->box_dim; break; + case 0:send_j=0; dim_j=level->box_dim; recv_j=0; break; + case 1:send_j=level->box_dim-level->box_ghosts;dim_j=level->box_ghosts;recv_j=0-level->box_ghosts;break; + } + switch(dk){ // direction relative to sender + case -1:send_k=0; dim_k=level->box_ghosts;recv_k= level->box_dim; break; + case 0:send_k=0; dim_k=level->box_dim; recv_k=0; break; + case 1:send_k=level->box_dim-level->box_ghosts;dim_k=level->box_ghosts;recv_k=0-level->box_ghosts;break; + } + + // determine if this ghost requires a pack or local exchange + int LocalExchange; // 0 = pack list, 1 = local exchange list + if(ghostsToSend[ghost].recvRank != level->my_rank){ + LocalExchange=0; // pack + neighbor=0;while(level->exchange_ghosts[shape].send_ranks[neighbor] != ghostsToSend[ghost].recvRank)neighbor++; + }else{ + LocalExchange=1; // local + neighbor=-1; + } + + if(stage==1){ + if(LocalExchange) // append to the local exchange list... 
+ append_block_to_list(&(level->exchange_ghosts[shape].blocks[1]),&(level->exchange_ghosts[shape].allocated_blocks[1]),&(level->exchange_ghosts[shape].num_blocks[1]), + /* dim.i = */ dim_i, + /* dim.j = */ dim_j, + /* dim.k = */ dim_k, + /* read.box = */ ghostsToSend[ghost].sendBox, + /* read.ptr = */ NULL, + /* read.i = */ send_i, + /* read.j = */ send_j, + /* read.k = */ send_k, + /* read.jStride = */ level->my_boxes[ghostsToSend[ghost].sendBox].jStride, + /* read.kStride = */ level->my_boxes[ghostsToSend[ghost].sendBox].kStride, + /* read.scale = */ 1, + /* write.box = */ ghostsToSend[ghost].recvBox, + /* write.ptr = */ NULL, + /* write.i = */ recv_i, + /* write.j = */ recv_j, + /* write.k = */ recv_k, + /* write.jStride = */ level->my_boxes[ghostsToSend[ghost].recvBox].jStride, + /* write.kStride = */ level->my_boxes[ghostsToSend[ghost].recvBox].kStride, + /* write.scale = */ 1, + /* blockcopy_i = */ BLOCKCOPY_TILE_I, // default + /* blockcopy_j = */ BLOCKCOPY_TILE_J, // default + /* blockcopy_k = */ BLOCKCOPY_TILE_K, // default + /* subtype = */ 0 + ); + else // append to the MPI pack list... + append_block_to_list(&(level->exchange_ghosts[shape].blocks[0]),&(level->exchange_ghosts[shape].allocated_blocks[0]),&(level->exchange_ghosts[shape].num_blocks[0]), + /* dim.i = */ dim_i, + /* dim.j = */ dim_j, + /* dim.k = */ dim_k, + /* read.box = */ ghostsToSend[ghost].sendBox, + /* read.ptr = */ NULL, + /* read.i = */ send_i, + /* read.j = */ send_j, + /* read.k = */ send_k, + /* read.jStride = */ level->my_boxes[ghostsToSend[ghost].sendBox].jStride, + /* read.kStride = */ level->my_boxes[ghostsToSend[ghost].sendBox].kStride, + /* read.scale = */ 1, + /* write.box = */ -1, + /* write.ptr = */ level->exchange_ghosts[shape].send_buffers[neighbor], // NOTE, 1. count _sizes, 2. allocate _buffers, 3. 
populate blocks + /* write.i = */ level->exchange_ghosts[shape].send_sizes[neighbor], // current offset in the MPI send buffer + /* write.j = */ 0, + /* write.k = */ 0, + /* write.jStride = */ dim_i, // contiguous block + /* write.kStride = */ dim_i*dim_j, // contiguous block + /* write.scale = */ 1, + /* blockcopy_i = */ BLOCKCOPY_TILE_I, // default + /* blockcopy_j = */ BLOCKCOPY_TILE_J, // default + /* blockcopy_k = */ BLOCKCOPY_TILE_K, // default + /* subtype = */ 0 + );} + if(neighbor>=0)level->exchange_ghosts[shape].send_sizes[neighbor]+=dim_i*dim_j*dim_k; + } // ghost for-loop + } // stage for-loop + + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // free temporary storage... + free(ghostsToSend); + free(sendRanks); + + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // traverse my list of boxes and create a lists of neighboring boxes and neighboring ranks + GZ_type *ghostsToRecv = (GZ_type*)malloc(26*level->num_my_boxes*sizeof(GZ_type)); // There are at most 26 neighbors per box. + int *recvRanks = ( int*)malloc(26*level->num_my_boxes*sizeof( int)); // There are at most 26 neighbors per box. 
+ if(level->num_my_boxes>0){ + if(ghostsToRecv == NULL){fprintf(stderr,"malloc failed - build_exchange_ghosts/ghostsToRecv\n");exit(0);} + if(recvRanks == NULL){fprintf(stderr,"malloc failed - build_exchange_ghosts/recvRanks \n");exit(0);} + } + numGhosts = 0; + numGhostsRemote = 0; + for(recvBox=0;recvBoxnum_my_boxes;recvBox++){ + int di,dj,dk; + for(dk=-1;dk<=1;dk++){ + for(dj=-1;dj<=1;dj++){ + for(di=-1;di<=1;di++){ + int dir = 13+di+3*dj+9*dk;if(CommunicateThisDir[dir]){ + int myBoxID = level->my_boxes[recvBox].global_box_id; + int myBox_i = level->my_boxes[recvBox].low.i / level->box_dim; + int myBox_j = level->my_boxes[recvBox].low.j / level->box_dim; + int myBox_k = level->my_boxes[recvBox].low.k / level->box_dim; + int neighborBoxID = -1; + if(level->boundary_condition.type == BC_PERIODIC){ + int neighborBox_i = ( myBox_i + di + level->boxes_in.i) % level->boxes_in.i; + int neighborBox_j = ( myBox_j + dj + level->boxes_in.j) % level->boxes_in.j; + int neighborBox_k = ( myBox_k + dk + level->boxes_in.k) % level->boxes_in.k; + neighborBoxID = neighborBox_i + neighborBox_j*level->boxes_in.i + neighborBox_k*level->boxes_in.i*level->boxes_in.j; + }else{ + int neighborBox_i = ( myBox_i + di ); + int neighborBox_j = ( myBox_j + dj ); + int neighborBox_k = ( myBox_k + dk ); + if( (neighborBox_i>=0) && (neighborBox_iboxes_in.i) && + (neighborBox_j>=0) && (neighborBox_jboxes_in.j) && + (neighborBox_k>=0) && (neighborBox_kboxes_in.k) ){ // i.e. 
the neighbor is a valid box + neighborBoxID = neighborBox_i + neighborBox_j*level->boxes_in.i + neighborBox_k*level->boxes_in.i*level->boxes_in.j; + } + } + if(neighborBoxID>=0){ + if( (level->rank_of_box[neighborBoxID] != -1) && (level->rank_of_box[neighborBoxID] != level->my_rank) ){ + ghostsToRecv[numGhosts].sendRank = level->rank_of_box[neighborBoxID]; + ghostsToRecv[numGhosts].sendBoxID = neighborBoxID; + ghostsToRecv[numGhosts].sendBox = -1; + ghostsToRecv[numGhosts].sendDir = 26-dir; + ghostsToRecv[numGhosts].recvRank = level->my_rank; + ghostsToRecv[numGhosts].recvBoxID = myBoxID; + ghostsToRecv[numGhosts].recvBox = recvBox; + numGhosts++; + recvRanks[numGhostsRemote++] = level->rank_of_box[neighborBoxID]; + }} + }}}} + } + // sort boxes by sendRank then by sendBoxID... ensures the recvs and receive buffers are always sorted by sendBoxID... + qsort(ghostsToRecv,numGhosts ,sizeof(GZ_type),qsortGZ ); + // sort the lists of neighboring ranks and remove duplicates... + qsort(recvRanks ,numGhostsRemote,sizeof( int),qsortInt); + int numRecvRanks=0;_rank=-1;for(ghost=0;ghostexchange_ghosts[shape].num_recvs = numRecvRanks; + level->exchange_ghosts[shape].recv_ranks = (int*)malloc(numRecvRanks*sizeof(int)); + level->exchange_ghosts[shape].recv_sizes = (int*)malloc(numRecvRanks*sizeof(int)); + level->exchange_ghosts[shape].recv_buffers = (double**)malloc(numRecvRanks*sizeof(double*)); + if(numRecvRanks>0){ + if(level->exchange_ghosts[shape].recv_ranks ==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].recv_ranks\n",shape);exit(0);} + if(level->exchange_ghosts[shape].recv_sizes ==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].recv_sizes\n",shape);exit(0);} + if(level->exchange_ghosts[shape].recv_buffers==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].recv_buffers\n",shape);exit(0);} + } + level->exchange_ghosts[shape].blocks[2] = NULL; + level->exchange_ghosts[shape].num_blocks[2] = 0; + level->exchange_ghosts[shape].allocated_blocks[2] 
= 0; + for(stage=0;stage<=1;stage++){ + // stage=0... traverse the list and calculate the buffer sizes + // stage=1... allocate MPI recv buffers, traverse the list, and populate the unpack/local lists... + int neighbor; + for(neighbor=0;neighborexchange_ghosts[shape].recv_buffers[neighbor] = (double*)malloc(level->exchange_ghosts[shape].recv_sizes[neighbor]*sizeof(double)); + if(level->exchange_ghosts[shape].recv_sizes[neighbor]>0) + if(level->exchange_ghosts[shape].recv_buffers[neighbor]==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].recv_buffers[neighbor]\n",shape);exit(0);} + memset(level->exchange_ghosts[shape].recv_buffers[neighbor], 0,level->exchange_ghosts[shape].recv_sizes[neighbor]*sizeof(double)); + } + level->exchange_ghosts[shape].recv_ranks[neighbor]=recvRanks[neighbor]; + level->exchange_ghosts[shape].recv_sizes[neighbor]=0; + } + for(ghost=0;ghostbox_ghosts;recv_i= level->box_dim; break; + case 0:dim_i=level->box_dim; recv_i=0; break; + case 1:dim_i=level->box_ghosts;recv_i=0-level->box_ghosts;break; + } + switch(dj){ // direction relative to sender + case -1:dim_j=level->box_ghosts;recv_j= level->box_dim; break; + case 0:dim_j=level->box_dim; recv_j=0; break; + case 1:dim_j=level->box_ghosts;recv_j=0-level->box_ghosts;break; + } + switch(dk){ // direction relative to sender + case -1:dim_k=level->box_ghosts;recv_k= level->box_dim; break; + case 0:dim_k=level->box_dim; recv_k=0; break; + case 1:dim_k=level->box_ghosts;recv_k=0-level->box_ghosts;break; + } + + // determine if this ghost requires a pack or local exchange + neighbor=0;while(level->exchange_ghosts[shape].recv_ranks[neighbor] != ghostsToRecv[ghost].sendRank)neighbor++; + if(stage==1)append_block_to_list(&(level->exchange_ghosts[shape].blocks[2]),&(level->exchange_ghosts[shape].allocated_blocks[2]),&(level->exchange_ghosts[shape].num_blocks[2]), + /*dim.i = */ dim_i, + /*dim.j = */ dim_j, + /*dim.k = */ dim_k, + /*read.box = */ -1, + /*read.ptr = */ 
level->exchange_ghosts[shape].recv_buffers[neighbor], // NOTE, 1. count _sizes, 2. allocate _buffers, 3. populate blocks + /*read.i = */ level->exchange_ghosts[shape].recv_sizes[neighbor], // current offset in the MPI recv buffer + /*read.j = */ 0, + /*read.k = */ 0, + /*read.jStride = */ dim_i, // contiguous block + /*read.kStride = */ dim_i*dim_j, // contiguous block + /*read.scale = */ 1, + /*write.box = */ ghostsToRecv[ghost].recvBox, + /*write.ptr = */ NULL, + /*write.i = */ recv_i, + /*write.j = */ recv_j, + /*write.k = */ recv_k, + /*write.jStride = */ level->my_boxes[ghostsToRecv[ghost].recvBox].jStride, + /*write.kStride = */ level->my_boxes[ghostsToRecv[ghost].recvBox].kStride, + /*write.scale = */ 1, + /* blockcopy_i = */ BLOCKCOPY_TILE_I, // default + /* blockcopy_j = */ BLOCKCOPY_TILE_J, // default + /* blockcopy_k = */ BLOCKCOPY_TILE_K, // default + /* subtype = */ 0 + ); + if(neighbor>=0)level->exchange_ghosts[shape].recv_sizes[neighbor]+=dim_i*dim_j*dim_k; + } // ghost for-loop + } // stage for-loop + + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // free temporary storage... 
+ free(ghostsToRecv); + free(recvRanks); + + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // malloc MPI requests/status arrays + #ifdef USE_MPI + level->exchange_ghosts[shape].requests = (MPI_Request*)malloc((level->exchange_ghosts[shape].num_sends+level->exchange_ghosts[shape].num_recvs)*sizeof(MPI_Request)); + level->exchange_ghosts[shape].status = (MPI_Status *)malloc((level->exchange_ghosts[shape].num_sends+level->exchange_ghosts[shape].num_recvs)*sizeof(MPI_Status )); + if((level->exchange_ghosts[shape].num_sends+level->exchange_ghosts[shape].num_recvs)>0){ + if(level->exchange_ghosts[shape].requests==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].requests\n",shape);exit(0);} + if(level->exchange_ghosts[shape].status ==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].status\n",shape);exit(0);} + } + #endif + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef BLOCK_SPATIAL_SORT + // sort all the resultant blocks by box,k,j,i (good locality) + qsort(level->exchange_ghosts[shape].blocks[0],level->exchange_ghosts[shape].num_blocks[0],sizeof(blockCopy_type),qsortBlock); + qsort(level->exchange_ghosts[shape].blocks[1],level->exchange_ghosts[shape].num_blocks[1],sizeof(blockCopy_type),qsortBlock); + qsort(level->exchange_ghosts[shape].blocks[2],level->exchange_ghosts[shape].num_blocks[2],sizeof(blockCopy_type),qsortBlock); + #endif +} + + + +//--------------------------------------------------------------------------------------------------------------------------------------------------- +// create the pointers in level_type to the contiguous vector FP data (useful for bulk copies to/from accelerators) +// create the pointers in each box to their respective segment of the level's vector FP data (useful for box-relative operators) +// 
if( (level->numVectors > 0) && (numVectors > level->numVectors) ) then allocate additional space for (numVectors-level->numVectors) and copy old leve->numVectors data +void create_vectors(level_type *level, int numVectors){ + if(numVectors <= level->numVectors)return; // already have enough space + double * old_vectors_base = level->vectors_base; // save a pointer to the originally allocated data for subsequent free() + double * old_vector0 = NULL; + if(level->numVectors>0)old_vector0 = level->vectors[0]; // save a pointer to old FP data to copy + + + // calculate the size of each box... + level->box_jStride = (level->box_dim+2*level->box_ghosts);while(level->box_jStride % BOX_ALIGN_JSTRIDE)level->box_jStride++; // pencil + level->box_kStride = level->box_jStride*(level->box_dim+2*level->box_ghosts);while(level->box_kStride % BOX_ALIGN_KSTRIDE)level->box_kStride++; // plane + level->box_volume = level->box_kStride*(level->box_dim+2*level->box_ghosts);while(level->box_volume % BOX_ALIGN_VOLUME )level->box_volume++; // volume + + + #define VECTOR_MALLOC_BULK + #ifdef VECTOR_MALLOC_BULK + // allocate one aligned, double-precision array and divide it among vectors... + uint64_t malloc_size = (uint64_t)numVectors*level->num_my_boxes*level->box_volume*sizeof(double) + 4096; + level->vectors_base = (double*)malloc(malloc_size); + if((numVectors>0)&&(level->vectors_base==NULL)){fprintf(stderr,"malloc failed - level->vectors_base\n");exit(0);} + double * tmpbuf = level->vectors_base; + while( (uint64_t)(tmpbuf+level->box_ghosts*(1+level->box_jStride+level->box_kStride)) & 0xff ){tmpbuf++;} // align first *non-ghost* zone element of first component to a 256-Byte boundary + uint64_t ofs; + #ifdef _OPENMP + #pragma omp parallel for + #endif + for(ofs=0;ofs<(uint64_t)numVectors*level->num_my_boxes*level->box_volume;ofs++){tmpbuf[ofs]=0.0;} // Faster in MPI+OpenMP environments, but not NUMA-aware + // if there is existing FP data... 
copy it, then free old data and pointer array + if(level->numVectors>0){ + memcpy(tmpbuf,old_vector0,(uint64_t)level->numVectors*level->num_my_boxes*level->box_volume*sizeof(double)); // FIX... omp thread ??? + if(old_vectors_base)free(old_vectors_base); // free old data... + } + // allocate an array of pointers which point to the union of boxes for each vector + // NOTE, this requires just one copyin per vector to an accelerator rather than requiring one copyin per box per vector + if(level->numVectors>0)free(level->vectors); // free any previously allocated vector array + level->vectors = (double **)malloc(numVectors*sizeof(double*)); + if((numVectors>0)&&(level->vectors==NULL)){fprintf(stderr,"malloc failed - level->vectors\n");exit(0);} + uint64_t c;for(c=0;cvectors[c] = tmpbuf + (uint64_t)c*level->num_my_boxes*level->box_volume;} + #else + // allocate vectors individually (simple, but may cause conflict misses) + double ** old_vectors = level->vectors; + level->vectors = (double **)malloc(numVectors*sizeof(double*)); + uint64_t c; + for(c= 0;cnumVectors;c++){level->vectors[c] = old_vectors[c];} + for(c=level->numVectors;c< numVectors;c++){ + level->vectors[c] = (double*)malloc((uint64_t)level->num_my_boxes*level->box_volume*sizeof(double)); + uint64_t ofs; + #ifdef _OPENMP + #pragma omp parallel for + #endif + for(ofs=0;ofs<(uint64_t)level->num_my_boxes*level->box_volume;ofs++){level->vectors[c][ofs]=0.0;} // Faster in MPI+OpenMP environments, but not NUMA-aware + } + free(old_vectors); + #endif + + + // build the list of boxes... 
+ int box=0; + int i,j,k; + for(k=0;kboxes_in.k;k++){ + for(j=0;jboxes_in.j;j++){ + for(i=0;iboxes_in.i;i++){ + int jStride = level->boxes_in.i; + int kStride = level->boxes_in.i*level->boxes_in.j; + int b=i + j*jStride + k*kStride; + if(level->rank_of_box[b]==level->my_rank){ + if(level->numVectors>0)free(level->my_boxes[box].vectors); // free previously allocated vector array + level->my_boxes[box].vectors = (double **)malloc(numVectors*sizeof(double*)); + if((numVectors>0)&&(level->my_boxes[box].vectors==NULL)){fprintf(stderr,"malloc failed - level->my_boxes[box].vectors\n");exit(0);} + uint64_t c;for(c=0;cmy_boxes[box].vectors[c] = level->vectors[c] + (uint64_t)box*level->box_volume;} + level->my_boxes[box].numVectors = numVectors; + level->my_boxes[box].dim = level->box_dim; + level->my_boxes[box].ghosts = level->box_ghosts; + level->my_boxes[box].jStride = level->box_jStride; + level->my_boxes[box].kStride = level->box_kStride; + level->my_boxes[box].volume = level->box_volume; + level->my_boxes[box].low.i = i*level->box_dim; + level->my_boxes[box].low.j = j*level->box_dim; + level->my_boxes[box].low.k = k*level->box_dim; + level->my_boxes[box].global_box_id = b; + box++; + }}}} + + // level now has created/initialized vector FP data + level->numVectors = numVectors; +} + + +//--------------------------------------------------------------------------------------------------------------------------------------------------- +// create a level by populating the basic data structure, distribute boxes within the level among processes, allocate memory, and create any auxilliaries +// box_ghosts must be >= stencil_get_radius() +// numVectors represents an estimate of the number of vectors needed in this level. 
Additional vectors can be added via subsequent calls to create_vectors() +void create_level(level_type *level, int boxes_in_i, int box_dim, int box_ghosts, int numVectors, int domain_boundary_condition, int my_rank, int num_ranks, const MPI_Comm comm){ + int box; + int TotalBoxes = boxes_in_i*boxes_in_i*boxes_in_i; + + if(my_rank==0){ + //if(domain_boundary_condition==BC_DIRICHLET)fprintf(stdout,"\nattempting to create a %d^3 level (with Dirichlet BC) using a %d^3 grid of %d^3 boxes and %d tasks...\n",box_dim*boxes_in_i,boxes_in_i,box_dim,num_ranks); + //if(domain_boundary_condition==BC_PERIODIC )fprintf(stdout,"\nattempting to create a %d^3 level (with Periodic BC) using a %d^3 grid of %d^3 boxes and %d tasks...\n", box_dim*boxes_in_i,boxes_in_i,box_dim,num_ranks); + fprintf(stdout,"\nattempting to create a %d^3 level from %d x %d^3 boxes distributed among %d tasks...\n", box_dim*boxes_in_i,TotalBoxes,box_dim,num_ranks); + if(domain_boundary_condition==BC_DIRICHLET)fprintf(stdout," boundary condition = BC_DIRICHLET\n"); + if(domain_boundary_condition==BC_PERIODIC )fprintf(stdout," boundary condition = BC_PERIODIC\n"); + + } + + int omp_threads = 1; + + #ifdef _OPENMP + #pragma omp parallel + { + #pragma omp master + { + omp_threads = omp_get_num_threads(); + } + } + #endif + + if(box_ghosts < stencil_get_radius() ){ + if(my_rank==0)fprintf(stderr,"ghosts(%d) must be >= stencil_get_radius(%d)\n",box_ghosts,stencil_get_radius()); + exit(0); + } + + level->box_dim = box_dim; + level->box_ghosts = box_ghosts; + level->numVectors = 0; // no vectors have been allocated yet + level->vectors_base = NULL; // pointer returned by bulk malloc + level->vectors = NULL; // pointers to individual vectors + level->boxes_in.i = boxes_in_i; + level->boxes_in.j = boxes_in_i; + level->boxes_in.k = boxes_in_i; + level->dim.i = box_dim*level->boxes_in.i; + level->dim.j = box_dim*level->boxes_in.j; + level->dim.k = box_dim*level->boxes_in.k; + level->active = 1; + level->my_rank = 
my_rank; + level->num_ranks = num_ranks; + level->boundary_condition.type = domain_boundary_condition; + level->must_subtract_mean = -1; + level->num_threads = omp_threads; + level->my_blocks = NULL; + level->num_my_blocks = 0; + level->allocated_blocks = 0; + level->tag = log2(level->dim.i); + level->fluxes = NULL; + + + // allocate 3D array of integers to hold the MPI rank of the corresponding box and initialize to -1 (unassigned) + level->rank_of_box = (int*)malloc(level->boxes_in.i*level->boxes_in.j*level->boxes_in.k*sizeof(int)); + if(level->rank_of_box==NULL){fprintf(stderr,"malloc of level->rank_of_box failed\n");exit(0);} + for(box=0;boxboxes_in.i*level->boxes_in.j*level->boxes_in.k;box++){level->rank_of_box[box]=-1;} // -1 denotes that there is no actual box assigned to this region + + + // parallelize the level (i.e. assign a process rank to each box)... + #ifdef DECOMPOSE_LEX + // lexicographical ordering... good load balance, potentially high bisection bandwidth requirements, bad surface:volume ratio when #boxes/proc is large + if(my_rank==0){fprintf(stdout," Decomposing level via lexicographical ordering... ");fflush(stdout);} + decompose_level_lex(level->rank_of_box,level->boxes_in.i,level->boxes_in.j,level->boxes_in.k,num_ranks); + #elif DECOMPOSE_BISECTION_SPECIAL + // recursive partitioning by primes + if(my_rank==0){fprintf(stdout," Decomposing level via partitioning by primes... ");fflush(stdout);} + decompose_level_bisection_special(level->rank_of_box,level->boxes_in.i,level->boxes_in.i*level->boxes_in.j,0,0,0,level->boxes_in.i,level->boxes_in.j,level->boxes_in.k,0,num_ranks); + #elif DECOMPOSE_BISECTION + // recursive bisection + if(my_rank==0){fprintf(stdout," Decomposing level via recursive bisection... 
");fflush(stdout);} + decompose_level_bisection(level->rank_of_box,level->boxes_in.i,level->boxes_in.i*level->boxes_in.j,0,0,0,level->boxes_in.i,level->boxes_in.j,level->boxes_in.k,num_ranks,0,level->boxes_in.i*level->boxes_in.j*level->boxes_in.k); + #else//#elif DECOMPOSE_ZMORT + if(my_rank==0){fprintf(stdout," Decomposing level via Z-mort ordering... ");fflush(stdout);} + #if 0 // Z-Mort over a power of two bounding box skipping boxes outside the domain + int idim_padded=1;while(idim_paddedboxes_in.i)idim_padded*=2; + int jdim_padded=1;while(jdim_paddedboxes_in.j)jdim_padded*=2; + int kdim_padded=1;while(kdim_paddedboxes_in.k)kdim_padded*=2; + #else // Z-Mort over the valid domain wtih odd-sized base cases (i.e. zmort on 3x3) + int idim_padded=level->boxes_in.i; + int jdim_padded=level->boxes_in.j; + int kdim_padded=level->boxes_in.k; + #endif + decompose_level_zmort(level->rank_of_box,level->boxes_in.i,level->boxes_in.j,level->boxes_in.k,0,0,0,idim_padded,jdim_padded,kdim_padded,num_ranks,0,level->boxes_in.i*level->boxes_in.j*level->boxes_in.k); + #endif + if(my_rank==0){fprintf(stdout,"done\n");fflush(stdout);} +//print_decomposition(level);// for debug purposes only + + + // calculate how many boxes I own... + level->num_my_boxes=0; + for(box=0;boxboxes_in.i*level->boxes_in.j*level->boxes_in.k;box++){if(level->rank_of_box[box]==level->my_rank)level->num_my_boxes++;} + level->my_boxes = (box_type*)malloc(level->num_my_boxes*sizeof(box_type)); + if((level->num_my_boxes>0)&&(level->my_boxes==NULL)){fprintf(stderr,"malloc failed - create_level/level->my_boxes\n");exit(0);} + + + // allocate flattened vector FP data and create pointers... + if(my_rank==0){fprintf(stdout," Allocating vectors... ");fflush(stdout);} + create_vectors(level,numVectors); + if(my_rank==0){fprintf(stdout,"done\n");fflush(stdout);} + + + // Build and auxilarlly data structure that flattens boxes into blocks... 
+ for(box=0;boxnum_my_boxes;box++){ + int blockcopy_i = BLOCKCOPY_TILE_I; + int blockcopy_j = BLOCKCOPY_TILE_J; + int blockcopy_k = BLOCKCOPY_TILE_K; + + append_block_to_list(&(level->my_blocks),&(level->allocated_blocks),&(level->num_my_blocks), + /* dim.i = */ level->my_boxes[box].dim, + /* dim.j = */ level->my_boxes[box].dim, + /* dim.k = */ level->my_boxes[box].dim, + /* read.box = */ box, + /* read.ptr = */ NULL, + /* read.i = */ 0, + /* read.j = */ 0, + /* read.k = */ 0, + /* read.jStride = */ level->my_boxes[box].jStride, + /* read.kStride = */ level->my_boxes[box].kStride, + /* read.scale = */ 1, + /* write.box = */ box, + /* write.ptr = */ NULL, + /* write.i = */ 0, + /* write.j = */ 0, + /* write.k = */ 0, + /* write.jStride = */ level->my_boxes[box].jStride, + /* write.kStride = */ level->my_boxes[box].kStride, + /* write.scale = */ 1, + /* blockcopy_i = */ blockcopy_i, + /* blockcopy_j = */ blockcopy_j, + /* blockcopy_k = */ blockcopy_k, + /* subtype = */ 0 + ); + } + + + // build an assist structure for Gauss Seidel Red Black that would facilitate unrolling and SIMDization... + level->RedBlack_base = NULL; + level->RedBlack_FP = NULL; + if(level->num_my_boxes){ + int i,j; + int kStride = level->my_boxes[0].kStride; + int jStride = level->my_boxes[0].jStride; + level->RedBlack_base = (double*)malloc(2*kStride*sizeof(double)+256); // used for free() + level->RedBlack_FP = level->RedBlack_base; // aligned version + // align first *non-ghost* zone element to a 64-Byte boundary... + while( (uint64_t)(level->RedBlack_FP + level->box_ghosts*(1+level->box_jStride)) & 0x3f ){level->RedBlack_FP++;} + // initialize RedBlack array... 
+ for(j=0-level->box_ghosts;jbox_dim+level->box_ghosts;j++){ + for(i=0-level->box_ghosts;ibox_dim+level->box_ghosts;i++){ + int ij = (i+level->box_ghosts) + (j+level->box_ghosts)*jStride; + if((i^j^1)&0x1){ + level->RedBlack_FP[ij ]=1.0; + level->RedBlack_FP[ij+kStride]=0.0; + }else{ + level->RedBlack_FP[ij ]=0.0; + level->RedBlack_FP[ij+kStride]=1.0; + } + // Never update ghost zones + //if( (i<0) || (i>=level->box_dim) || (j<0) || (j>=level->box_dim) ){ + // level->RedBlack_FP[ij ]=0.0; + // level->RedBlack_FP[ij+kStride]=0.0; + //} + }} + } + + + int shape; + // create mini program for each stencil shape to perform a ghost zone exchange... + for(shape=0;shapeMPI_COMM_ALLREDUCE = comm; + /* + if(my_rank==0){fprintf(stdout," Duplicating MPI_COMM_WORLD... ");fflush(stdout);} + double time_start = MPI_Wtime(); + MPI_Comm_dup(comm,&level->MPI_COMM_ALLREDUCE); + double time_end = MPI_Wtime(); + double time_in_comm_dup = 0; + double time_in_comm_dup_send = time_end-time_start; + MPI_Allreduce(&time_in_comm_dup_send,&time_in_comm_dup,1,MPI_DOUBLE,MPI_MAX,level->MPI_COMM_ALLREDUCE); + if(my_rank==0){fprintf(stdout,"done (%0.6f seconds)\n",time_in_comm_dup);fflush(stdout);} + */ + #endif + + // report on potential load imbalance + int BoxesPerProcess = level->num_my_boxes; + #ifdef USE_MPI + int BoxesPerProcessSend = level->num_my_boxes; + MPI_Allreduce(&BoxesPerProcessSend,&BoxesPerProcess,1,MPI_INT,MPI_MAX,level->MPI_COMM_ALLREDUCE); + #endif + if(my_rank==0){fprintf(stdout," Calculating boxes per process... target=%0.3f, max=%d\n",(double)TotalBoxes/(double)num_ranks,BoxesPerProcess);} +} + + + +//--------------------------------------------------------------------------------------------------------------------------------------------------- +// zeros are the timers within this level +// useful if one wishes to separate setup(build) timing from solve timing +void reset_level_timers(level_type *level){ + // cycle counters information... 
+ level->timers.smooth = 0; + level->timers.apply_op = 0; + level->timers.residual = 0; + level->timers.blas1 = 0; + level->timers.blas3 = 0; + level->timers.boundary_conditions = 0; + level->timers.restriction_total = 0; + level->timers.restriction_pack = 0; + level->timers.restriction_local = 0; + level->timers.restriction_unpack = 0; + level->timers.restriction_recv = 0; + level->timers.restriction_send = 0; + level->timers.restriction_wait = 0; + level->timers.interpolation_total = 0; + level->timers.interpolation_pack = 0; + level->timers.interpolation_local = 0; + level->timers.interpolation_unpack = 0; + level->timers.interpolation_recv = 0; + level->timers.interpolation_send = 0; + level->timers.interpolation_wait = 0; + level->timers.ghostZone_total = 0; + level->timers.ghostZone_pack = 0; + level->timers.ghostZone_local = 0; + level->timers.ghostZone_unpack = 0; + level->timers.ghostZone_recv = 0; + level->timers.ghostZone_send = 0; + level->timers.ghostZone_wait = 0; + level->timers.collectives = 0; + level->timers.Total = 0; + // solver events information... + level->Krylov_iterations = 0; + level->CAKrylov_formations_of_G = 0; + level->vcycles_from_this_level = 0; +} + +//--------------------------------------------------------------------------------------------------------------------------------------------------- +// free all memory allocated by this level +// n.b. in some cases a malloc was used as the basis for an array of pointers. As such free(x[0]) +void destroy_level(level_type *level){ + int i,j; + if(level->my_rank==0){fprintf(stdout,"attempting to free the %5d^3 level... ",level->dim.i);fflush(stdout);} + + // box ... + for(i=0;inum_my_boxes;i++)if(level->my_boxes[i].vectors)free(level->my_boxes[i].vectors); + + // misc ... 
+ if(level->rank_of_box )free(level->rank_of_box); + if(level->my_boxes )free(level->my_boxes); + if(level->my_blocks )free(level->my_blocks); + if(level->RedBlack_base)free(level->RedBlack_base); + + // FP vector data... + #ifdef VECTOR_MALLOC_BULK + if(level->vectors_base)free(level->vectors_base); + if(level->vectors )free(level->vectors); + #else + for(i=0;inumVectors;i++)if(level->vectors[i])free(level->vectors[i]); + if(level->vectors )free(level->vectors); + #endif + + // boundary condition mini program... + for(i=0;iboundary_condition.blocks[i])free(level->boundary_condition.blocks[i]); + } + + // ghost zone exchange mini programs... + for(i=0;iexchange_ghosts[i].num_recvs>0){ + for(j=0;jexchange_ghosts[i].num_recvs;j++)if(level->exchange_ghosts[i].recv_buffers[j])free(level->exchange_ghosts[i].recv_buffers[j]); + if(level->exchange_ghosts[i].recv_buffers)free(level->exchange_ghosts[i].recv_buffers); + if(level->exchange_ghosts[i].recv_ranks )free(level->exchange_ghosts[i].recv_ranks ); + if(level->exchange_ghosts[i].recv_sizes )free(level->exchange_ghosts[i].recv_sizes ); + } + if(level->exchange_ghosts[i].num_sends>0){ + for(j=0;jexchange_ghosts[i].num_sends;j++)if(level->exchange_ghosts[i].send_buffers[j])free(level->exchange_ghosts[i].send_buffers[j]); + if(level->exchange_ghosts[i].send_buffers)free(level->exchange_ghosts[i].send_buffers); + if(level->exchange_ghosts[i].send_ranks )free(level->exchange_ghosts[i].send_ranks ); + if(level->exchange_ghosts[i].send_sizes )free(level->exchange_ghosts[i].send_sizes ); + } + if(level->exchange_ghosts[i].blocks[0] )free(level->exchange_ghosts[i].blocks[0] ); + if(level->exchange_ghosts[i].blocks[1] )free(level->exchange_ghosts[i].blocks[1] ); + if(level->exchange_ghosts[i].blocks[2] )free(level->exchange_ghosts[i].blocks[2] ); + #ifdef USE_MPI + if(level->exchange_ghosts[i].requests )free(level->exchange_ghosts[i].requests ); + if(level->exchange_ghosts[i].status )free(level->exchange_ghosts[i].status ); + 
#endif + } + + /* + // MPI subcommunicator + #ifdef USE_MPI + #ifdef USE_SUBCOMM + MPI_Comm_free(&level->MPI_COMM_ALLREDUCE); + #endif + #endif + */ + + if(level->my_rank==0){fprintf(stdout,"done\n");} +} diff --git a/Util/hpgmg/finite-volume/source/level.h b/Util/hpgmg/finite-volume/source/level.h new file mode 100644 index 00000000..6a632c3c --- /dev/null +++ b/Util/hpgmg/finite-volume/source/level.h @@ -0,0 +1,199 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef LEVEL_H +#define LEVEL_H +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef USE_MPI +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +// supported boundary conditions +#define BC_PERIODIC 0 +#define BC_DIRICHLET 1 +//------------------------------------------------------------------------------------------------------------------------------ +// regiment communication by defining a series of stencil shapes... +#define STENCIL_SHAPE_BOX 0 // faces, edges, and corners +#define STENCIL_SHAPE_STAR 1 // just faces +#define STENCIL_SHAPE_NO_CORNERS 2 // faces and edges, but no corners +#define STENCIL_MAX_SHAPES 3 +//------------------------------------------------------------------------------------------------------------------------------ +// regiment threading around the 'block' or 'tile' concepts. Define default tilings... 
+#ifndef BLOCKCOPY_TILE_I +#define BLOCKCOPY_TILE_I 10000 +#else +#warning By overriding BLOCKCOPY_TILE_I, you are tiling in the unit stride. I hope you know what you are doing. +#endif +#ifndef BLOCKCOPY_TILE_J +#define BLOCKCOPY_TILE_J 8 +#endif +#ifndef BLOCKCOPY_TILE_K +#define BLOCKCOPY_TILE_K 8 +#endif +//------------------------------------------------------------------------------------------------------------------------------ +// FP data for a vector within a box is padded to ensure alignment +#ifndef BOX_ALIGN_JSTRIDE +#define BOX_ALIGN_JSTRIDE 4 // j-stride(unit stride dimension including ghosts and padding) is a multiple of BOX_ALIGN_JSTRIDE... useful for SIMD in j+/-1 +#endif +#ifndef BOX_ALIGN_KSTRIDE +#define BOX_ALIGN_KSTRIDE 8 // k-stride is a multiple of BOX_ALIGN_KSTRIDE ... useful for SIMD in k+/-1 +#endif +#ifndef BOX_ALIGN_VOLUME +#define BOX_ALIGN_VOLUME 8 // box volumes are a multiple of BOX_ALIGN_VOLUME ... useful for SIMD on different vectors +#endif +//------------------------------------------------------------------------------------------------------------------------------ +typedef struct { + int subtype; // e.g. used to calculate normal to domain for BC's + struct {int i, j, k;}dim; // dimensions of the block to copy + struct {int box, i, j, k, jStride, kStride;double * __restrict__ ptr;}read,write; + // coordinates in the read grid to extract data, + // coordinates in the write grid to insert data + // if read/write.box<0, then use write/read.ptr, otherwise use boxes[box].vectors[id] + // Thus, you can do grid->grid, grid->buf, buf->grid, or buf->buf +} __attribute__((aligned(64))) blockCopy_type; + + +//------------------------------------------------------------------------------------------------------------------------------ +typedef struct { + int num_recvs; // number of neighbors by type + int num_sends; // number of neighbors by type + int * __restrict__ recv_ranks; // MPI rank of each neighbor... 
recv_ranks[neighbor] + int * __restrict__ send_ranks; // MPI rank of each neighbor... send_ranks[neighbor] + int * __restrict__ recv_sizes; // size of each MPI recv buffer... recv_sizes[neighbor] + int * __restrict__ send_sizes; // size of each MPI send buffer... send_sizes[neighbor] + double ** __restrict__ recv_buffers; // MPI recv buffer for each neighbor... recv_buffers[neighbor][ recv_sizes[neighbor] ] + double ** __restrict__ send_buffers; // MPI send buffer for each neighbor... send_buffers[neighbor][ send_sizes[neighbor] ] + int allocated_blocks[3]; // number of blocks allocated (not necessarily used) each list... + int num_blocks[3]; // number of blocks in each list... num_blocks[pack,local,unpack] + blockCopy_type * blocks[3]; // list of block copies... blocks[pack,local,unpack] + #ifdef USE_MPI + MPI_Request * __restrict__ requests; + MPI_Status * __restrict__ status; + #endif +} communicator_type; + + +//------------------------------------------------------------------------------------------------------------------------------ +typedef struct { + int global_box_id; // used to inded into level->rank_of_box + struct {int i, j, k;}low; // global coordinates of the first (non-ghost) element of subdomain + int dim; // dimension of this box's core (owned) + int ghosts; // ghost zone depth + int jStride,kStride,volume; // useful for offsets + int numVectors; // + double ** __restrict__ vectors; // vectors[c] = pointer to 3D array for vector c for one box +} box_type; + + +//------------------------------------------------------------------------------------------------------------------------------ +typedef struct { + double h; // grid spacing at this level + int active; // I am an active process (I have work to do on this or subsequent levels) + int num_ranks; // total number of MPI ranks + int my_rank; // my MPI rank + int box_dim; // dimension of each cubical box (not counting ghost zones) + int box_ghosts; // ghost zone depth for each box + int 
box_jStride,box_kStride,box_volume; // useful for offsets + int numVectors; // number of vectors stored in each box + int tag; // tag each level uniquely... FIX... replace with sub commuicator + struct {int i, j, k;}boxes_in; // total number of boxes in i,j,k across this level + struct {int i, j, k;}dim; // global dimensions at this level (NOTE: dim.i == boxes_in.i * box_dim) + + int * rank_of_box; // 3D array containing rank of each box. i-major ordering + int num_my_boxes; // number of boxes owned by this rank + box_type * my_boxes; // pointer to array of boxes owned by this rank + + // create flattened FP data... useful for CUDA/OpenMP4/OpenACC when you want to copy an entire vector to/from an accelerator + double ** __restrict__ vectors; // vectors[v][box][k][j][i] = pointer to 5D array for vector v encompasing all boxes on this process... + double * __restrict__ vectors_base; // pointer used for malloc/free. vectors[v] are shifted from this for alignment + + int allocated_blocks; // number of blocks allocated by this rank (note, this represents a flattening of the box/cell hierarchy to facilitate threading) + int num_my_blocks; // number of blocks owned by this rank (note, this represents a flattening of the box/cell hierarchy to facilitate threading) + blockCopy_type * my_blocks; // pointer to array of blocks owned by this rank (note, this represents a flattening of the box/cell hierarchy to facilitate threading) + + struct { + int type; // BC_PERIODIC or BC_DIRICHLET + int allocated_blocks[STENCIL_MAX_SHAPES];// number of blocks allocated (not necessarily used) for boundary conditions on this level for [shape] + int num_blocks[STENCIL_MAX_SHAPES];// number of blocks used for boundary conditions on this level for [shape] + blockCopy_type * blocks[STENCIL_MAX_SHAPES];// pointer to array of blocks used for boundary conditions on this level for [shape] + } boundary_condition; // boundary conditions on this level + + communicator_type 
exchange_ghosts[STENCIL_MAX_SHAPES];// mini program that performs a neighbor ghost zone exchange for [shape] + communicator_type restriction[4]; // mini program that performs restriction and agglomeration for [0=cell centered, 1=i-face, 2=j-face, 3-k-face] + communicator_type interpolation; // mini program that performs interpolation and dissemination... + #ifdef USE_MPI + MPI_Comm MPI_COMM_ALLREDUCE; // MPI sub communicator for just the ranks that have boxes on this level or any subsequent level... + #endif + double dominant_eigenvalue_of_DinvA; // estimate on the dominate eigenvalue of D^{-1}A + int must_subtract_mean; // e.g. Poisson with Periodic BC's + double * __restrict__ RedBlack_base; // allocated pointer... will be aligned for the first non ghost zone element + double * __restrict__ RedBlack_FP; // Red/Black Mask (i.e. 0.0 or 1.0) for even/odd planes (2*kStride). + + int num_threads; + double * __restrict__ fluxes; // temporary array used to hold the flux values used by FV operators + + // statistics information... + struct { + double smooth; + double apply_op; + double residual; + double blas1; + double blas3; + double boundary_conditions; + // Distributed Restriction + double restriction_total; + double restriction_pack; + double restriction_local; + double restriction_unpack; + double restriction_recv; + double restriction_send; + double restriction_wait; + // Distributed interpolation + double interpolation_total; + double interpolation_pack; + double interpolation_local; + double interpolation_unpack; + double interpolation_recv; + double interpolation_send; + double interpolation_wait; + // Ghost Zone Exchanges... + double ghostZone_total; + double ghostZone_pack; + double ghostZone_local; + double ghostZone_unpack; + double ghostZone_recv; + double ghostZone_send; + double ghostZone_wait; + // Collectives... 
+ double collectives; + double Total; + }timers; + int Krylov_iterations; // total number of bottom solver iterations + int CAKrylov_formations_of_G; // i.e. [G,g] = [P,R]^T[P,R,rt] + int vcycles_from_this_level; // number of vcycles performed that were initiated from this level +} level_type; + + +//------------------------------------------------------------------------------------------------------------------------------ +void create_level(level_type *level, int boxes_in_i, int box_dim, int box_ghosts, int numVectors, int domain_boundary_condition, int my_rank, int num_ranks, const MPI_Comm comm); +void destroy_level(level_type *level); +void create_vectors(level_type *level, int numVectors); +void reset_level_timers(level_type *level); +int qsortInt(const void *a, const void *b); +void append_block_to_list(blockCopy_type ** blocks, int *allocated_blocks, int *num_blocks, + int dim_i, int dim_j, int dim_k, + int read_box, double* read_ptr, int read_i, int read_j, int read_k, int read_jStride, int read_kStride, int read_scale, + int write_box, double* write_ptr, int write_i, int write_j, int write_k, int write_jStride, int write_kStride, int write_scale, + int my_blockcopy_tile_i, int my_blockcopy_tile_j, int my_blockcopy_tile_k, + int subtype + ); +//------------------------------------------------------------------------------------------------------------------------------ +#endif diff --git a/Util/hpgmg/finite-volume/source/local.mk b/Util/hpgmg/finite-volume/source/local.mk new file mode 100644 index 00000000..7fef990b --- /dev/null +++ b/Util/hpgmg/finite-volume/source/local.mk @@ -0,0 +1,8 @@ +hpgmg-fv-y.c += $(call thisdir, \ + timers.c \ + level.c \ + operators.fv4.c \ + mg.c \ + solvers.c \ + hpgmg-fv.c \ + ) diff --git a/Util/hpgmg/finite-volume/source/mg.h b/Util/hpgmg/finite-volume/source/mg.h new file mode 100644 index 00000000..74c2eec4 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/mg.h @@ -0,0 +1,46 @@ 
+//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef MG_H +#define MG_H +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef MG_AGGLOMERATION_START +#define MG_AGGLOMERATION_START 8 // i.e. start the distributed v-cycle when boxes are smaller than 8^3 +#endif +#ifndef MG_DEFAULT_BOTTOM_NORM +#define MG_DEFAULT_BOTTOM_NORM 1e-3 +#endif +//------------------------------------------------------------------------------------------------------------------------------ +typedef struct { + int num_ranks; // total number of MPI ranks for MPI_COMM_WORLD + int my_rank; // my MPI rank for MPI_COMM_WORLD + int num_levels; // depth of the v-cycle + level_type ** levels; // array of pointers to levels + + struct { + double MGBuild; // total time spent building the coefficients... 
+ double MGSolve; // total time spent in MGSolve + }timers; + int MGSolves_performed; +} mg_type; + + +//------------------------------------------------------------------------------------------------------------------------------ +void MGBuild(mg_type *all_grids, level_type *fine_grid, double a, double b, int minCoarseGridDim, const MPI_Comm comm); +void MGSolve(mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double dtol, double rtol); +void FMGSolve(mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double dtol, double rtol); +void MGPCG(mg_type *all_grids, int onLevel, int x_id, int F_id, double a, double b, double dtol, double rtol); +void MGDestroy(mg_type *all_grids); +void MGPrintTiming(mg_type *all_grids, int fromLevel); +void MGResetTimers(mg_type *all_grids); +void richardson_error(mg_type *all_grids, int levelh, int u_id); +//------------------------------------------------------------------------------------------------------------------------------ +#endif diff --git a/Util/hpgmg/finite-volume/source/mg_hpgmg.c b/Util/hpgmg/finite-volume/source/mg_hpgmg.c new file mode 100644 index 00000000..1a95ec93 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/mg_hpgmg.c @@ -0,0 +1,1498 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef USE_MPI +#include +#endif +#ifdef _OPENMP +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include 
"timers.h" +#include "defines.h" +#include "level.h" +#include "operators.h" +#include "solvers.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +// structs/routines used to construct the restriction and prolognation lists and ensure a convention on how data is ordered within an MPI buffer +typedef struct { + int sendRank; + int sendBoxID; + int sendBox; + int recvRank; + int recvBoxID; + int recvBox; + int i,j,k; // offsets used to index into the coarse box +} RP_type; + + +int qsortRP(const void *a, const void*b){ + RP_type *rpa = (RP_type*)a; + RP_type *rpb = (RP_type*)b; + // sort first by sendRank + if(rpa->sendRank < rpb->sendRank)return(-1); + if(rpa->sendRank > rpb->sendRank)return( 1); + // then by sendBoxID + if(rpa->sendBoxID < rpb->sendBoxID)return(-1); + if(rpa->sendBoxID > rpb->sendBoxID)return( 1); + return(0); +} + + +//---------------------------------------------------------------------------------------------------------------------------------------------------- +// print out average time per solve and then decompose by function and level +// note, in FMG, some levels are accessed more frequently. 
This routine only prints time per solve in that level +void MGPrintTiming(mg_type *all_grids, int fromLevel){ + if(all_grids->my_rank!=0)return; + int level,num_levels = all_grids->num_levels; + #ifdef CALIBRATE_TIMER + double _timeStart=getTime();sleep(1);double _timeEnd=getTime(); + double SecondsPerCycle = (double)1.0/(double)(_timeEnd-_timeStart); + #else + double SecondsPerCycle = 1.0; + #endif + double scale = SecondsPerCycle/(double)all_grids->MGSolves_performed; // prints average performance per MGSolve + + double time,total; + printf("\n\n"); + printf("level ");for(level=fromLevel;level<(num_levels );level++){printf("%12d ",level-fromLevel);}printf("\n"); + printf("level dimension ");for(level=fromLevel;level<(num_levels );level++){printf("%10d^3 ",all_grids->levels[level]->dim.i );}printf("\n"); + printf("box dimension ");for(level=fromLevel;level<(num_levels );level++){printf("%10d^3 ",all_grids->levels[level]->box_dim);}printf(" total\n"); + total=0;printf("------------------ ");for(level=fromLevel;level<(num_levels+1);level++){printf("------------ ");}printf("\n"); + total=0;printf("smooth ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.smooth; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf("residual ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.residual; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf("applyOp ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.apply_op; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf("BLAS1 ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.blas1; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf("BLAS3 ");for(level=fromLevel;level<(num_levels 
);level++){time=scale*(double)all_grids->levels[level]->timers.blas3; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf("Boundary Conditions ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.boundary_conditions; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf("Restriction ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_total; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" local restriction ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_local; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + #ifdef USE_MPI + total=0;printf(" pack MPI buffers ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_pack; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" unpack MPI buffers ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_unpack; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" MPI_Isend ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_send; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" MPI_Irecv ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_recv; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" MPI_Waitall ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_wait; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + #endif + total=0;printf("Interpolation ");for(level=fromLevel;level<(num_levels 
);level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_total; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" local interpolation ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_local; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + #ifdef USE_MPI + total=0;printf(" pack MPI buffers ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_pack; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" unpack MPI buffers ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_unpack; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" MPI_Isend ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_send; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" MPI_Irecv ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_recv; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" MPI_Waitall ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_wait; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + #endif + total=0;printf("Ghost Zone Exchange ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_total; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" local exchange ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_local; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + #ifdef USE_MPI + total=0;printf(" pack MPI buffers 
");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_pack; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" unpack MPI buffers ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_unpack; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" MPI_Isend ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_send; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" MPI_Irecv ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_recv; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + total=0;printf(" MPI_Waitall ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_wait; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + #endif + #ifdef USE_MPI + total=0;printf("MPI_collectives ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.collectives; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + #endif + total=0;printf("------------------ ");for(level=fromLevel;level<(num_levels+1);level++){printf("------------ ");}printf("\n"); + total=0;printf("Total by level ");for(level=fromLevel;level<(num_levels );level++){time=scale*(double)all_grids->levels[level]->timers.Total; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total); + + printf("\n"); + printf( " Total time in MGBuild %12.6f seconds\n",SecondsPerCycle*(double)all_grids->timers.MGBuild); + printf( " Total time in MGSolve %12.6f seconds\n",scale*(double)all_grids->timers.MGSolve); + printf( " number of v-cycles %12d\n" ,all_grids->levels[fromLevel]->vcycles_from_this_level/all_grids->MGSolves_performed); + printf( "Bottom solver iterations 
%12d\n" ,all_grids->levels[num_levels-1]->Krylov_iterations/all_grids->MGSolves_performed); + #if defined(USE_CABICGSTAB) || defined(USE_CACG) + printf( " formations of G[][] %12d\n" ,all_grids->levels[num_levels-1]->CAKrylov_formations_of_G/all_grids->MGSolves_performed); + #endif + printf("\n\n");fflush(stdout); +} + + +//---------------------------------------------------------------------------------------------------------------------------------------------------- +// zeros all timers within this MG hierarchy +void MGResetTimers(mg_type *all_grids){ + int level; + for(level=0;levelnum_levels;level++)reset_level_timers(all_grids->levels[level]); +//all_grids->timers.MGBuild = 0; + all_grids->timers.MGSolve = 0; + all_grids->MGSolves_performed = 0; +} + + +//---------------------------------------------------------------------------------------------------------------------------------------------------- +// build a list of operations and MPI buffers to affect distributed interpolation +// the three lists constitute +// - buffer packing (i.e. interpolate a local box (or region of a box) and place the result in an MPI buffer) +// - local operations (i.e. interpolate a local box (or region of a box) and place the result in another local box) +// - buffer upacking (i.e. take interpolated data recieved from another process and use it to increment a local box) +void build_interpolation(mg_type *all_grids){ + int level; + for(level=0;levelnum_levels;level++){ + + // initialize to defaults... 
+ all_grids->levels[level]->interpolation.num_recvs = 0; + all_grids->levels[level]->interpolation.num_sends = 0; + all_grids->levels[level]->interpolation.recv_ranks = NULL; + all_grids->levels[level]->interpolation.send_ranks = NULL; + all_grids->levels[level]->interpolation.recv_sizes = NULL; + all_grids->levels[level]->interpolation.send_sizes = NULL; + all_grids->levels[level]->interpolation.recv_buffers = NULL; + all_grids->levels[level]->interpolation.send_buffers = NULL; + all_grids->levels[level]->interpolation.blocks[0] = NULL; + all_grids->levels[level]->interpolation.blocks[1] = NULL; + all_grids->levels[level]->interpolation.blocks[2] = NULL; + all_grids->levels[level]->interpolation.num_blocks[0] = 0; + all_grids->levels[level]->interpolation.num_blocks[1] = 0; + all_grids->levels[level]->interpolation.num_blocks[2] = 0; + all_grids->levels[level]->interpolation.allocated_blocks[0] = 0; + all_grids->levels[level]->interpolation.allocated_blocks[1] = 0; + all_grids->levels[level]->interpolation.allocated_blocks[2] = 0; + #ifdef USE_MPI + all_grids->levels[level]->interpolation.requests = NULL; + all_grids->levels[level]->interpolation.status = NULL; + #endif + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // construct pack, send(to level-1), and local... + if( (level>0) && (all_grids->levels[level]->num_my_boxes>0) ){ // not top *and* I have boxes to send + // construct a list of fine boxes to be coarsened and sent to me... 
+ int numFineBoxes = (all_grids->levels[level-1]->boxes_in.i/all_grids->levels[level]->boxes_in.i)* + (all_grids->levels[level-1]->boxes_in.j/all_grids->levels[level]->boxes_in.j)* + (all_grids->levels[level-1]->boxes_in.k/all_grids->levels[level]->boxes_in.k)* + all_grids->levels[level]->num_my_boxes; + int *fineRanks = ( int*)malloc(numFineBoxes*sizeof( int)); // high water mark (assumes every neighboring box is a different process) + RP_type *fineBoxes = (RP_type*)malloc(numFineBoxes*sizeof(RP_type)); + numFineBoxes = 0; + int numFineBoxesLocal = 0; + int numFineBoxesRemote = 0; + int coarseBox; + for(coarseBox=0;coarseBoxlevels[level]->num_my_boxes;coarseBox++){ + int bi,bj,bk; + int coarseBoxID = all_grids->levels[level]->my_boxes[coarseBox].global_box_id; + int coarseBox_i = all_grids->levels[level]->my_boxes[coarseBox].low.i / all_grids->levels[level]->box_dim; + int coarseBox_j = all_grids->levels[level]->my_boxes[coarseBox].low.j / all_grids->levels[level]->box_dim; + int coarseBox_k = all_grids->levels[level]->my_boxes[coarseBox].low.k / all_grids->levels[level]->box_dim; + for(bk=0;bklevels[level-1]->boxes_in.k/all_grids->levels[level]->boxes_in.k;bk++){ + for(bj=0;bjlevels[level-1]->boxes_in.j/all_grids->levels[level]->boxes_in.j;bj++){ + for(bi=0;bilevels[level-1]->boxes_in.i/all_grids->levels[level]->boxes_in.i;bi++){ + int fineBox_i = (all_grids->levels[level-1]->boxes_in.i/all_grids->levels[level]->boxes_in.i)*coarseBox_i + bi; + int fineBox_j = (all_grids->levels[level-1]->boxes_in.j/all_grids->levels[level]->boxes_in.j)*coarseBox_j + bj; + int fineBox_k = (all_grids->levels[level-1]->boxes_in.k/all_grids->levels[level]->boxes_in.k)*coarseBox_k + bk; + int fineBoxID = fineBox_i + fineBox_j*all_grids->levels[level-1]->boxes_in.i + fineBox_k*all_grids->levels[level-1]->boxes_in.i*all_grids->levels[level-1]->boxes_in.j; + int fineBox = -1;int f;for(f=0;flevels[level-1]->num_my_boxes;f++)if( all_grids->levels[level-1]->my_boxes[f].global_box_id == 
fineBoxID )fineBox=f; // try and find the index of a fineBox global_box_id == fineBoxID + fineBoxes[numFineBoxes].sendRank = all_grids->levels[level ]->rank_of_box[coarseBoxID]; + fineBoxes[numFineBoxes].sendBoxID = coarseBoxID; + fineBoxes[numFineBoxes].sendBox = coarseBox; + fineBoxes[numFineBoxes].recvRank = all_grids->levels[level-1]->rank_of_box[ fineBoxID]; + fineBoxes[numFineBoxes].recvBoxID = fineBoxID; + fineBoxes[numFineBoxes].recvBox = fineBox; + fineBoxes[numFineBoxes].i = bi*all_grids->levels[level-1]->box_dim/2; + fineBoxes[numFineBoxes].j = bj*all_grids->levels[level-1]->box_dim/2; + fineBoxes[numFineBoxes].k = bk*all_grids->levels[level-1]->box_dim/2; + numFineBoxes++; + if(all_grids->levels[level-1]->rank_of_box[fineBoxID] != all_grids->levels[level]->my_rank){ + fineRanks[numFineBoxesRemote++] = all_grids->levels[level-1]->rank_of_box[fineBoxID]; + }else{numFineBoxesLocal++;} + }}} + } // my (coarse) boxes + // sort boxes by sendRank(==my rank) then by sendBoxID... ensures the sends and receive buffers are always sorted by sendBoxID... + qsort(fineBoxes,numFineBoxes ,sizeof(RP_type),qsortRP ); + // sort the lists of neighboring ranks and remove duplicates... 
+ qsort(fineRanks,numFineBoxesRemote,sizeof( int),qsortInt); + int numFineRanks=0; + int _rank=-1;int neighbor=0; + for(neighbor=0;neighborlevels[level]->interpolation.num_sends = numFineRanks; + all_grids->levels[level]->interpolation.send_ranks = (int*)malloc(numFineRanks*sizeof(int)); + all_grids->levels[level]->interpolation.send_sizes = (int*)malloc(numFineRanks*sizeof(int)); + all_grids->levels[level]->interpolation.send_buffers = (double**)malloc(numFineRanks*sizeof(double*)); + if(numFineRanks>0){ + if(all_grids->levels[level]->interpolation.send_ranks ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->interpolation.send_ranks\n",level);exit(0);} + if(all_grids->levels[level]->interpolation.send_sizes ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->interpolation.send_sizes\n",level);exit(0);} + if(all_grids->levels[level]->interpolation.send_buffers==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->interpolation.send_buffers\n",level);exit(0);} + } + + int elementSize = all_grids->levels[level-1]->box_dim*all_grids->levels[level-1]->box_dim*all_grids->levels[level-1]->box_dim; + double * all_send_buffers = (double*)malloc(numFineBoxesRemote*elementSize*sizeof(double)); + if(numFineBoxesRemote*elementSize>0) + if(all_send_buffers==NULL){fprintf(stderr,"malloc failed - interpolation/all_send_buffers\n");exit(0);} + memset(all_send_buffers,0,numFineBoxesRemote*elementSize*sizeof(double)); // DO NOT DELETE... you must initialize to 0 to avoid getting something like 0.0*NaN and corrupting the solve + //printf("level=%d, rank=%2d, send_buffers=%6d\n",level,all_grids->my_rank,numFineBoxesRemote*elementSize*sizeof(double)); + + // for each neighbor, construct the pack list and allocate the MPI send buffer... 
+ for(neighbor=0;neighborlevels[level]->interpolation.send_buffers[neighbor] = all_send_buffers; + for(fineBox=0;fineBoxlevels[level]->interpolation.blocks[0]),&(all_grids->levels[level]->interpolation.allocated_blocks[0]),&(all_grids->levels[level]->interpolation.num_blocks[0]), + /* dim.i = */ all_grids->levels[level-1]->box_dim/2, + /* dim.j = */ all_grids->levels[level-1]->box_dim/2, + /* dim.k = */ all_grids->levels[level-1]->box_dim/2, + /* read.box = */ fineBoxes[fineBox].sendBox, + /* read.ptr = */ NULL, + /* read.i = */ fineBoxes[fineBox].i, + /* read.j = */ fineBoxes[fineBox].j, + /* read.k = */ fineBoxes[fineBox].k, + /* read.jStride = */ all_grids->levels[level]->my_boxes[fineBoxes[fineBox].sendBox].jStride, + /* read.kStride = */ all_grids->levels[level]->my_boxes[fineBoxes[fineBox].sendBox].kStride, + /* read.scale = */ 1, + /* write.box = */ -1, + /* write.ptr = */ all_grids->levels[level]->interpolation.send_buffers[neighbor], + /* write.i = */ offset, + /* write.j = */ 0, + /* write.k = */ 0, + /* write.jStride = */ all_grids->levels[level-1]->box_dim, + /* write.kStride = */ all_grids->levels[level-1]->box_dim*all_grids->levels[level-1]->box_dim, + /* write.scale = */ 2, + /* blockcopy_i = */ BLOCKCOPY_TILE_I, // default + /* blockcopy_j = */ BLOCKCOPY_TILE_J, // default + /* blockcopy_k = */ BLOCKCOPY_TILE_K, // default + /* subtype = */ 0 + ); + offset+=elementSize; + } + all_grids->levels[level]->interpolation.send_ranks[neighbor] = fineRanks[neighbor]; + all_grids->levels[level]->interpolation.send_sizes[neighbor] = offset; + all_send_buffers+=offset; + } // neighbor + { + int fineBox; + for(fineBox=0;fineBoxmy_rank){ + // local interpolations... 
+ append_block_to_list(&(all_grids->levels[level]->interpolation.blocks[1]),&(all_grids->levels[level]->interpolation.allocated_blocks[1]),&(all_grids->levels[level]->interpolation.num_blocks[1]), + /* dim.i = */ all_grids->levels[level-1]->box_dim/2, + /* dim.j = */ all_grids->levels[level-1]->box_dim/2, + /* dim.k = */ all_grids->levels[level-1]->box_dim/2, + /* read.box = */ fineBoxes[fineBox].sendBox, + /* read.ptr = */ NULL, + /* read.i = */ fineBoxes[fineBox].i, + /* read.j = */ fineBoxes[fineBox].j, + /* read.k = */ fineBoxes[fineBox].k, + /* read.jStride = */ all_grids->levels[level]->my_boxes[fineBoxes[fineBox].sendBox].jStride, + /* read.kStride = */ all_grids->levels[level]->my_boxes[fineBoxes[fineBox].sendBox].kStride, + /* read.scale = */ 1, + /* write.box = */ fineBoxes[fineBox].recvBox, + /* write.ptr = */ NULL, + /* write.i = */ 0, + /* write.j = */ 0, + /* write.k = */ 0, + /* write.jStride = */ all_grids->levels[level-1]->my_boxes[fineBoxes[fineBox].recvBox].jStride, + /* write.kStride = */ all_grids->levels[level-1]->my_boxes[fineBoxes[fineBox].recvBox].kStride, + /* write.scale = */ 2, + /* blockcopy_i = */ BLOCKCOPY_TILE_I, // default + /* blockcopy_j = */ BLOCKCOPY_TILE_J, // default + /* blockcopy_k = */ BLOCKCOPY_TILE_K, // default + /* subtype = */ 0 + ); + } + } // local to local interpolation + + // free temporary storage... + free(fineBoxes); + free(fineRanks); + } // pack/send/local + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // construct recv(from level+1) and unpack... + if( (levelnum_levels-1) && (all_grids->levels[level]->num_my_boxes>0) ){ // not bottom *and* I have boxes to receive + + // construct the list of coarsened boxes and neighboring ranks that will be interpolated and sent to me... 
+ int numCoarseBoxes = all_grids->levels[level]->num_my_boxes; // I may receive a block for each of my boxes + int *coarseRanks = ( int*)malloc(numCoarseBoxes*sizeof( int)); // high water mark (assumes every neighboring box is a different process) + RP_type *coarseBoxes = (RP_type*)malloc(numCoarseBoxes*sizeof(RP_type)); + numCoarseBoxes = 0; + int fineBox; + for(fineBox=0;fineBoxlevels[level]->num_my_boxes;fineBox++){ + int fineBoxID = all_grids->levels[level]->my_boxes[fineBox].global_box_id; + int fineBox_i = all_grids->levels[level]->my_boxes[fineBox].low.i / all_grids->levels[level]->box_dim; + int fineBox_j = all_grids->levels[level]->my_boxes[fineBox].low.j / all_grids->levels[level]->box_dim; + int fineBox_k = all_grids->levels[level]->my_boxes[fineBox].low.k / all_grids->levels[level]->box_dim; + int coarseBox_i = fineBox_i*all_grids->levels[level+1]->boxes_in.i/all_grids->levels[level]->boxes_in.i; + int coarseBox_j = fineBox_j*all_grids->levels[level+1]->boxes_in.j/all_grids->levels[level]->boxes_in.j; + int coarseBox_k = fineBox_k*all_grids->levels[level+1]->boxes_in.k/all_grids->levels[level]->boxes_in.k; + int coarseBoxID = coarseBox_i + coarseBox_j*all_grids->levels[level+1]->boxes_in.i + coarseBox_k*all_grids->levels[level+1]->boxes_in.i*all_grids->levels[level+1]->boxes_in.j; + if(all_grids->levels[level]->my_rank != all_grids->levels[level+1]->rank_of_box[coarseBoxID]){ + coarseBoxes[numCoarseBoxes].sendRank = all_grids->levels[level+1]->rank_of_box[coarseBoxID]; + coarseBoxes[numCoarseBoxes].sendBoxID = coarseBoxID; + coarseBoxes[numCoarseBoxes].sendBox = -1; + coarseBoxes[numCoarseBoxes].recvRank = all_grids->levels[level ]->rank_of_box[ fineBoxID]; + coarseBoxes[numCoarseBoxes].recvBoxID = fineBoxID; + coarseBoxes[numCoarseBoxes].recvBox = fineBox; + coarseRanks[numCoarseBoxes] = all_grids->levels[level+1]->rank_of_box[coarseBoxID]; + numCoarseBoxes++; + } + } // my (fine) boxes + + // sort boxes by sendRank(==my rank) then by sendBoxID... 
ensures the sends and receive buffers are always sorted by sendBoxID... + qsort(coarseBoxes,numCoarseBoxes,sizeof(RP_type),qsortRP ); + // sort the lists of neighboring ranks and remove duplicates... + qsort(coarseRanks,numCoarseBoxes,sizeof( int),qsortInt); + int numCoarseRanks=0; + int _rank=-1;int neighbor=0; + for(neighbor=0;neighborlevels[level]->interpolation.num_recvs = numCoarseRanks; + all_grids->levels[level]->interpolation.recv_ranks = (int*)malloc(numCoarseRanks*sizeof(int)); + all_grids->levels[level]->interpolation.recv_sizes = (int*)malloc(numCoarseRanks*sizeof(int)); + all_grids->levels[level]->interpolation.recv_buffers = (double**)malloc(numCoarseRanks*sizeof(double*)); + if(numCoarseRanks>0){ + if(all_grids->levels[level]->interpolation.recv_ranks ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->interpolation.recv_ranks\n",level);exit(0);} + if(all_grids->levels[level]->interpolation.recv_sizes ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->interpolation.recv_sizes\n",level);exit(0);} + if(all_grids->levels[level]->interpolation.recv_buffers==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->interpolation.recv_buffers\n",level);exit(0);} + } + + int elementSize = all_grids->levels[level]->box_dim*all_grids->levels[level]->box_dim*all_grids->levels[level]->box_dim; + double * all_recv_buffers = (double*)malloc(numCoarseBoxes*elementSize*sizeof(double)); + if(numCoarseBoxes*elementSize>0) + if(all_recv_buffers==NULL){fprintf(stderr,"malloc failed - interpolation/all_recv_buffers\n");exit(0);} + memset(all_recv_buffers,0,numCoarseBoxes*elementSize*sizeof(double)); // DO NOT DELETE... you must initialize to 0 to avoid getting something like 0.0*NaN and corrupting the solve + //printf("level=%d, rank=%2d, recv_buffers=%6d\n",level,all_grids->my_rank,numCoarseBoxes*elementSize*sizeof(double)); + + // for each neighbor, construct the unpack list and allocate the MPI recv buffer... 
+ for(neighbor=0;neighborlevels[level]->interpolation.recv_buffers[neighbor] = all_recv_buffers; + for(coarseBox=0;coarseBoxlevels[level]->interpolation.blocks[2]),&(all_grids->levels[level]->interpolation.allocated_blocks[2]),&(all_grids->levels[level]->interpolation.num_blocks[2]), + /* dim.i = */ all_grids->levels[level]->box_dim, + /* dim.j = */ all_grids->levels[level]->box_dim, + /* dim.k = */ all_grids->levels[level]->box_dim, + /* read.box = */ -1, + /* read.ptr = */ all_grids->levels[level]->interpolation.recv_buffers[neighbor], + /* read.i = */ offset, + /* read.j = */ 0, + /* read.k = */ 0, + /* read.jStride = */ all_grids->levels[level]->box_dim, + /* read.kStride = */ all_grids->levels[level]->box_dim*all_grids->levels[level]->box_dim, + /* read.scale = */ 1, + /* write.box = */ coarseBoxes[coarseBox].recvBox, + /* write.ptr = */ NULL, + /* write.i = */ 0, + /* write.j = */ 0, + /* write.k = */ 0, + /* write.jStride = */ all_grids->levels[level]->my_boxes[coarseBoxes[coarseBox].recvBox].jStride, + /* write.kStride = */ all_grids->levels[level]->my_boxes[coarseBoxes[coarseBox].recvBox].kStride, + /* write.scale = */ 1, + /* blockcopy_i = */ BLOCKCOPY_TILE_I, // default + /* blockcopy_j = */ BLOCKCOPY_TILE_J, // default + /* blockcopy_k = */ BLOCKCOPY_TILE_K, // default + /* subtype = */ 0 + ); + offset+=elementSize; + } + all_grids->levels[level]->interpolation.recv_ranks[neighbor] = coarseRanks[neighbor]; + all_grids->levels[level]->interpolation.recv_sizes[neighbor] = offset; + all_recv_buffers+=offset; + } // neighbor + + // free temporary storage... 
+ free(coarseBoxes); + free(coarseRanks); + } // recv/unpack + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + } // all levels + + + #ifdef USE_MPI + for(level=0;levelnum_levels;level++){ + all_grids->levels[level]->interpolation.requests = NULL; + all_grids->levels[level]->interpolation.status = NULL; + if(levelnum_levels-1){ // i.e. bottom never calls interpolation() + // by convention, level_f allocates a combined array of requests for both level_f recvs and level_c sends... + int nMessages = all_grids->levels[level+1]->interpolation.num_sends + all_grids->levels[level]->interpolation.num_recvs; + all_grids->levels[level]->interpolation.requests = (MPI_Request*)malloc(nMessages*sizeof(MPI_Request)); + all_grids->levels[level]->interpolation.status = (MPI_Status *)malloc(nMessages*sizeof(MPI_Status )); + } + } + #endif +} + + +//---------------------------------------------------------------------------------------------------------------------------------------------------- +// build a list of operations and MPI buffers to affect distributed restriction +// the three lists constitute +// - buffer packing (i.e. restrict a local box and place the result in an MPI buffer to be sent to a remote coarse grid process) +// - local operations (i.e. restrict a local box and place the result in another local box or region of another local box) +// - buffer upacking (i.e. copy restricted data recieved from another process into a local box or region of a local box) +void build_restriction(mg_type *all_grids, int restrictionType){ + int level; + for(level=0;levelnum_levels;level++){ + + // initialize to defaults... 
+ all_grids->levels[level]->restriction[restrictionType].num_recvs = 0; + all_grids->levels[level]->restriction[restrictionType].num_sends = 0; + all_grids->levels[level]->restriction[restrictionType].recv_ranks = NULL; + all_grids->levels[level]->restriction[restrictionType].send_ranks = NULL; + all_grids->levels[level]->restriction[restrictionType].recv_sizes = NULL; + all_grids->levels[level]->restriction[restrictionType].send_sizes = NULL; + all_grids->levels[level]->restriction[restrictionType].recv_buffers = NULL; + all_grids->levels[level]->restriction[restrictionType].send_buffers = NULL; + all_grids->levels[level]->restriction[restrictionType].blocks[0] = NULL; + all_grids->levels[level]->restriction[restrictionType].blocks[1] = NULL; + all_grids->levels[level]->restriction[restrictionType].blocks[2] = NULL; + all_grids->levels[level]->restriction[restrictionType].allocated_blocks[0] = 0; + all_grids->levels[level]->restriction[restrictionType].allocated_blocks[1] = 0; + all_grids->levels[level]->restriction[restrictionType].allocated_blocks[2] = 0; + all_grids->levels[level]->restriction[restrictionType].num_blocks[0] = 0; // number of unpack/insert operations = number of boxes on level+1 that I don't own and restrict to + all_grids->levels[level]->restriction[restrictionType].num_blocks[1] = 0; // number of unpack/insert operations = number of boxes on level+1 that I own and restrict to + all_grids->levels[level]->restriction[restrictionType].num_blocks[2] = 0; // number of unpack/insert operations = number of boxes on level-1 that I don't own that restrict to me + #ifdef USE_MPI + all_grids->levels[level]->restriction[restrictionType].requests = NULL; + all_grids->levels[level]->restriction[restrictionType].status = NULL; + #endif + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // construct pack, send, and local... 
+ if( (levelnum_levels-1) && (all_grids->levels[level]->num_my_boxes>0) ){ // not bottom *and* I have boxes to send + + // construct the list of coarsened boxes and neighboring ranks... + int numCoarseBoxes = (all_grids->levels[level]->boxes_in.i/all_grids->levels[level+1]->boxes_in.i)* + (all_grids->levels[level]->boxes_in.j/all_grids->levels[level+1]->boxes_in.j)* + (all_grids->levels[level]->boxes_in.k/all_grids->levels[level+1]->boxes_in.k)* + all_grids->levels[level]->num_my_boxes; + int *coarseRanks = ( int*)malloc(numCoarseBoxes*sizeof( int)); // high water mark (assumes every neighboring box is a different process) + RP_type *coarseBoxes = (RP_type*)malloc(numCoarseBoxes*sizeof(RP_type)); + numCoarseBoxes = 0; + int numCoarseBoxesLocal = 0; + int numCoarseBoxesRemote = 0; + int fineBox; + for(fineBox=0;fineBoxlevels[level]->num_my_boxes;fineBox++){ + int fineBoxID = all_grids->levels[level]->my_boxes[fineBox].global_box_id; + int fineBox_i = all_grids->levels[level]->my_boxes[fineBox].low.i / all_grids->levels[level]->box_dim; + int fineBox_j = all_grids->levels[level]->my_boxes[fineBox].low.j / all_grids->levels[level]->box_dim; + int fineBox_k = all_grids->levels[level]->my_boxes[fineBox].low.k / all_grids->levels[level]->box_dim; + int coarseBox_i = fineBox_i*all_grids->levels[level+1]->boxes_in.i/all_grids->levels[level]->boxes_in.i; + int coarseBox_j = fineBox_j*all_grids->levels[level+1]->boxes_in.j/all_grids->levels[level]->boxes_in.j; + int coarseBox_k = fineBox_k*all_grids->levels[level+1]->boxes_in.k/all_grids->levels[level]->boxes_in.k; + int coarseBoxID = coarseBox_i + coarseBox_j*all_grids->levels[level+1]->boxes_in.i + coarseBox_k*all_grids->levels[level+1]->boxes_in.i*all_grids->levels[level+1]->boxes_in.j; + int coarseBox = -1;int c;for(c=0;clevels[level+1]->num_my_boxes;c++)if( all_grids->levels[level+1]->my_boxes[c].global_box_id == coarseBoxID )coarseBox=c; // try and find the coarseBox index of a box with global_box_id == coaseBoxID + 
coarseBoxes[numCoarseBoxes].sendRank = all_grids->levels[level ]->rank_of_box[ fineBoxID]; + coarseBoxes[numCoarseBoxes].sendBoxID = fineBoxID; + coarseBoxes[numCoarseBoxes].sendBox = fineBox; + coarseBoxes[numCoarseBoxes].recvRank = all_grids->levels[level+1]->rank_of_box[coarseBoxID]; + coarseBoxes[numCoarseBoxes].recvBoxID = coarseBoxID; + coarseBoxes[numCoarseBoxes].recvBox = coarseBox; // -1 if off-node + coarseBoxes[numCoarseBoxes].i = (all_grids->levels[level]->box_dim/2)*( fineBox_i % (all_grids->levels[level]->boxes_in.i/all_grids->levels[level+1]->boxes_in.i) ); + coarseBoxes[numCoarseBoxes].j = (all_grids->levels[level]->box_dim/2)*( fineBox_j % (all_grids->levels[level]->boxes_in.j/all_grids->levels[level+1]->boxes_in.j) ); + coarseBoxes[numCoarseBoxes].k = (all_grids->levels[level]->box_dim/2)*( fineBox_k % (all_grids->levels[level]->boxes_in.k/all_grids->levels[level+1]->boxes_in.k) ); + numCoarseBoxes++; + if(all_grids->levels[level]->my_rank != all_grids->levels[level+1]->rank_of_box[coarseBoxID]){ + coarseRanks[numCoarseBoxesRemote++] = all_grids->levels[level+1]->rank_of_box[coarseBoxID]; + }else{numCoarseBoxesLocal++;} + } // my (fine) boxes + + // sort boxes by sendRank(==my rank) then by sendBoxID... ensures the sends and receive buffers are always sorted by sendBoxID... + qsort(coarseBoxes,numCoarseBoxes ,sizeof(RP_type),qsortRP ); + // sort the lists of neighboring ranks and remove duplicates... 
+ qsort(coarseRanks,numCoarseBoxesRemote,sizeof( int),qsortInt); + int numCoarseRanks=0; + int _rank=-1;int neighbor=0; + for(neighbor=0;neighborlevels[level]->restriction[restrictionType].num_sends = numCoarseRanks; + all_grids->levels[level]->restriction[restrictionType].send_ranks = (int*)malloc(numCoarseRanks*sizeof(int)); + all_grids->levels[level]->restriction[restrictionType].send_sizes = (int*)malloc(numCoarseRanks*sizeof(int)); + all_grids->levels[level]->restriction[restrictionType].send_buffers = (double**)malloc(numCoarseRanks*sizeof(double*)); + if(numCoarseRanks>0){ + if(all_grids->levels[level]->restriction[restrictionType].send_ranks ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->restriction[restrictionType].send_ranks\n",level);exit(0);} + if(all_grids->levels[level]->restriction[restrictionType].send_sizes ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->restriction[restrictionType].send_sizes\n",level);exit(0);} + if(all_grids->levels[level]->restriction[restrictionType].send_buffers==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->restriction[restrictionType].send_buffers\n",level);exit(0);} + } + + int elementSize; + int restrict_dim_i=-1; + int restrict_dim_j=-1; + int restrict_dim_k=-1; + switch(restrictionType){ + case RESTRICT_CELL : restrict_dim_i = ( all_grids->levels[level]->box_dim/2); + restrict_dim_j = ( all_grids->levels[level]->box_dim/2); + restrict_dim_k = ( all_grids->levels[level]->box_dim/2);break; + case RESTRICT_FACE_I : restrict_dim_i = (1+all_grids->levels[level]->box_dim/2); + restrict_dim_j = ( all_grids->levels[level]->box_dim/2); + restrict_dim_k = ( all_grids->levels[level]->box_dim/2);break; + case RESTRICT_FACE_J : restrict_dim_i = ( all_grids->levels[level]->box_dim/2); + restrict_dim_j = (1+all_grids->levels[level]->box_dim/2); + restrict_dim_k = ( all_grids->levels[level]->box_dim/2);break; + case RESTRICT_FACE_K : restrict_dim_i = ( all_grids->levels[level]->box_dim/2); 
+ restrict_dim_j = ( all_grids->levels[level]->box_dim/2); + restrict_dim_k = (1+all_grids->levels[level]->box_dim/2);break; + } + elementSize = restrict_dim_i*restrict_dim_j*restrict_dim_k; + + double * all_send_buffers = (double*)malloc(numCoarseBoxesRemote*elementSize*sizeof(double)); + if(numCoarseBoxesRemote*elementSize>0) + if(all_send_buffers==NULL){fprintf(stderr,"malloc failed - restriction/all_send_buffers\n");exit(0);} + memset(all_send_buffers,0,numCoarseBoxesRemote*elementSize*sizeof(double)); // DO NOT DELETE... you must initialize to 0 to avoid getting something like 0.0*NaN and corrupting the solve + + // for each neighbor, construct the pack list and allocate the MPI send buffer... + for(neighbor=0;neighborlevels[level]->restriction[restrictionType].send_buffers[neighbor] = all_send_buffers; + for(coarseBox=0;coarseBoxlevels[level]->restriction[restrictionType].blocks[0]), + &(all_grids->levels[level]->restriction[restrictionType].allocated_blocks[0]), + &(all_grids->levels[level]->restriction[restrictionType].num_blocks[0]), + /* dim.i = */ restrict_dim_i, + /* dim.j = */ restrict_dim_j, + /* dim.k = */ restrict_dim_k, + /* read.box = */ coarseBoxes[coarseBox].sendBox, + /* read.ptr = */ NULL, + /* read.i = */ 0, + /* read.j = */ 0, + /* read.k = */ 0, + /* read.jStride = */ all_grids->levels[level]->my_boxes[coarseBoxes[coarseBox].sendBox].jStride, + /* read.kStride = */ all_grids->levels[level]->my_boxes[coarseBoxes[coarseBox].sendBox].kStride, + /* read.scale = */ 2, + /* write.box = */ -1, + /* write.ptr = */ all_grids->levels[level]->restriction[restrictionType].send_buffers[neighbor], + /* write.i = */ offset, + /* write.j = */ 0, + /* write.k = */ 0, + /* write.jStride = */ restrict_dim_i, + /* write.kStride = */ restrict_dim_i*restrict_dim_j, + /* write.scale = */ 1, + /* blockcopy_i = */ BLOCKCOPY_TILE_I, // default + /* blockcopy_j = */ BLOCKCOPY_TILE_J, // default + /* blockcopy_k = */ BLOCKCOPY_TILE_K, // default + /* subtype = */ 0 + 
); + offset+=elementSize; + } + all_grids->levels[level]->restriction[restrictionType].send_ranks[neighbor] = coarseRanks[neighbor]; + all_grids->levels[level]->restriction[restrictionType].send_sizes[neighbor] = offset; + all_send_buffers+=offset; + } + // for construct the local restriction list... + { + int coarseBox; + for(coarseBox=0;coarseBoxlevels[level+1]->my_rank){ + // restrict to local... + append_block_to_list( &(all_grids->levels[level]->restriction[restrictionType].blocks[1]), + &(all_grids->levels[level]->restriction[restrictionType].allocated_blocks[1]), + &(all_grids->levels[level]->restriction[restrictionType].num_blocks[1]), + /* dim.i = */ restrict_dim_i, + /* dim.j = */ restrict_dim_j, + /* dim.k = */ restrict_dim_k, + /* read.box = */ coarseBoxes[coarseBox].sendBox, + /* read.ptr = */ NULL, + /* read.i = */ 0, + /* read.j = */ 0, + /* read.k = */ 0, + /* read.jStride = */ all_grids->levels[level]->my_boxes[coarseBoxes[coarseBox].sendBox].jStride, + /* read.kStride = */ all_grids->levels[level]->my_boxes[coarseBoxes[coarseBox].sendBox].kStride, + /* read.scale = */ 2, + /* write.box = */ coarseBoxes[coarseBox].recvBox, + /* write.ptr = */ NULL, + /* write.i = */ coarseBoxes[coarseBox].i, + /* write.j = */ coarseBoxes[coarseBox].j, + /* write.k = */ coarseBoxes[coarseBox].k, + /* write.jStride = */ all_grids->levels[level+1]->my_boxes[coarseBoxes[coarseBox].recvBox].jStride, + /* write.kStride = */ all_grids->levels[level+1]->my_boxes[coarseBoxes[coarseBox].recvBox].kStride, + /* write.scale = */ 1, + /* blockcopy_i = */ BLOCKCOPY_TILE_I, // default + /* blockcopy_j = */ BLOCKCOPY_TILE_J, // default + /* blockcopy_k = */ BLOCKCOPY_TILE_K, // default + /* subtype = */ 0 + ); + } + } // local to local + + // free temporary storage... 
+ free(coarseBoxes); + free(coarseRanks); + } // send/pack/local + + + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // construct recv and unpack... + if( (level>0) && (all_grids->levels[level]->num_my_boxes>0) ){ // not top *and* I have boxes to receive + // construct a list of fine boxes to be coarsened and sent to me... + int numFineBoxesMax = (all_grids->levels[level-1]->boxes_in.i/all_grids->levels[level]->boxes_in.i)* + (all_grids->levels[level-1]->boxes_in.j/all_grids->levels[level]->boxes_in.j)* + (all_grids->levels[level-1]->boxes_in.k/all_grids->levels[level]->boxes_in.k)* + all_grids->levels[level]->num_my_boxes; + int *fineRanks = ( int*)malloc(numFineBoxesMax*sizeof( int)); // high water mark (assumes every neighboring box is a different process) + RP_type *fineBoxes = (RP_type*)malloc(numFineBoxesMax*sizeof(RP_type)); + int numFineBoxesRemote = 0; + int coarseBox; + for(coarseBox=0;coarseBoxlevels[level]->num_my_boxes;coarseBox++){ + int bi,bj,bk; + int coarseBoxID = all_grids->levels[level]->my_boxes[coarseBox].global_box_id; + int coarseBox_i = all_grids->levels[level]->my_boxes[coarseBox].low.i / all_grids->levels[level]->box_dim; + int coarseBox_j = all_grids->levels[level]->my_boxes[coarseBox].low.j / all_grids->levels[level]->box_dim; + int coarseBox_k = all_grids->levels[level]->my_boxes[coarseBox].low.k / all_grids->levels[level]->box_dim; + for(bk=0;bklevels[level-1]->boxes_in.k/all_grids->levels[level]->boxes_in.k;bk++){ + for(bj=0;bjlevels[level-1]->boxes_in.j/all_grids->levels[level]->boxes_in.j;bj++){ + for(bi=0;bilevels[level-1]->boxes_in.i/all_grids->levels[level]->boxes_in.i;bi++){ + int fineBox_i = (all_grids->levels[level-1]->boxes_in.i/all_grids->levels[level]->boxes_in.i)*coarseBox_i + bi; + int fineBox_j = (all_grids->levels[level-1]->boxes_in.j/all_grids->levels[level]->boxes_in.j)*coarseBox_j + bj; + int fineBox_k = 
(all_grids->levels[level-1]->boxes_in.k/all_grids->levels[level]->boxes_in.k)*coarseBox_k + bk; + int fineBoxID = fineBox_i + fineBox_j*all_grids->levels[level-1]->boxes_in.i + fineBox_k*all_grids->levels[level-1]->boxes_in.i*all_grids->levels[level-1]->boxes_in.j; + if(all_grids->levels[level-1]->rank_of_box[fineBoxID] != all_grids->levels[level]->my_rank){ + fineBoxes[numFineBoxesRemote].sendRank = all_grids->levels[level-1]->rank_of_box[ fineBoxID]; + fineBoxes[numFineBoxesRemote].sendBoxID = fineBoxID; + fineBoxes[numFineBoxesRemote].sendBox = -1; // I don't know the off-node box index + fineBoxes[numFineBoxesRemote].recvRank = all_grids->levels[level ]->rank_of_box[coarseBoxID]; + fineBoxes[numFineBoxesRemote].recvBoxID = coarseBoxID; + fineBoxes[numFineBoxesRemote].recvBox = coarseBox; + fineBoxes[numFineBoxesRemote].i = bi*all_grids->levels[level-1]->box_dim/2; + fineBoxes[numFineBoxesRemote].j = bj*all_grids->levels[level-1]->box_dim/2; + fineBoxes[numFineBoxesRemote].k = bk*all_grids->levels[level-1]->box_dim/2; + fineRanks[numFineBoxesRemote] = all_grids->levels[level-1]->rank_of_box[fineBoxID]; + numFineBoxesRemote++; + } + }}} + } // my (coarse) boxes + // sort boxes by sendRank(==my rank) then by sendBoxID... ensures the sends and receive buffers are always sorted by sendBoxID... + qsort(fineBoxes,numFineBoxesRemote,sizeof(RP_type),qsortRP ); + // sort the lists of neighboring ranks and remove duplicates... 
+ qsort(fineRanks,numFineBoxesRemote,sizeof( int),qsortInt); + int numFineRanks=0; + int _rank=-1;int neighbor=0; + for(neighbor=0;neighborlevels[level]->restriction[restrictionType].num_recvs = numFineRanks; + all_grids->levels[level]->restriction[restrictionType].recv_ranks = (int*)malloc(numFineRanks*sizeof(int)); + all_grids->levels[level]->restriction[restrictionType].recv_sizes = (int*)malloc(numFineRanks*sizeof(int)); + all_grids->levels[level]->restriction[restrictionType].recv_buffers = (double**)malloc(numFineRanks*sizeof(double*)); + if(numFineRanks>0){ + if(all_grids->levels[level]->restriction[restrictionType].recv_ranks ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->restriction[restrictionType].recv_ranks \n",level);exit(0);} + if(all_grids->levels[level]->restriction[restrictionType].recv_sizes ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->restriction[restrictionType].recv_sizes \n",level);exit(0);} + if(all_grids->levels[level]->restriction[restrictionType].recv_buffers==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->restriction[restrictionType].recv_buffers\n",level);exit(0);} + } + + int elementSize; + int restrict_dim_i=-1; + int restrict_dim_j=-1; + int restrict_dim_k=-1; + switch(restrictionType){ + case RESTRICT_CELL : restrict_dim_i = ( all_grids->levels[level-1]->box_dim/2); + restrict_dim_j = ( all_grids->levels[level-1]->box_dim/2); + restrict_dim_k = ( all_grids->levels[level-1]->box_dim/2);break; + case RESTRICT_FACE_I : restrict_dim_i = (1+all_grids->levels[level-1]->box_dim/2); + restrict_dim_j = ( all_grids->levels[level-1]->box_dim/2); + restrict_dim_k = ( all_grids->levels[level-1]->box_dim/2);break; + case RESTRICT_FACE_J : restrict_dim_i = ( all_grids->levels[level-1]->box_dim/2); + restrict_dim_j = (1+all_grids->levels[level-1]->box_dim/2); + restrict_dim_k = ( all_grids->levels[level-1]->box_dim/2);break; + case RESTRICT_FACE_K : restrict_dim_i = ( 
all_grids->levels[level-1]->box_dim/2); + restrict_dim_j = ( all_grids->levels[level-1]->box_dim/2); + restrict_dim_k = (1+all_grids->levels[level-1]->box_dim/2);break; + } + elementSize = restrict_dim_i*restrict_dim_j*restrict_dim_k; + + double * all_recv_buffers = (double*)malloc(numFineBoxesRemote*elementSize*sizeof(double)); + if(numFineBoxesRemote*elementSize>0) + if(all_recv_buffers==NULL){fprintf(stderr,"malloc failed - restriction/all_recv_buffers\n");exit(0);} + memset(all_recv_buffers,0,numFineBoxesRemote*elementSize*sizeof(double)); // DO NOT DELETE... you must initialize to 0 to avoid getting something like 0.0*NaN and corrupting the solve + //printf("level=%d, rank=%2d, recv_buffers=%6d\n",level,all_grids->my_rank,numFineBoxesRemote*elementSize*sizeof(double)); + + // for each neighbor, construct the unpack list and allocate the MPI recv buffer... + for(neighbor=0;neighborlevels[level]->restriction[restrictionType].recv_buffers[neighbor] = all_recv_buffers; + for(fineBox=0;fineBoxlevels[level]->restriction[restrictionType].blocks[2]), + &(all_grids->levels[level]->restriction[restrictionType].allocated_blocks[2]), + &(all_grids->levels[level]->restriction[restrictionType].num_blocks[2]), + /* dim.i = */ restrict_dim_i, + /* dim.j = */ restrict_dim_j, + /* dim.k = */ restrict_dim_k, + /* read.box = */ -1, + /* read.ptr = */ all_grids->levels[level]->restriction[restrictionType].recv_buffers[neighbor], + /* read.i = */ offset, + /* read.j = */ 0, + /* read.k = */ 0, + /* read.jStride = */ restrict_dim_i, + /* read.kStride = */ restrict_dim_i*restrict_dim_j, + /* read.scale = */ 1, + /* write.box = */ fineBoxes[fineBox].recvBox, + /* write.ptr = */ NULL, + /* write.i = */ fineBoxes[fineBox].i, + /* write.j = */ fineBoxes[fineBox].j, + /* write.k = */ fineBoxes[fineBox].k, + /* write.jStride = */ all_grids->levels[level]->my_boxes[fineBoxes[fineBox].recvBox].jStride, + /* write.kStride = */ 
all_grids->levels[level]->my_boxes[fineBoxes[fineBox].recvBox].kStride, + /* write.scale = */ 1, + /* blockcopy_i = */ BLOCKCOPY_TILE_I, // default + /* blockcopy_j = */ BLOCKCOPY_TILE_J, // default + /* blockcopy_k = */ BLOCKCOPY_TILE_K, // default + /* subtype = */ 0 + ); + offset+=elementSize; + } + all_grids->levels[level]->restriction[restrictionType].recv_ranks[neighbor] = fineRanks[neighbor]; + all_grids->levels[level]->restriction[restrictionType].recv_sizes[neighbor] = offset; + all_recv_buffers+=offset; + } // neighbor + + // free temporary storage... + free(fineBoxes); + free(fineRanks); + } // recv/unpack + + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + } // level loop + + + #ifdef USE_MPI + for(level=0;levelnum_levels;level++){ + all_grids->levels[level]->restriction[restrictionType].requests = NULL; + all_grids->levels[level]->restriction[restrictionType].status = NULL; + if(levelnum_levels-1){ // bottom never calls restriction() + // by convention, level_f allocates a combined array of requests for both level_f sends and level_c recvs... + int nMessages = all_grids->levels[level+1]->restriction[restrictionType].num_recvs + all_grids->levels[level]->restriction[restrictionType].num_sends; + all_grids->levels[level]->restriction[restrictionType].requests = (MPI_Request*)malloc(nMessages*sizeof(MPI_Request)); + all_grids->levels[level]->restriction[restrictionType].status = (MPI_Status *)malloc(nMessages*sizeof(MPI_Status )); + } + } + #endif +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// given a fine grid input, build a hiearchy of MG levels +// level 0 simply points to fine_grid. 
All other levels are created +// rebuild the restriction/interpolation lists for each coarse grid level +// rebuild the operator on each coarse grid level +// add extra vectors to the coarse grid once here instead of on every call to the coarse grid solve +// NOTE, this routine presumes the fine_grid domain is cubical... fine_grid->dim.i==fine_grid->dim.j==fine_grid->dim.k +// NOTE, as this function is not timed, it has not been optimzied for performance +void MGBuild(mg_type *all_grids, level_type *fine_grid, double a, double b, int minCoarseGridDim, const MPI_Comm comm){ + int maxLevels=100; // i.e. maximum problem size is (2^100)^3 + int nProcs[100]; + int dim_i[100]; + int boxes_in_i[100]; + int box_dim[100]; + int box_ghosts[100]; + all_grids->my_rank = fine_grid->my_rank; + all_grids->timers.MGBuild = 0; + double _timeStartMGBuild = getTime(); + + // calculate how deep we can make the v-cycle... + int level=1; + int coarse_dim = fine_grid->dim.i; +//if(fine_grid->dim.jdim.j; +//if(fine_grid->dim.kdim.k; + while( (coarse_dim>=2*minCoarseGridDim) && ((coarse_dim&0x1)==0) ){ // grid dimension is even and big enough... + level++; + coarse_dim = coarse_dim / 2; + }if(levelnum_ranks; + dim_i[0] = fine_grid->dim.i; + boxes_in_i[0] = fine_grid->boxes_in.i; + box_dim[0] = fine_grid->box_dim; + box_ghosts[0] = fine_grid->box_ghosts; + + // build the list of levels... + all_grids->levels = (level_type**)malloc(maxLevels*sizeof(level_type*)); + if(all_grids->levels == NULL){fprintf(stderr,"malloc failed - MGBuild/all_grids->levels\n");exit(0);} + all_grids->num_levels=1; + all_grids->levels[0] = fine_grid; + + + // build a table to guide the construction of the v-cycle... + int doRestrict=1;if(maxLevels<2)doRestrict=0; // i.e. can't restrict if there is only one level !!! 
+ #ifdef USE_UCYCLES + while(doRestrict){ + level = all_grids->num_levels; + doRestrict=0; + if( (box_dim[level-1] % 2 == 0) ){ + nProcs[level] = nProcs[level-1]; + dim_i[level] = dim_i[level-1]/2; + box_dim[level] = box_dim[level-1]/2; + boxes_in_i[level] = boxes_in_i[level-1]; + box_ghosts[level] = box_ghosts[level-1]; + doRestrict = 1; + } + if(box_dim[level] < box_ghosts[level])doRestrict=0; + if(dim_i[level]num_levels++; + } + #else // TRUE V-Cycle... + while(doRestrict){ + level = all_grids->num_levels; + doRestrict=0; + int fine_box_dim = box_dim[level-1]; + int fine_nProcs = nProcs[level-1]; + int fine_dim_i = dim_i[level-1]; + int fine_boxes_in_i = boxes_in_i[level-1]; + if( (fine_box_dim % 2 == 0) && (fine_box_dim > MG_AGGLOMERATION_START) && ((fine_box_dim/2)>=stencil_get_radius()) ){ // Boxes are too big to agglomerate + nProcs[level] = fine_nProcs; + dim_i[level] = fine_dim_i/2; + box_dim[level] = fine_box_dim/2; // FIX, verify its not less than the stencil radius + boxes_in_i[level] = fine_boxes_in_i; + box_ghosts[level] = box_ghosts[level-1]; + doRestrict = 1; + }else + if( (fine_boxes_in_i % 2 == 0) && ((fine_box_dim)>=stencil_get_radius()) ){ // 8:1 box agglomeration + nProcs[level] = fine_nProcs; + dim_i[level] = fine_dim_i/2; + box_dim[level] = fine_box_dim; + boxes_in_i[level] = fine_boxes_in_i/2; + box_ghosts[level] = box_ghosts[level-1]; + doRestrict = 1; + }else + if( (coarse_dim != 1) && (fine_dim_i == 2*coarse_dim) && ((fine_dim_i/2)>=stencil_get_radius()) ){ // agglomerate everything + nProcs[level] = 1; + dim_i[level] = fine_dim_i/2; + box_dim[level] = fine_dim_i/2; // FIX, verify its not less than the stencil radius + boxes_in_i[level] = 1; + box_ghosts[level] = box_ghosts[level-1]; + doRestrict = 1; + }else + if( (coarse_dim != 1) && (fine_dim_i == 4*coarse_dim) && ((fine_box_dim/2)>=stencil_get_radius()) ){ // restrict box dimension, and run on fewer ranks + nProcs[level] = coarse_dim=stencil_get_radius()) ){ // restrict box dimension, 
and run on fewer ranks + nProcs[level] = coarse_dim*coarse_dim=stencil_get_radius()) ){ // restrict box dimension, and run on the same number of ranks + nProcs[level] = fine_nProcs; + dim_i[level] = fine_dim_i/2; + box_dim[level] = fine_box_dim/2; // FIX, verify its not less than the stencil radius + boxes_in_i[level] = fine_boxes_in_i; + box_ghosts[level] = box_ghosts[level-1]; + doRestrict = 1; + } + if(dim_i[level]num_levels++; + } + #endif + + + // now build all the coarsened levels... + for(level=1;levelnum_levels;level++){ + all_grids->levels[level] = (level_type*)malloc(sizeof(level_type)); + if(all_grids->levels[level] == NULL){fprintf(stderr,"malloc failed - MGBuild/doRestrict\n");exit(0);} + create_level(all_grids->levels[level],boxes_in_i[level],box_dim[level],box_ghosts[level],all_grids->levels[level-1]->numVectors,all_grids->levels[level-1]->boundary_condition.type,all_grids->levels[level-1]->my_rank,nProcs[level], comm); + all_grids->levels[level]->h = 2.0*all_grids->levels[level-1]->h; + } + + + // bottom solver (level = all_grids->num_levels-1) gets extra vectors... + create_vectors(all_grids->levels[all_grids->num_levels-1],all_grids->levels[all_grids->num_levels-1]->numVectors + IterativeSolver_NumVectors() ); + + + // build the restriction and interpolation communicators... + if(all_grids->my_rank==0){fprintf(stdout,"\n Building restriction and interpolation lists... ");fflush(stdout);} + build_restriction(all_grids,RESTRICT_CELL ); // cell-centered + build_restriction(all_grids,RESTRICT_FACE_I); // face-centered, normal to i + build_restriction(all_grids,RESTRICT_FACE_J); // face-centered, normal to j + build_restriction(all_grids,RESTRICT_FACE_K); // face-centered, normal to k + build_interpolation(all_grids); + if(all_grids->my_rank==0){fprintf(stdout,"done\n");fflush(stdout);} + + + // build subcommunicators... 
+ #ifdef USE_MPI + #ifdef USE_SUBCOMM + if(all_grids->my_rank==0){fprintf(stdout,"\n");} + for(level=1;levelnum_levels;level++){ + double comm_split_start = MPI_Wtime(); + if(all_grids->my_rank==0){fprintf(stdout," Building MPI subcommunicator for level %d... ",level);fflush(stdout);} + all_grids->levels[level]->active=0; + int ll;for(ll=level;llnum_levels;ll++)if(all_grids->levels[ll]->num_my_boxes>0)all_grids->levels[level]->active=1; + MPI_Comm_split(comm, all_grids->levels[level]->active, all_grids->levels[level]->my_rank, &all_grids->levels[level]->MPI_COMM_ALLREDUCE); + double comm_split_end = MPI_Wtime(); + double comm_split_time_send = comm_split_end-comm_split_start; + double comm_split_time = 0; + MPI_Allreduce(&comm_split_time_send,&comm_split_time,1,MPI_DOUBLE,MPI_MAX,all_grids->levels[level]->MPI_COMM_ALLREDUCE); + if(all_grids->my_rank==0){fprintf(stdout,"done (%0.6f seconds)\n",comm_split_time);fflush(stdout);} + } + #endif + #endif + + + // rebuild various coefficients for the operator... must occur after build_restriction !!! + if(all_grids->my_rank==0){fprintf(stdout,"\n");} + for(level=1;levelnum_levels;level++){ + rebuild_operator(all_grids->levels[level],(level>0)?all_grids->levels[level-1]:NULL,a,b); + } + if(all_grids->my_rank==0){fprintf(stdout,"\n");} + + + // quick tests for Poisson, Neumann, etc... + for(level=0;levelnum_levels;level++){ + all_grids->levels[level]->must_subtract_mean = 0; + int alpha_is_zero = (dot(all_grids->levels[level],VECTOR_ALPHA,VECTOR_ALPHA) == 0.0); + // For Poisson with Periodic Boundary Conditions, by convention we assume the solution sums to zero. Eliminate any constants from the solution by subtracting the mean. 
+ if( (all_grids->levels[level]->boundary_condition.type==BC_PERIODIC) && ((a==0) || (alpha_is_zero==1)) )all_grids->levels[level]->must_subtract_mean = 1; + } + + + all_grids->timers.MGBuild += (double)(getTime()-_timeStartMGBuild); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// deallocate all memory created in the MG hierarchy +// WARNING, this will free the fine_grid level as well (FIX?) +void MGDestroy(mg_type *all_grids){ + int level; + int i; + + #ifdef USE_MPI + #ifdef USE_SUBCOMM + // only MGBuild creates subcommunicators (level_create assigns) + for(level=all_grids->num_levels-1;level>0;level--){ + if(all_grids->levels[level]->MPI_COMM_ALLREDUCE != MPI_COMM_WORLD) + MPI_Comm_free(&all_grids->levels[level]->MPI_COMM_ALLREDUCE); + } + #endif + #endif + + if(all_grids->my_rank==0){fprintf(stdout,"attempting to free the restriction and interpolation lists... ");fflush(stdout);} + for(level=all_grids->num_levels-1;level>=0;level--){ + // destroy restriction mini program created by MGBuild... 
+ for(i=0;i<4;i++){ + if(all_grids->levels[level]->restriction[i].num_recvs>0){ + //for(j=0;jlevels[level]->restriction[i].num_recvs;j++)if(all_grids->levels[level]->restriction[i].recv_buffers[j])free(all_grids->levels[level]->restriction[i].recv_buffers[j]); + if(all_grids->levels[level]->restriction[i].recv_buffers[0])free(all_grids->levels[level]->restriction[i].recv_buffers[0]); // allocated in bulk + if(all_grids->levels[level]->restriction[i].recv_buffers )free(all_grids->levels[level]->restriction[i].recv_buffers ); + if(all_grids->levels[level]->restriction[i].recv_ranks )free(all_grids->levels[level]->restriction[i].recv_ranks ); + if(all_grids->levels[level]->restriction[i].recv_sizes )free(all_grids->levels[level]->restriction[i].recv_sizes ); + } + if(all_grids->levels[level]->restriction[i].num_sends>0){ + //for(j=0;jlevels[level]->restriction[i].num_sends;j++)if(all_grids->levels[level]->restriction[i].send_buffers[j])free(all_grids->levels[level]->restriction[i].send_buffers[j]); + if(all_grids->levels[level]->restriction[i].send_buffers[0])free(all_grids->levels[level]->restriction[i].send_buffers[0]); // allocated in bulk + if(all_grids->levels[level]->restriction[i].send_buffers )free(all_grids->levels[level]->restriction[i].send_buffers ); + if(all_grids->levels[level]->restriction[i].send_ranks )free(all_grids->levels[level]->restriction[i].send_ranks ); + if(all_grids->levels[level]->restriction[i].send_sizes )free(all_grids->levels[level]->restriction[i].send_sizes ); + } + if(all_grids->levels[level]->restriction[i].blocks[0] )free(all_grids->levels[level]->restriction[i].blocks[0] ); + if(all_grids->levels[level]->restriction[i].blocks[1] )free(all_grids->levels[level]->restriction[i].blocks[1] ); + if(all_grids->levels[level]->restriction[i].blocks[2] )free(all_grids->levels[level]->restriction[i].blocks[2] ); + #ifdef USE_MPI + if(all_grids->levels[level]->restriction[i].requests )free(all_grids->levels[level]->restriction[i].requests ); 
+ if(all_grids->levels[level]->restriction[i].status )free(all_grids->levels[level]->restriction[i].status ); + #endif + } + + // destroy interpolation mini program created by MGBuild... + if(all_grids->levels[level]->interpolation.num_recvs>0){ + //for(j=0;jlevels[level]->interpolation.num_recvs;j++)if(all_grids->levels[level]->interpolation.recv_buffers[j])free(all_grids->levels[level]->interpolation.recv_buffers[j]); + if(all_grids->levels[level]->interpolation.recv_buffers[0])free(all_grids->levels[level]->interpolation.recv_buffers[0]); // allocated in bulk + if(all_grids->levels[level]->interpolation.recv_buffers )free(all_grids->levels[level]->interpolation.recv_buffers ); + if(all_grids->levels[level]->interpolation.recv_ranks )free(all_grids->levels[level]->interpolation.recv_ranks ); + if(all_grids->levels[level]->interpolation.recv_sizes )free(all_grids->levels[level]->interpolation.recv_sizes ); + } + if(all_grids->levels[level]->interpolation.num_sends>0){ + //for(j=0;jlevels[level]->interpolation.num_sends;j++)if(all_grids->levels[level]->interpolation.send_buffers[j])free(all_grids->levels[level]->interpolation.send_buffers[j]); + if(all_grids->levels[level]->interpolation.send_buffers[0])free(all_grids->levels[level]->interpolation.send_buffers[0]); // allocated in bulk + if(all_grids->levels[level]->interpolation.send_buffers )free(all_grids->levels[level]->interpolation.send_buffers ); + if(all_grids->levels[level]->interpolation.send_ranks )free(all_grids->levels[level]->interpolation.send_ranks ); + if(all_grids->levels[level]->interpolation.send_sizes )free(all_grids->levels[level]->interpolation.send_sizes ); + } + if(all_grids->levels[level]->interpolation.blocks[0] )free(all_grids->levels[level]->interpolation.blocks[0] ); + if(all_grids->levels[level]->interpolation.blocks[1] )free(all_grids->levels[level]->interpolation.blocks[1] ); + if(all_grids->levels[level]->interpolation.blocks[2] 
)free(all_grids->levels[level]->interpolation.blocks[2] ); + #ifdef USE_MPI + if(all_grids->levels[level]->interpolation.requests )free(all_grids->levels[level]->interpolation.requests ); + if(all_grids->levels[level]->interpolation.status )free(all_grids->levels[level]->interpolation.status ); + #endif + + } + if(all_grids->my_rank==0){fprintf(stdout,"done\n");} + + // now destroy the level itself (but don't destroy level 0 as it was not created by MGBuild) + for(level=all_grids->num_levels-1;level>0;level--){ + destroy_level(all_grids->levels[level]); + } + if(all_grids->levels)free(all_grids->levels); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// perform a richardson error analysis to infer the order of the operator/solver +void richardson_error(mg_type *all_grids, int levelh, int u_id){ + // in FV... + // +-------+ +---+---+ +-------+ +-------+ + // | | | a | b | | | |a+b+c+d| + // | u^2h | - +---+---+ = | u^2h | - | --- | + // | | | c | d | | | | 4 | + // +-------+ +---+---+ +-------+ +-------+ + // + restriction(all_grids->levels[levelh+1],VECTOR_TEMP,all_grids->levels[levelh ],u_id,RESTRICT_CELL); // temp^2h = R u^h + restriction(all_grids->levels[levelh+2],VECTOR_TEMP,all_grids->levels[levelh+1],u_id,RESTRICT_CELL); // temp^4h = R u^2h + add_vectors(all_grids->levels[levelh+1],VECTOR_TEMP,1.0,u_id,-1.0,VECTOR_TEMP); // temp^2h = u^2h - temp^2h = u^2h - R u^h + add_vectors(all_grids->levels[levelh+2],VECTOR_TEMP,1.0,u_id,-1.0,VECTOR_TEMP); // temp^2h = u^4h - temp^4h = u^4h - R u^2h + double norm_of_u2h_minus_uh = norm(all_grids->levels[levelh+1],VECTOR_TEMP); // || u^2h - R u^h ||max + double norm_of_u4h_minus_u2h = norm(all_grids->levels[levelh+2],VECTOR_TEMP); // || u^4h - R u^2h ||max + // estimate the error^h using ||u^2h - R u^h|| + if(all_grids->my_rank==0){fprintf(stdout," h=%0.15e 
||error||=%0.15e\n",all_grids->levels[levelh]->h,norm_of_u2h_minus_uh);fflush(stdout);} + // log( ||u^4h - R u^2h|| / ||u^2h - R u^h|| ) / log(2) is an estimate of the order of the method (e.g. 4th order) + if(all_grids->my_rank==0){fprintf(stdout," order=%0.3f\n",log(norm_of_u4h_minus_u2h / norm_of_u2h_minus_uh) / log(2) );fflush(stdout);} +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void MGVCycle(mg_type *all_grids, int e_id, int R_id, double a, double b, int level){ + if(!all_grids->levels[level]->active)return; + double _LevelStart; + + // bottom solve... + if(level==all_grids->num_levels-1){ + double _timeBottomStart = getTime(); + IterativeSolver(all_grids->levels[level],e_id,R_id,a,b,MG_DEFAULT_BOTTOM_NORM); + all_grids->levels[level]->timers.Total += (double)(getTime()-_timeBottomStart); + return; + } + + // down... + _LevelStart = getTime(); + smooth(all_grids->levels[level ],e_id,R_id,a,b); + residual(all_grids->levels[level ],VECTOR_TEMP,e_id,R_id,a,b); + restriction(all_grids->levels[level+1],R_id,all_grids->levels[level],VECTOR_TEMP,RESTRICT_CELL); + zero_vector(all_grids->levels[level+1],e_id); + all_grids->levels[level]->timers.Total += (double)(getTime()-_LevelStart); + + // recursion... + MGVCycle(all_grids,e_id,R_id,a,b,level+1); + + // up... 
+ _LevelStart = getTime(); + interpolation_vcycle(all_grids->levels[level ],e_id,1.0,all_grids->levels[level+1],e_id); + smooth(all_grids->levels[level ],e_id,R_id,a,b); + + all_grids->levels[level]->timers.Total += (double)(getTime()-_LevelStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void MGSolve(mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double dtol, double rtol){ + // solves Au=f on level 'onLevel' + all_grids->MGSolves_performed++; + if(!all_grids->levels[onLevel]->active)return; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + int e_id = u_id; // __u FIX + int R_id = VECTOR_F_MINUS_AV; + int v; + int maxVCycles = 20; + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef _OPENMP + double MG_Start_Time = omp_get_wtime(); + #elif USE_MPI + double MG_Start_Time = MPI_Wtime(); + #endif + if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"MGSolve... ");} + double _timeStartMGSolve = getTime(); + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // calculate norm of f for convergence criteria... + double norm_of_F = 1.0; + double norm_of_DinvF = 1.0; + if(dtol>0){ + mul_vectors(all_grids->levels[onLevel],VECTOR_TEMP,1.0,F_id,VECTOR_DINV); // D^{-1}F + norm_of_DinvF = norm(all_grids->levels[onLevel],VECTOR_TEMP); // ||D^{-1}F|| + } + if(rtol>0)norm_of_F = norm(all_grids->levels[onLevel],F_id); // ||F|| + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // make initial guess for e (=0) and setup the RHS + zero_vector(all_grids->levels[onLevel],e_id); // ee = 0 + scale_vector(all_grids->levels[onLevel],R_id,1.0,F_id); // R_id = F_id + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // now do v-cycles to calculate the correction... 
+ for(v=0;vlevels[level]->vcycles_from_this_level++; + + // do the v-cycle... + MGVCycle(all_grids,e_id,R_id,a,b,level); + + // now calculate the norm of the residual... + double _timeStart = getTime(); + if(all_grids->levels[level]->must_subtract_mean == 1){ + double average_value_of_e = mean(all_grids->levels[level],e_id); + shift_vector(all_grids->levels[level],e_id,e_id,-average_value_of_e); + } + residual(all_grids->levels[level],VECTOR_TEMP,e_id,F_id,a,b); + if(dtol>0)mul_vectors(all_grids->levels[level],VECTOR_TEMP,1.0,VECTOR_TEMP,VECTOR_DINV); // Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria... + double norm_of_residual = norm(all_grids->levels[level],VECTOR_TEMP); + double _timeNorm = getTime(); + all_grids->levels[level]->timers.Total += (double)(_timeNorm-_timeStart); + if(all_grids->levels[level]->my_rank==0){ + double rel = 0.0; + if(rtol>0)rel = norm_of_residual/norm_of_F; + else rel = norm_of_residual/norm_of_DinvF; + if( v>0){fprintf(stdout,"\n v-cycle=%2d norm=%1.15e rel=%1.15e ",v+1,norm_of_residual,rel);} + else{fprintf(stdout, "v-cycle=%2d norm=%1.15e rel=%1.15e ",v+1,norm_of_residual,rel);} + } + if(norm_of_residual/norm_of_F < rtol)break; + if(norm_of_residual < dtol)break; + } // maxVCycles + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + all_grids->timers.MGSolve += (double)(getTime()-_timeStartMGSolve); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef _OPENMP + if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done (%f seconds)\n",omp_get_wtime()-MG_Start_Time);} // used to monitor variability in individual solve times + #elif USE_MPI + if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done (%f seconds)\n",MPI_Wtime()-MG_Start_Time);} // used to monitor variability in individual solve times + #else + if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done\n");} + #endif +} + + 
+//------------------------------------------------------------------------------------------------------------------------------ +void FMGSolve(mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double dtol, double rtol){ + + #ifdef UNLIMIT_FMG_FCYCLES + + all_grids->MGSolves_performed++; + if(!all_grids->levels[onLevel]->active)return; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + int maxFCycles=20; + int f; + int level; + int e_id = VECTOR_E; + int R_id = VECTOR_F_MINUS_AV; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef _OPENMP + double FMG_Start_Time = omp_get_wtime(); + #elif USE_MPI + double FMG_Start_Time = MPI_Wtime(); + #endif + if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"FMGSolve... ");} + double _timeStartMGSolve = getTime(); + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // calculate norm of f... + double _LevelStart = getTime(); + double norm_of_F = norm(all_grids->levels[onLevel],F_id); // ||F|| + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // initialize the RHS for the f-cycle to f... + scale_vector(all_grids->levels[onLevel],R_id,1.0,F_id); // R_id = F-Au = F-0 = F_id + all_grids->levels[onLevel]->timers.Total += (double)(getTime()-_LevelStart); + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // iterate on f-cycles... + for(f=0;fnum_levels-1);level++){ + double _LevelStart = getTime(); + restriction(all_grids->levels[level+1],R_id,all_grids->levels[level],R_id,RESTRICT_CELL); + all_grids->levels[level]->timers.Total += (double)(getTime()-_LevelStart); + } + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // solve coarsest grid... 
+ double _timeBottomStart = getTime(); + level = all_grids->num_levels-1; + if(level>onLevel)zero_vector(all_grids->levels[level],e_id);//else use whatever was the initial guess + IterativeSolver(all_grids->levels[level],e_id,R_id,a,b,MG_DEFAULT_BOTTOM_NORM); // -1 == exact solution + all_grids->levels[level]->timers.Total += (double)(getTime()-_timeBottomStart); + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // now do the F-cycle proper... + for(level=all_grids->num_levels-2;level>=onLevel;level--){ + // high-order interpolation + _LevelStart = getTime(); + interpolation_fcycle(all_grids->levels[level],e_id,0.0,all_grids->levels[level+1],e_id); + all_grids->levels[level]->timers.Total += (double)(getTime()-_LevelStart); + + // v-cycle + all_grids->levels[level]->vcycles_from_this_level++; + MGVCycle(all_grids,e_id,R_id,a,b,level); + } + + // correct current solution and calculate residual (new RHS)... + _LevelStart = getTime(); + add_vectors(all_grids->levels[onLevel],u_id,1.0,u_id,1.0,e_id ); + if(all_grids->levels[onLevel]->must_subtract_mean == 1){ + double average_value_of_u = mean(all_grids->levels[onLevel],u_id); + shift_vector(all_grids->levels[onLevel],u_id,u_id,-average_value_of_u); + } + residual(all_grids->levels[onLevel],R_id,u_id,F_id,a,b); + double norm_of_residual = norm(all_grids->levels[onLevel],R_id); + all_grids->levels[onLevel]->timers.Total += (double)(getTime()-_LevelStart); + + // test convergence... 
+ if(all_grids->levels[onLevel]->my_rank==0){ + double rel = 0.0; + rel = norm_of_residual/norm_of_F; + if(f>0){fprintf(stdout,"\n f-cycle=%2d norm=%1.15e rel=%1.15e ",f,norm_of_residual,rel);} + else{fprintf(stdout, "f-cycle=%2d norm=%1.15e rel=%1.15e ",f,norm_of_residual,rel);} + } + if(norm_of_residual/norm_of_F < rtol)break; + + } // F-cycle + + #else + + all_grids->MGSolves_performed++; + if(!all_grids->levels[onLevel]->active)return; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef UNLIMIT_FMG_VCYCLES + int maxVCycles=20; + #else + int maxVCycles=0; + #endif + int v; + int level; + int e_id = u_id; + int R_id = VECTOR_F_MINUS_AV; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef _OPENMP + double FMG_Start_Time = omp_get_wtime(); + #elif USE_MPI + double FMG_Start_Time = MPI_Wtime(); + #endif + if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"FMGSolve... ");} + double _timeStartMGSolve = getTime(); + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // calculate norm of f... + double _LevelStart = getTime(); + double norm_of_F = 1.0; + double norm_of_DinvF = 1.0; + if(dtol>0){ + mul_vectors(all_grids->levels[onLevel],VECTOR_TEMP,1.0,F_id,VECTOR_DINV); // D^{-1}F + norm_of_DinvF = norm(all_grids->levels[onLevel],VECTOR_TEMP); // ||D^{-1}F|| + } + if(rtol>0)norm_of_F = norm(all_grids->levels[onLevel],F_id); // ||F|| + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // initialize the RHS for the f-cycle to f... 
+ scale_vector(all_grids->levels[onLevel],R_id,1.0,F_id); // R_id = F_id + all_grids->levels[onLevel]->timers.Total += (double)(getTime()-_LevelStart); + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // restrict RHS to bottom (coarsest grids) + for(level=onLevel;level<(all_grids->num_levels-1);level++){ + double _LevelStart = getTime(); + restriction(all_grids->levels[level+1],R_id,all_grids->levels[level],R_id,RESTRICT_CELL); + all_grids->levels[level]->timers.Total += (double)(getTime()-_LevelStart); + } + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // solve coarsest grid... + double _timeBottomStart = getTime(); + level = all_grids->num_levels-1; + if(level>onLevel)zero_vector(all_grids->levels[level],e_id);//else use whatever was the initial guess + IterativeSolver(all_grids->levels[level],e_id,R_id,a,b,MG_DEFAULT_BOTTOM_NORM); // -1 == exact solution + all_grids->levels[level]->timers.Total += (double)(getTime()-_timeBottomStart); + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // now do the F-cycle proper... + for(level=all_grids->num_levels-2;level>=onLevel;level--){ + // high-order interpolation + _LevelStart = getTime(); + interpolation_fcycle(all_grids->levels[level],e_id,0.0,all_grids->levels[level+1],e_id); + all_grids->levels[level]->timers.Total += (double)(getTime()-_LevelStart); + + // v-cycle + all_grids->levels[level]->vcycles_from_this_level++; + MGVCycle(all_grids,e_id,R_id,a,b,level); + } + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // now do the post-F V-cycles + for(v=-1;v=0){ + all_grids->levels[level]->vcycles_from_this_level++; + MGVCycle(all_grids,e_id,R_id,a,b,level); + } + + // now calculate the norm of the residual... 
+ double _timeStart = getTime(); + if(all_grids->levels[level]->must_subtract_mean == 1){ + double average_value_of_e = mean(all_grids->levels[level],e_id); + shift_vector(all_grids->levels[level],e_id,e_id,-average_value_of_e); + } + residual(all_grids->levels[level],VECTOR_TEMP,e_id,F_id,a,b); + if(dtol>0)mul_vectors(all_grids->levels[level],VECTOR_TEMP,1.0,VECTOR_TEMP,VECTOR_DINV); // Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria... + double norm_of_residual = norm(all_grids->levels[level],VECTOR_TEMP); + double _timeNorm = getTime(); + all_grids->levels[level]->timers.Total += (double)(_timeNorm-_timeStart); + if(all_grids->levels[level]->my_rank==0){ + double rel = 0.0; + if(rtol>0)rel = norm_of_residual/norm_of_F; + else rel = norm_of_residual/norm_of_DinvF; + if( v>=0){fprintf(stdout,"\n v-cycle=%2d norm=%1.15e rel=%1.15e ",v+1,norm_of_residual,rel);} + else{fprintf(stdout, "f-cycle norm=%1.15e rel=%1.15e ",norm_of_residual,rel);} + } + if(norm_of_residual/norm_of_F < rtol)break; + if(norm_of_residual < dtol)break; + } + + #endif /* UNLIMIT_FMG_FCYCLES */ + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + all_grids->timers.MGSolve += (double)(getTime()-_timeStartMGSolve); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef _OPENMP + if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done (%f seconds)\n",omp_get_wtime()-FMG_Start_Time);} // used to monitor variability in individual solve times + #elif USE_MPI + if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done (%f seconds)\n",MPI_Wtime()-FMG_Start_Time);} // used to monitor variability in individual solve times + #else + if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done\n");} + #endif +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void MGPCG(mg_type *all_grids, int onLevel, int x_id, int F_id, double a, 
double b, double dtol, double rtol){ + // Algorithm 9.1 in Iterative Methods for Sparse Linear Systems(Yousef Saad) using a MG V-Cycle as M^{-1} + level_type * level = all_grids->levels[onLevel]; + if(!level->active)return; + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // CG with a MG preconditioner, every level needs 3 extra vectors (p, Ap, z) + int l; + for(l=0;lnum_levels;l++){ + create_vectors(all_grids->levels[l],VECTORS_RESERVED+3); + } + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Test for Poisson with Periodic BCs + for(l=0;lnum_levels;l++){ + if(all_grids->levels[l]->must_subtract_mean==-1){ + all_grids->levels[l]->must_subtract_mean=0; + int alpha_is_zero = (dot(all_grids->levels[l],VECTOR_ALPHA,VECTOR_ALPHA) == 0.0); + if( (all_grids->levels[l]->boundary_condition.type==BC_PERIODIC) && ((a==0) || (alpha_is_zero)) )all_grids->levels[l]->must_subtract_mean = 1; + } + } + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + int r_id = VECTOR_F_MINUS_AV; + int p_id = VECTORS_RESERVED+0; + int Ap_id = VECTORS_RESERVED+1; + int z_id = VECTORS_RESERVED+2; + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef _OPENMP + double MGPCG_Start_Time = omp_get_wtime(); + #elif USE_MPI + double MGPCG_Start_Time = MPI_Wtime(); + #endif + if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"MGPCG... 
");} + double _timeStartMGSolve = getTime(); + all_grids->MGSolves_performed++; + int jMax=20; + int j=0; + int CGFailed = 0; + int CGConverged = 0; + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + zero_vector(level,x_id); // x[] = 0 + residual(level,r_id,x_id,F_id,a,b); // r[] = F_id[] - A(x_id) + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(level->must_subtract_mean == 1){ + double mean_of_r = mean(level,r_id); + shift_vector(level,r_id,r_id,-mean_of_r); + } + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + double norm_of_r0 = norm(level,r_id); // the norm of the initial residual... + if(norm_of_r0 == 0.0){CGConverged=1;} // entered CG with exact solution + level->vcycles_from_this_level++; // + zero_vector(level,z_id); // z[] = 0 + MGVCycle(all_grids,z_id,r_id,a,b,onLevel); // z[] = M^{-1}r[] + scale_vector(level,p_id,1.0,z_id); // p[] = z[] + double r_dot_z = dot(level,r_id,z_id); // r_dot_z = dot(r,z) + while( (jKrylov_iterations++; // + apply_op(level,Ap_id,p_id,a,b); // Ap[] = A(p) + double Ap_dot_p = dot(level,Ap_id,p_id); // Ap_dot_p = dot(Ap,p) + if(Ap_dot_p == 0.0){CGFailed=1;break;} // pivot breakdown ??? + double alpha = r_dot_z / Ap_dot_p; // alpha = r_dot_z / Ap_dot_p + if(isinf(alpha)){CGFailed=1;break;} // ??? + add_vectors(level,x_id,1.0,x_id, alpha,p_id ); // x_id[] = x_id[] + alpha*p[] + add_vectors(level,r_id,1.0,r_id,-alpha,Ap_id); // r[] = r[] - alpha*Ap[] (intermediate residual?) 
+ //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(level->must_subtract_mean == 1){ + double mean_of_r = mean(level,r_id); + shift_vector(level,r_id,r_id,-mean_of_r); + } + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + //double norm_of_r = norm(level,r_id); // norm of intermediate residual (delusional convergence) + residual(level,VECTOR_TEMP,x_id,F_id,a,b); // true residual + double norm_of_r = norm(level,VECTOR_TEMP); // norm of true residual (true convergence test) + if(norm_of_r == 0.0){CGConverged=1;break;} // + if(level->my_rank==0){ + if( j>1){fprintf(stdout,"\n ");} + if(rtol>0){fprintf(stdout,"iter=%3d norm=%1.15e rel=%1.15e ",j,norm_of_r,norm_of_r/norm_of_r0 );} + } + if(norm_of_r/norm_of_r0 < rtol)break; // norm if true residual is small enough + level->vcycles_from_this_level++; // + zero_vector(level,z_id); // z[] = 0 + MGVCycle(all_grids,z_id,r_id,a,b,onLevel); // z[] = M^{-1}r[] + double r_dot_z_new = dot(level,r_id,z_id); // r_dot_z_new = dot(r_{j+1},z_{j+1}) + if(r_dot_z_new == 0.0){CGFailed=1;break;} // Lanczos breakdown ??? + double beta = (r_dot_z_new/r_dot_z); // beta = (r_dot_z_new/r_dot_z) + if(isinf(beta)){CGFailed=1;break;} // ??? + add_vectors(level,p_id,1.0,z_id,beta,p_id ); // p[] = z[] + beta*p[] + r_dot_z = r_dot_z_new; // r_dot_r = r_dot_r_new (save old r_dot_r) + // FIX... need to test for stalled convergence... 
+ } // } + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + all_grids->timers.MGSolve += (double)(getTime()-_timeStartMGSolve); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef _OPENMP + if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done (%f seconds)\n",omp_get_wtime()-MGPCG_Start_Time);} // used to monitor variability in individual solve times + #elif USE_MPI + if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done (%f seconds)\n",MPI_Wtime()-MGPCG_Start_Time);} // used to monitor variability in individual solve times + #else + if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done\n");} + #endif +} +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators.27pt.c b/Util/hpgmg/finite-volume/source/operators.27pt.c new file mode 100644 index 00000000..2d71e465 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.27pt.c @@ -0,0 +1,163 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef _OPENMP +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "timers.h" +#include "defines.h" +#include "level.h" +#include "operators.h" +//------------------------------------------------------------------------------------------------------------------------------ +#define 
MyPragma(a) _Pragma(#a) +//------------------------------------------------------------------------------------------------------------------------------ +#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions... + // XL C/C++ 12.01.0000.0009 sets _OPENMP to 201107, but does not support the max clause within a _Pragma(). + // This issue was fixed by XL C/C++ 12.01.0000.0011 + // If you do not have this version of XL C/C++ and run into this bug, uncomment these macros... + //#warning not threading norm() calculations due to issue with XL/C, _Pragma, and reduction(max:bmax) + //#define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + //#define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + //#define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) + #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(max:bmax) ) +#elif _OPENMP // older OpenMP versions don't support the max reduction clause + #warning Threading max reductions requires OpenMP 3.1 (July 2011). Please upgrade your compiler. + #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) +#else // flat MPI should not define any threading... 
+ #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) +#endif +//------------------------------------------------------------------------------------------------------------------------------ +void apply_BCs(level_type * level, int x_id, int shape){apply_BCs_p2(level,x_id,shape);} // 27pt uses cell centered, not cell averaged +//void apply_BCs(level_type * level, int x_id, int shape){apply_BCs_v2(level,x_id,shape);} +//------------------------------------------------------------------------------------------------------------------------------ +#define STENCIL_COEF0 (-4.2666666666666666666) // -128.0/30.0; +#define STENCIL_COEF1 ( 0.4666666666666666666) // 14.0/30.0; +#define STENCIL_COEF2 ( 0.1000000000000000000) // 3.0/30.0; +#define STENCIL_COEF3 ( 0.0333333333333333333) // 1.0/30.0; +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef STENCIL_VARIABLE_COEFFICIENT + #error This implementation does not support variable-coefficient operators +#endif +#ifdef STENCIL_FUSE_BC + #error This implementation does not support fusion of the boundary conditions with the operator +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#define Dinv_ijk() Dinv[ijk] // simply retrieve it rather than recalculating it +//------------------------------------------------------------------------------------------------------------------------------ +#define apply_op_ijk(x) \ +( \ + a*x[ijk] - b*h2inv*( \ + STENCIL_COEF3*(x[ijk-kStride-jStride-1] + \ + x[ijk-kStride-jStride+1] + \ + x[ijk-kStride+jStride-1] + \ + x[ijk-kStride+jStride+1] + \ + x[ijk+kStride-jStride-1] + \ + x[ijk+kStride-jStride+1] + \ + x[ijk+kStride+jStride-1] + \ + x[ijk+kStride+jStride+1] ) + \ + STENCIL_COEF2*(x[ijk-kStride-jStride ] 
+ \ + x[ijk-kStride -1] + \ + x[ijk-kStride +1] + \ + x[ijk-kStride+jStride ] + \ + x[ijk -jStride-1] + \ + x[ijk -jStride+1] + \ + x[ijk +jStride-1] + \ + x[ijk +jStride+1] + \ + x[ijk+kStride-jStride ] + \ + x[ijk+kStride -1] + \ + x[ijk+kStride +1] + \ + x[ijk+kStride+jStride ] ) + \ + STENCIL_COEF1*(x[ijk-kStride ] + \ + x[ijk -jStride ] + \ + x[ijk -1] + \ + x[ijk +1] + \ + x[ijk +jStride ] + \ + x[ijk+kStride ] ) + \ + STENCIL_COEF0*(x[ijk ] ) \ + ) \ +) +//------------------------------------------------------------------------------------------------------------------------------ +int stencil_get_radius(){return(1);} // 27pt = dense 3^3 +int stencil_get_shape(){return(STENCIL_SHAPE_BOX);} // needs faces, edges, and corners +//------------------------------------------------------------------------------------------------------------------------------ +void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ + // form restriction of alpha[], beta_*[] coefficients from fromLevel + if(fromLevel != NULL){ + restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL ); + restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I); + restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J); + restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K); + } // else case assumes alpha/beta have been set + + // exchange alpha/beta/... (must be done before calculating Dinv) + exchange_boundary(level,VECTOR_ALPHA ,STENCIL_SHAPE_BOX); // safe + exchange_boundary(level,VECTOR_BETA_I,STENCIL_SHAPE_BOX); + exchange_boundary(level,VECTOR_BETA_J,STENCIL_SHAPE_BOX); + exchange_boundary(level,VECTOR_BETA_K,STENCIL_SHAPE_BOX); + + // black box rebuild of D^{-1}, l1^{-1}, dominant eigenvalue, ... + rebuild_operator_blackbox(level,a,b,2); + + // exchange Dinv/L1inv/... 
+ exchange_boundary(level,VECTOR_DINV ,STENCIL_SHAPE_BOX); // safe + exchange_boundary(level,VECTOR_L1INV,STENCIL_SHAPE_BOX); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef USE_GSRB +#warning GSRB is not recommended for the 27pt operator +#define GSRB_OOP +#define NUM_SMOOTHS 2 // RBRB +#include "operators/gsrb.c" +#elif USE_CHEBY +#define NUM_SMOOTHS 1 +#define CHEBYSHEV_DEGREE 4 // i.e. one degree-4 polynomial smoother +#include "operators/chebyshev.c" +#elif USE_JACOBI +#define NUM_SMOOTHS 6 +#include "operators/jacobi.c" +#elif USE_L1JACOBI +#define NUM_SMOOTHS 6 +#include "operators/jacobi.c" +#elif USE_SYMGS +#define NUM_SMOOTHS 2 // FBFB +#include "operators/symgs.c" +#else +#error You must compile with either -DUSE_GSRB, -DUSE_CHEBY, -DUSE_JACOBI, -DUSE_L1JACOBI, or -DUSE_SYMGS +#endif +#include "operators/residual.c" +#include "operators/apply_op.c" +#include "operators/rebuild.c" +//------------------------------------------------------------------------------------------------------------------------------ +#include "operators/blockCopy.c" +#include "operators/misc.c" +#include "operators/exchange_boundary.c" +#include "operators/boundary_fd.c" // 27pt uses cell centered, not cell averaged +//#include "operators/boundary_fv.c" +#include "operators/restriction.c" +#include "operators/interpolation_p2.c" +//#include "operators/interpolation_v2.c" +//------------------------------------------------------------------------------------------------------------------------------ +void interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_p2(level_f,id_f,prescale_f,level_c,id_c);} // 27pt uses cell centered, not cell averaged +void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_p2(level_f,id_f,prescale_f,level_c,id_c);} +//void 
interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_v2(level_f,id_f,prescale_f,level_c,id_c);} +//void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_v2(level_f,id_f,prescale_f,level_c,id_c);} +//------------------------------------------------------------------------------------------------------------------------------ +#include "operators/problem.p6.c" +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators.7pt.c b/Util/hpgmg/finite-volume/source/operators.7pt.c new file mode 100644 index 00000000..4802c72e --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.7pt.c @@ -0,0 +1,275 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef _OPENMP +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "timers.h" +#include "defines.h" +#include "level.h" +#include "operators.h" +//------------------------------------------------------------------------------------------------------------------------------ +#define MyPragma(a) _Pragma(#a) +//------------------------------------------------------------------------------------------------------------------------------ +#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions... 
+ // XL C/C++ 12.01.0000.0009 sets _OPENMP to 201107, but does not support the max clause within a _Pragma(). + // This issue was fixed by XL C/C++ 12.01.0000.0011 + // If you do not have this version of XL C/C++ and run into this bug, uncomment these macros... + //#warning not threading norm() calculations due to issue with XL/C, _Pragma, and reduction(max:bmax) + //#define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + //#define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + //#define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) + #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(max:bmax) ) +#elif _OPENMP // older OpenMP versions don't support the max reduction clause + #warning Threading max reductions requires OpenMP 3.1 (July 2011). Please upgrade your compiler. + #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) +#else // flat MPI should not define any threading... 
+ #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) +#endif +//------------------------------------------------------------------------------------------------------------------------------ +void apply_BCs(level_type * level, int x_id, int shape){apply_BCs_p1(level,x_id,shape);} +//------------------------------------------------------------------------------------------------------------------------------ +#define Dinv_ijk() Dinv[ijk] // simply retrieve it rather than recalculating it +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef STENCIL_VARIABLE_COEFFICIENT +#ifdef USE_HELMHOLTZ // variable coefficient Helmholtz... + #define apply_op_ijk(x) \ + ( \ + a*alpha[ijk]*x[ijk] \ + -b*h2inv*( \ + + beta_i[ijk+1 ]*( x[ijk+1 ] - x[ijk] ) \ + + beta_i[ijk ]*( x[ijk-1 ] - x[ijk] ) \ + + beta_j[ijk+jStride]*( x[ijk+jStride] - x[ijk] ) \ + + beta_j[ijk ]*( x[ijk-jStride] - x[ijk] ) \ + + beta_k[ijk+kStride]*( x[ijk+kStride] - x[ijk] ) \ + + beta_k[ijk ]*( x[ijk-kStride] - x[ijk] ) \ + ) \ + ) +#else // variable coefficient Poisson... + #define apply_op_ijk(x) \ + ( \ + -b*h2inv*( \ + + beta_i[ijk+1 ]*( x[ijk+1 ] - x[ijk] ) \ + + beta_i[ijk ]*( x[ijk-1 ] - x[ijk] ) \ + + beta_j[ijk+jStride]*( x[ijk+jStride] - x[ijk] ) \ + + beta_j[ijk ]*( x[ijk-jStride] - x[ijk] ) \ + + beta_k[ijk+kStride]*( x[ijk+kStride] - x[ijk] ) \ + + beta_k[ijk ]*( x[ijk-kStride] - x[ijk] ) \ + ) \ + ) +#endif +#else // constant coefficient case... 
+ #define apply_op_ijk(x) \ + ( \ + a*x[ijk] - b*h2inv*( \ + + x[ijk+1 ] \ + + x[ijk-1 ] \ + + x[ijk+jStride] \ + + x[ijk-jStride] \ + + x[ijk+kStride] \ + + x[ijk-kStride] \ + - x[ijk ]*6.0 \ + ) \ + ) +#endif // variable/constant coefficient + +//------------------------------------------------------------------------------------------------------------------------------ +int stencil_get_radius(){return(1);} // 7pt reaches out 1 point +int stencil_get_shape(){return(STENCIL_SHAPE_STAR);} // needs just faces +//------------------------------------------------------------------------------------------------------------------------------ +void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ + if(level->my_rank==0){fprintf(stdout," rebuilding operator for level... h=%e ",level->h);fflush(stdout);} + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // form restriction of alpha[], beta_*[] coefficients from fromLevel + if(fromLevel != NULL){ + restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL ); + restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I); + restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J); + restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K); + } // else case assumes alpha/beta have been set + + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // exchange alpha/beta/... 
(must be done before calculating Dinv) + exchange_boundary(level,VECTOR_ALPHA ,STENCIL_SHAPE_BOX); // safe + exchange_boundary(level,VECTOR_BETA_I,STENCIL_SHAPE_BOX); + exchange_boundary(level,VECTOR_BETA_J,STENCIL_SHAPE_BOX); + exchange_boundary(level,VECTOR_BETA_K,STENCIL_SHAPE_BOX); + + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // calculate Dinv, L1inv, and estimate the dominant Eigenvalue + double _timeStart = getTime(); + int block; + + double dominant_eigenvalue = -1e9; + + PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,block,level->num_my_blocks,dominant_eigenvalue) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + double h2inv = 1.0/(level->h*level->h); + double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); + double * __restrict__ L1inv = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); + double block_eigenvalue = -1e9; + + for(k=klo;kboundary_condition.type != 
BC_PERIODIC){ + if(level->my_boxes[box].low.i+i-1 < 0)ilo_is_valid = 0.0; + if(level->my_boxes[box].low.j+j-1 < 0)jlo_is_valid = 0.0; + if(level->my_boxes[box].low.k+k-1 < 0)klo_is_valid = 0.0; + if(level->my_boxes[box].low.i+i+1 >= level->dim.i)ihi_is_valid = 0.0; + if(level->my_boxes[box].low.j+j+1 >= level->dim.j)jhi_is_valid = 0.0; + if(level->my_boxes[box].low.k+k+1 >= level->dim.k)khi_is_valid = 0.0; + } + + #ifdef STENCIL_VARIABLE_COEFFICIENT + // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements... + double sumAbsAij = fabs(b*h2inv) * ( + fabs( beta_i[ijk ]*ilo_is_valid )+ + fabs( beta_j[ijk ]*jlo_is_valid )+ + fabs( beta_k[ijk ]*klo_is_valid )+ + fabs( beta_i[ijk+1 ]*ihi_is_valid )+ + fabs( beta_j[ijk+jStride]*jhi_is_valid )+ + fabs( beta_k[ijk+kStride]*khi_is_valid ) + ); + + // center of Gershgorin disc is the diagonal element... + double Aii = a*alpha[ijk] - b*h2inv*( + beta_i[ijk ]*( ilo_is_valid-2.0 )+ + beta_j[ijk ]*( jlo_is_valid-2.0 )+ + beta_k[ijk ]*( klo_is_valid-2.0 )+ + beta_i[ijk+1 ]*( ihi_is_valid-2.0 )+ + beta_j[ijk+jStride]*( jhi_is_valid-2.0 )+ + beta_k[ijk+kStride]*( khi_is_valid-2.0 ) + ); + #else // Constant coefficient versions with fused BC's... + // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements... + double sumAbsAij = fabs(b*h2inv) * ( + ilo_is_valid + + jlo_is_valid + + klo_is_valid + + ihi_is_valid + + jhi_is_valid + + khi_is_valid + ); + + // center of Gershgorin disc is the diagonal element... + double Aii = a - b*h2inv*( + ilo_is_valid + + jlo_is_valid + + klo_is_valid + + ihi_is_valid + + jhi_is_valid + + khi_is_valid - 12.0 + ); + #endif + + // calculate Dinv = D^{-1}, L1inv = ( D+D^{L1} )^{-1}, and the dominant eigenvalue... + Dinv[ijk] = 1.0/Aii; // inverse of the diagonal Aii + //L1inv[ijk] = 1.0/(Aii+sumAbsAij); // inverse of the L1 row norm... 
L1inv = ( D+D^{L1} )^{-1} + if(Aii>=1.5*sumAbsAij)L1inv[ijk] = 1.0/(Aii ); // as suggested by eq 6.5 in Baker et al, "Multigrid smoothers for ultra-parallel computing: additional theory and discussion"... + else L1inv[ijk] = 1.0/(Aii+0.5*sumAbsAij); // + double Di = (Aii + sumAbsAij)/Aii;if(Di>block_eigenvalue)block_eigenvalue=Di; // upper limit to Gershgorin disc == bound on dominant eigenvalue + }}} + if(block_eigenvalue>dominant_eigenvalue){dominant_eigenvalue = block_eigenvalue;} + } + level->timers.blas1 += (double)(getTime()-_timeStart); + + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Reduce the local estimates dominant eigenvalue to a global estimate + #ifdef USE_MPI + double _timeStartAllReduce = getTime(); + double send = dominant_eigenvalue; + MPI_Allreduce(&send,&dominant_eigenvalue,1,MPI_DOUBLE,MPI_MAX,level->MPI_COMM_ALLREDUCE); + double _timeEndAllReduce = getTime(); + level->timers.collectives += (double)(_timeEndAllReduce-_timeStartAllReduce); + #endif + if(level->my_rank==0){fprintf(stdout,"eigenvalue_max<%e\n",dominant_eigenvalue);} + level->dominant_eigenvalue_of_DinvA = dominant_eigenvalue; + + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // exchange Dinv/L1inv/... + exchange_boundary(level,VECTOR_DINV ,STENCIL_SHAPE_BOX); // safe + exchange_boundary(level,VECTOR_L1INV,STENCIL_SHAPE_BOX); + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +} + + +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef USE_GSRB +#define NUM_SMOOTHS 2 // RBRB +#include "operators/gsrb.c" +#elif USE_CHEBY +#define NUM_SMOOTHS 1 +#define CHEBYSHEV_DEGREE 4 // i.e. 
one degree-4 polynomial smoother +#include "operators/chebyshev.c" +#elif USE_JACOBI +#define NUM_SMOOTHS 6 +#include "operators/jacobi.c" +#elif USE_L1JACOBI +#define NUM_SMOOTHS 6 +#include "operators/jacobi.c" +#elif USE_SYMGS +#define NUM_SMOOTHS 2 +#include "operators/symgs.c" +#else +#error You must compile with either -DUSE_GSRB, -DUSE_CHEBY, -DUSE_JACOBI, -DUSE_L1JACOBI, or -DUSE_SYMGS +#endif +#include "operators/residual.c" +#include "operators/apply_op.c" +//------------------------------------------------------------------------------------------------------------------------------ +#include "operators/blockCopy.c" +#include "operators/misc.c" +#include "operators/exchange_boundary.c" +#include "operators/boundary_fd.c" +#include "operators/restriction.c" +#include "operators/interpolation_p1.c" +//------------------------------------------------------------------------------------------------------------------------------ +void interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_p1(level_f,id_f,prescale_f,level_c,id_c);} +void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_p1(level_f,id_f,prescale_f,level_c,id_c);} +//------------------------------------------------------------------------------------------------------------------------------ +#include "operators/problem.p6.c" +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators.fv2.c b/Util/hpgmg/finite-volume/source/operators.fv2.c new file mode 100644 index 00000000..41a875c5 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.fv2.c @@ -0,0 +1,162 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// 
Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef _OPENMP +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "timers.h" +#include "defines.h" +#include "level.h" +#include "operators.h" +//------------------------------------------------------------------------------------------------------------------------------ +#define MyPragma(a) _Pragma(#a) +//------------------------------------------------------------------------------------------------------------------------------ +#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions... + // XL C/C++ 12.01.0000.0009 sets _OPENMP to 201107, but does not support the max clause within a _Pragma(). + // This issue was fixed by XL C/C++ 12.01.0000.0011 + // If you do not have this version of XL C/C++ and run into this bug, uncomment these macros... 
+ //#warning not threading norm() calculations due to issue with XL/C, _Pragma, and reduction(max:bmax) + //#define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + //#define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + //#define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) + #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(max:bmax) ) +#elif _OPENMP // older OpenMP versions don't support the max reduction clause + #warning Threading max reductions requires OpenMP 3.1 (July 2011). Please upgrade your compiler. + #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) +#else // flat MPI should not define any threading... 
+ #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef STENCIL_FUSE_BC + #error This implementation does not support fusion of the boundary conditions with the operator +#endif +//------------------------------------------------------------------------------------------------------------------------------ +void apply_BCs(level_type * level, int x_id, int shape){apply_BCs_v2(level,x_id,shape);} +//------------------------------------------------------------------------------------------------------------------------------ +#define Dinv_ijk() Dinv[ijk] // simply retrieve it rather than recalculating it +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef STENCIL_VARIABLE_COEFFICIENT + #ifdef USE_HELMHOLTZ // variable coefficient Helmholtz... + #define apply_op_ijk(x) \ + ( \ + a*alpha[ijk]*x[ijk] \ + -b*h2inv*( \ + + beta_i[ijk+1 ]*( x[ijk+1 ] - x[ijk] ) \ + + beta_i[ijk ]*( x[ijk-1 ] - x[ijk] ) \ + + beta_j[ijk+jStride]*( x[ijk+jStride] - x[ijk] ) \ + + beta_j[ijk ]*( x[ijk-jStride] - x[ijk] ) \ + + beta_k[ijk+kStride]*( x[ijk+kStride] - x[ijk] ) \ + + beta_k[ijk ]*( x[ijk-kStride] - x[ijk] ) \ + ) \ + ) + #else // variable coefficient Poisson... + #define apply_op_ijk(x) \ + ( \ + -b*h2inv*( \ + + beta_i[ijk+1 ]*( x[ijk+1 ] - x[ijk] ) \ + + beta_i[ijk ]*( x[ijk-1 ] - x[ijk] ) \ + + beta_j[ijk+jStride]*( x[ijk+jStride] - x[ijk] ) \ + + beta_j[ijk ]*( x[ijk-jStride] - x[ijk] ) \ + + beta_k[ijk+kStride]*( x[ijk+kStride] - x[ijk] ) \ + + beta_k[ijk ]*( x[ijk-kStride] - x[ijk] ) \ + ) \ + ) + #endif +#else // constant coefficient case... 
+ #define apply_op_ijk(x) \ + ( \ + a*x[ijk] - b*h2inv*( \ + + x[ijk+1 ] \ + + x[ijk-1 ] \ + + x[ijk+jStride] \ + + x[ijk-jStride] \ + + x[ijk+kStride] \ + + x[ijk-kStride] \ + - x[ijk ]*6.0 \ + ) \ + ) +#endif // variable/constant coefficient +//------------------------------------------------------------------------------------------------------------------------------ +int stencil_get_radius(){return(1);} +int stencil_get_shape(){return(STENCIL_SHAPE_STAR);} // needs just faces +//------------------------------------------------------------------------------------------------------------------------------ +void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ + // form restriction of alpha[], beta_*[] coefficients from fromLevel + if(fromLevel != NULL){ + restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL ); + restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I); + restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J); + restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K); + } // else case assumes alpha/beta have been set + + //no need to extrapolate the beta's into the ghost zones (no mixed derivatives in 2nd order) + //extrapolate_betas(level); + //initialize_problem(level,level->h,a,b); // approach used for testing smooth beta's; destroys the black box nature of the solver + + // exchange alpha/beta/... (must be done before calculating Dinv) + exchange_boundary(level,VECTOR_ALPHA ,STENCIL_SHAPE_BOX); // safe + exchange_boundary(level,VECTOR_BETA_I,STENCIL_SHAPE_BOX); + exchange_boundary(level,VECTOR_BETA_J,STENCIL_SHAPE_BOX); + exchange_boundary(level,VECTOR_BETA_K,STENCIL_SHAPE_BOX); + + // black box rebuild of D^{-1}, l1^{-1}, dominant eigenvalue, ... + rebuild_operator_blackbox(level,a,b,2); + + // exchange Dinv/L1inv/... 
+ exchange_boundary(level,VECTOR_DINV ,STENCIL_SHAPE_BOX); // safe + exchange_boundary(level,VECTOR_L1INV,STENCIL_SHAPE_BOX); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef USE_GSRB +//#define GSRB_OOP // no need for out-of-place for 7pt +#define NUM_SMOOTHS 3 // RBRBRB +#include "operators/gsrb.c" +#elif USE_CHEBY +#define NUM_SMOOTHS 1 +#define CHEBYSHEV_DEGREE 6 // i.e. one degree-6 polynomial smoother +#include "operators/chebyshev.c" +#elif USE_JACOBI +#define NUM_SMOOTHS 6 +#include "operators/jacobi.c" +#elif USE_L1JACOBI +#define NUM_SMOOTHS 6 +#include "operators/jacobi.c" +#elif USE_SYMGS +#define NUM_SMOOTHS 2 // FBFB +#include "operators/symgs.c" +#else +#error You must compile with either -DUSE_GSRB, -DUSE_CHEBY, -DUSE_JACOBI, -DUSE_L1JACOBI, or -DUSE_SYMGS +#endif +#include "operators/residual.c" +#include "operators/apply_op.c" +#include "operators/rebuild.c" +//------------------------------------------------------------------------------------------------------------------------------ +#include "operators/blockCopy.c" +#include "operators/misc.c" +#include "operators/exchange_boundary.c" +#include "operators/boundary_fv.c" +#include "operators/restriction.c" +#include "operators/interpolation_v2.c" +//------------------------------------------------------------------------------------------------------------------------------ +void interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_v2(level_f,id_f,prescale_f,level_c,id_c);} +void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_v2(level_f,id_f,prescale_f,level_c,id_c);} +//------------------------------------------------------------------------------------------------------------------------------ +#include "operators/problem.fv.c" 
+//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators.fv4.c b/Util/hpgmg/finite-volume/source/operators.fv4.c new file mode 100644 index 00000000..220b0ff9 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.fv4.c @@ -0,0 +1,211 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef _OPENMP +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "timers.h" +#include "defines.h" +#include "level.h" +#include "operators.h" +//------------------------------------------------------------------------------------------------------------------------------ +#define MyPragma(a) _Pragma(#a) +//------------------------------------------------------------------------------------------------------------------------------ +#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions... + // XL C/C++ 12.01.0000.0009 sets _OPENMP to 201107, but does not support the max clause within a _Pragma(). + // This issue was fixed by XL C/C++ 12.01.0000.0011 + // If you do not have this version of XL C/C++ and run into this bug, uncomment these macros... 
+ //#warning not threading norm() calculations due to issue with XL/C, _Pragma, and reduction(max:bmax) + //#define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + //#define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + //#define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) + #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(max:bmax) ) +#elif _OPENMP // older OpenMP versions don't support the max reduction clause + #warning Threading max reductions requires OpenMP 3.1 (July 2011). Please upgrade your compiler. + #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) +#else // flat MPI should not define any threading... 
+ #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef STENCIL_FUSE_BC + #error This implementation does not support fusion of the boundary conditions with the operator +#endif +//------------------------------------------------------------------------------------------------------------------------------ +void apply_BCs(level_type * level, int x_id, int shape){apply_BCs_v4(level,x_id,shape);} +//------------------------------------------------------------------------------------------------------------------------------ +#define Dinv_ijk() Dinv[ijk] // simply retrieve it rather than recalculating it +//------------------------------------------------------------------------------------------------------------------------------ +#define STENCIL_TWELFTH ( 0.0833333333333333333) // 1.0/12.0; +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef STENCIL_VARIABLE_COEFFICIENT + #ifdef USE_HELMHOLTZ + #define apply_op_ijk(x) \ + ( \ + a*alpha[ijk]*x[ijk] \ + -b*h2inv*( \ + STENCIL_TWELFTH*( \ + + beta_i[ijk ]*( 15.0*(x[ijk-1 ]-x[ijk]) - (x[ijk-2 ]-x[ijk+1 ]) ) \ + + beta_i[ijk+1 ]*( 15.0*(x[ijk+1 ]-x[ijk]) - (x[ijk+2 ]-x[ijk-1 ]) ) \ + + beta_j[ijk ]*( 15.0*(x[ijk-jStride]-x[ijk]) - (x[ijk-2*jStride]-x[ijk+jStride]) ) \ + + beta_j[ijk+jStride]*( 15.0*(x[ijk+jStride]-x[ijk]) - (x[ijk+2*jStride]-x[ijk-jStride]) ) \ + + beta_k[ijk ]*( 15.0*(x[ijk-kStride]-x[ijk]) - (x[ijk-2*kStride]-x[ijk+kStride]) ) \ + + beta_k[ijk+kStride]*( 15.0*(x[ijk+kStride]-x[ijk]) - (x[ijk+2*kStride]-x[ijk-kStride]) ) \ + ) \ + + 0.25*STENCIL_TWELFTH*( \ + + (beta_i[ijk +jStride]-beta_i[ijk -jStride]) * (x[ijk-1 +jStride]-x[ijk+jStride]-x[ijk-1 
-jStride]+x[ijk-jStride]) \ + + (beta_i[ijk +kStride]-beta_i[ijk -kStride]) * (x[ijk-1 +kStride]-x[ijk+kStride]-x[ijk-1 -kStride]+x[ijk-kStride]) \ + + (beta_j[ijk +1 ]-beta_j[ijk -1 ]) * (x[ijk-jStride+1 ]-x[ijk+1 ]-x[ijk-jStride-1 ]+x[ijk-1 ]) \ + + (beta_j[ijk +kStride]-beta_j[ijk -kStride]) * (x[ijk-jStride+kStride]-x[ijk+kStride]-x[ijk-jStride-kStride]+x[ijk-kStride]) \ + + (beta_k[ijk +1 ]-beta_k[ijk -1 ]) * (x[ijk-kStride+1 ]-x[ijk+1 ]-x[ijk-kStride-1 ]+x[ijk-1 ]) \ + + (beta_k[ijk +jStride]-beta_k[ijk -jStride]) * (x[ijk-kStride+jStride]-x[ijk+jStride]-x[ijk-kStride-jStride]+x[ijk-jStride]) \ + \ + + (beta_i[ijk+1 +jStride]-beta_i[ijk+1 -jStride]) * (x[ijk+1 +jStride]-x[ijk+jStride]-x[ijk+1 -jStride]+x[ijk-jStride]) \ + + (beta_i[ijk+1 +kStride]-beta_i[ijk+1 -kStride]) * (x[ijk+1 +kStride]-x[ijk+kStride]-x[ijk+1 -kStride]+x[ijk-kStride]) \ + + (beta_j[ijk+jStride+1 ]-beta_j[ijk+jStride-1 ]) * (x[ijk+jStride+1 ]-x[ijk+1 ]-x[ijk+jStride-1 ]+x[ijk-1 ]) \ + + (beta_j[ijk+jStride+kStride]-beta_j[ijk+jStride-kStride]) * (x[ijk+jStride+kStride]-x[ijk+kStride]-x[ijk+jStride-kStride]+x[ijk-kStride]) \ + + (beta_k[ijk+kStride+1 ]-beta_k[ijk+kStride-1 ]) * (x[ijk+kStride+1 ]-x[ijk+1 ]-x[ijk+kStride-1 ]+x[ijk-1 ]) \ + + (beta_k[ijk+kStride+jStride]-beta_k[ijk+kStride-jStride]) * (x[ijk+kStride+jStride]-x[ijk+jStride]-x[ijk+kStride-jStride]+x[ijk-jStride]) \ + ) \ + ) \ + ) + #else // Poisson... 
+ #define apply_op_ijk(x) \ + ( \ + -b*h2inv*( \ + STENCIL_TWELFTH*( \ + + beta_i[ijk ]*( 15.0*(x[ijk-1 ]-x[ijk]) - (x[ijk-2 ]-x[ijk+1 ]) ) \ + + beta_i[ijk+1 ]*( 15.0*(x[ijk+1 ]-x[ijk]) - (x[ijk+2 ]-x[ijk-1 ]) ) \ + + beta_j[ijk ]*( 15.0*(x[ijk-jStride]-x[ijk]) - (x[ijk-2*jStride]-x[ijk+jStride]) ) \ + + beta_j[ijk+jStride]*( 15.0*(x[ijk+jStride]-x[ijk]) - (x[ijk+2*jStride]-x[ijk-jStride]) ) \ + + beta_k[ijk ]*( 15.0*(x[ijk-kStride]-x[ijk]) - (x[ijk-2*kStride]-x[ijk+kStride]) ) \ + + beta_k[ijk+kStride]*( 15.0*(x[ijk+kStride]-x[ijk]) - (x[ijk+2*kStride]-x[ijk-kStride]) ) \ + ) \ + + 0.25*STENCIL_TWELFTH*( \ + + (beta_i[ijk +jStride]-beta_i[ijk -jStride]) * (x[ijk-1 +jStride]-x[ijk+jStride]-x[ijk-1 -jStride]+x[ijk-jStride]) \ + + (beta_i[ijk +kStride]-beta_i[ijk -kStride]) * (x[ijk-1 +kStride]-x[ijk+kStride]-x[ijk-1 -kStride]+x[ijk-kStride]) \ + + (beta_j[ijk +1 ]-beta_j[ijk -1 ]) * (x[ijk-jStride+1 ]-x[ijk+1 ]-x[ijk-jStride-1 ]+x[ijk-1 ]) \ + + (beta_j[ijk +kStride]-beta_j[ijk -kStride]) * (x[ijk-jStride+kStride]-x[ijk+kStride]-x[ijk-jStride-kStride]+x[ijk-kStride]) \ + + (beta_k[ijk +1 ]-beta_k[ijk -1 ]) * (x[ijk-kStride+1 ]-x[ijk+1 ]-x[ijk-kStride-1 ]+x[ijk-1 ]) \ + + (beta_k[ijk +jStride]-beta_k[ijk -jStride]) * (x[ijk-kStride+jStride]-x[ijk+jStride]-x[ijk-kStride-jStride]+x[ijk-jStride]) \ + \ + + (beta_i[ijk+1 +jStride]-beta_i[ijk+1 -jStride]) * (x[ijk+1 +jStride]-x[ijk+jStride]-x[ijk+1 -jStride]+x[ijk-jStride]) \ + + (beta_i[ijk+1 +kStride]-beta_i[ijk+1 -kStride]) * (x[ijk+1 +kStride]-x[ijk+kStride]-x[ijk+1 -kStride]+x[ijk-kStride]) \ + + (beta_j[ijk+jStride+1 ]-beta_j[ijk+jStride-1 ]) * (x[ijk+jStride+1 ]-x[ijk+1 ]-x[ijk+jStride-1 ]+x[ijk-1 ]) \ + + (beta_j[ijk+jStride+kStride]-beta_j[ijk+jStride-kStride]) * (x[ijk+jStride+kStride]-x[ijk+kStride]-x[ijk+jStride-kStride]+x[ijk-kStride]) \ + + (beta_k[ijk+kStride+1 ]-beta_k[ijk+kStride-1 ]) * (x[ijk+kStride+1 ]-x[ijk+1 ]-x[ijk+kStride-1 ]+x[ijk-1 ]) \ + + 
(beta_k[ijk+kStride+jStride]-beta_k[ijk+kStride-jStride]) * (x[ijk+kStride+jStride]-x[ijk+jStride]-x[ijk+kStride-jStride]+x[ijk-jStride]) \ + ) \ + ) \ + ) + #endif +#else // constant coefficient (don't bother differentiating between Poisson and Helmholtz)... + #define apply_op_ijk(x) \ + ( \ + a*x[ijk] - b*h2inv*STENCIL_TWELFTH*( \ + - 1.0*(x[ijk-2*kStride] + \ + x[ijk-2*jStride] + \ + x[ijk-2 ] + \ + x[ijk+2 ] + \ + x[ijk+2*jStride] + \ + x[ijk+2*kStride] ) \ + +16.0*(x[ijk -kStride] + \ + x[ijk -jStride] + \ + x[ijk -1 ] + \ + x[ijk +1 ] + \ + x[ijk +jStride] + \ + x[ijk +kStride] ) \ + -90.0*(x[ijk ] ) \ + ) \ + ) +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef STENCIL_VARIABLE_COEFFICIENT +int stencil_get_radius(){return(2);} // stencil reaches out 2 cells +int stencil_get_shape(){return(STENCIL_SHAPE_NO_CORNERS);} // needs faces and edges, but not corners +#else +int stencil_get_radius(){return(2);} // stencil reaches out 2 cells +int stencil_get_shape(){return(STENCIL_SHAPE_STAR);} // needs just faces +#endif +//------------------------------------------------------------------------------------------------------------------------------ +void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ + // form restriction of alpha[], beta_*[] coefficients from fromLevel + if(fromLevel != NULL){ + restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL ); + restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I); + restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J); + restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K); + } // else case assumes alpha/beta have been set + + // extrapolate the beta's into the ghost zones (needed for mixed derivatives) + extrapolate_betas(level); + //initialize_problem(level,level->h,a,b); // approach used for testing smooth beta's; 
destroys the black box nature of the solver + + // exchange alpha/beta/... (must be done before calculating Dinv) + exchange_boundary(level,VECTOR_ALPHA ,STENCIL_SHAPE_BOX); // safe + exchange_boundary(level,VECTOR_BETA_I,STENCIL_SHAPE_BOX); + exchange_boundary(level,VECTOR_BETA_J,STENCIL_SHAPE_BOX); + exchange_boundary(level,VECTOR_BETA_K,STENCIL_SHAPE_BOX); + + // black box rebuild of D^{-1}, l1^{-1}, dominant eigenvalue, ... + rebuild_operator_blackbox(level,a,b,4); + + // exchange Dinv/L1inv/... + exchange_boundary(level,VECTOR_DINV ,STENCIL_SHAPE_BOX); // safe + exchange_boundary(level,VECTOR_L1INV,STENCIL_SHAPE_BOX); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef USE_GSRB +#define GSRB_OOP +#define NUM_SMOOTHS 3 // RBRBRB +#include "operators/gsrb.c" +#elif USE_CHEBY +#warning The Chebyshev smoother is currently underperforming for 4th order. Please use -DUSE_GSRB or -DUSE_JACOBI +#define NUM_SMOOTHS 1 +#define CHEBYSHEV_DEGREE 6 // i.e. 
one degree-6 polynomial smoother +#include "operators/chebyshev.c" +#elif USE_JACOBI +#define NUM_SMOOTHS 6 +#include "operators/jacobi.c" +#elif USE_L1JACOBI +#define NUM_SMOOTHS 6 +#include "operators/jacobi.c" +#elif USE_SYMGS +#define NUM_SMOOTHS 2 // FBFB +#include "operators/symgs.c" +#else +#error You must compile with either -DUSE_GSRB, -DUSE_CHEBY, -DUSE_JACOBI, -DUSE_L1JACOBI, or -DUSE_SYMGS +#endif +#include "operators/residual.c" +#include "operators/apply_op.c" +#include "operators/rebuild.c" +//------------------------------------------------------------------------------------------------------------------------------ +#include "operators/blockCopy.c" +#include "operators/misc.c" +#include "operators/exchange_boundary.c" +#include "operators/boundary_fv.c" +#include "operators/restriction.c" +#include "operators/interpolation_v2.c" +#include "operators/interpolation_v4.c" +//------------------------------------------------------------------------------------------------------------------------------ +void interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_v2(level_f,id_f,prescale_f,level_c,id_c);} +void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_v4(level_f,id_f,prescale_f,level_c,id_c);} +//------------------------------------------------------------------------------------------------------------------------------ +#include "operators/problem.fv.c" +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators.h b/Util/hpgmg/finite-volume/source/operators.h new file mode 100644 index 00000000..847244d1 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.h @@ -0,0 +1,52 @@ 
+//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef OPERATORS_H +#define OPERATORS_H +//------------------------------------------------------------------------------------------------------------------------------ +#define RESTRICT_CELL 0 +#define RESTRICT_FACE_I 1 +#define RESTRICT_FACE_J 2 +#define RESTRICT_FACE_K 3 +//------------------------------------------------------------------------------------------------------------------------------ +int stencil_get_radius(); +int stencil_get_shape(); +//------------------------------------------------------------------------------------------------------------------------------ + void apply_op(level_type * level, int Ax_id, int x_id, double a, double b); + void residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b); + void smooth(level_type * level, int phi_id, int rhs_id, double a, double b); + void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b); + void rebuild_operator_blackbox(level_type * level, double a, double b, int colors_in_each_dim); +//------------------------------------------------------------------------------------------------------------------------------ + void restriction(level_type * level_c, int id_c, level_type *level_f, int id_f, int restrictionType); + void interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c); // interpolation used inside a v-cycle + void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c); // interpolation used in the f-cycle to create a new initial guess for the next finner v-cycle 
+//------------------------------------------------------------------------------------------------------------------------------ + void exchange_boundary(level_type * level, int id_a, int shape); + void apply_BCs_p1(level_type * level, int x_id, int shape); // piecewise (cell centered) linear + void apply_BCs_p2(level_type * level, int x_id, int shape); // piecewise (cell centered) quadratic + void apply_BCs_v1(level_type * level, int x_id, int shape); // volumetric linear + void apply_BCs_v2(level_type * level, int x_id, int shape); // volumetric quadratic + void apply_BCs_v4(level_type * level, int x_id, int shape); // volumetric quartic + void extrapolate_betas(level_type * level); +//------------------------------------------------------------------------------------------------------------------------------ +double dot(level_type * level, int id_a, int id_b); +double norm(level_type * level, int id_a); +double mean(level_type * level, int id_a); +double error(level_type * level, int id_a, int id_b); + void add_vectors(level_type * level, int id_c, double scale_a, int id_a, double scale_b, int id_b); + void scale_vector( level_type * level, int id_c, double scale_a, int id_a); + void zero_vector( level_type * level, int id_a); + void shift_vector( level_type * level, int id_c, int id_a, double shift_a); + void mul_vectors(level_type * level, int id_c, double scale, int id_a, int id_b); + void invert_vector( level_type * level, int id_c, double scale_a, int id_a); + void init_vector( level_type * level, int id_a, double scalar); +//------------------------------------------------------------------------------------------------------------------------------ +void color_vector(level_type * level, int id, int colors, int icolor, int jcolor, int kcolor); +void random_vector(level_type * level, int id); +//------------------------------------------------------------------------------------------------------------------------------ + void 
initialize_problem(level_type * level, double hLevel, double a, double b); +//------------------------------------------------------------------------------------------------------------------------------ +#endif diff --git a/Util/hpgmg/finite-volume/source/operators.old.c b/Util/hpgmg/finite-volume/source/operators.old.c new file mode 100644 index 00000000..9ba6db4e --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.old.c @@ -0,0 +1,280 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef _OPENMP +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "timers.h" +#include "defines.h" +#include "level.h" +#include "operators.h" +//------------------------------------------------------------------------------------------------------------------------------ +#define MyPragma(a) _Pragma(#a) +//------------------------------------------------------------------------------------------------------------------------------ +#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions... + // XL C/C++ 12.01.0000.0009 sets _OPENMP to 201107, but does not support the max clause within a _Pragma(). + // This issue was fixed by XL C/C++ 12.01.0000.0011 + // If you do not have this version of XL C/C++ and run into this bug, uncomment these macros... 
+ //#warning not threading norm() calculations due to issue with XL/C, _Pragma, and reduction(max:bmax) + //#define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + //#define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + //#define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) + #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(max:bmax) ) +#elif _OPENMP // older OpenMP versions don't support the max reduction clause + #warning Threading max reductions requires OpenMP 3.1 (July 2011). Please upgrade your compiler. + #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction( +:bsum) ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) +#else // flat MPI should not define any threading... + #define PRAGMA_THREAD_ACROSS_BLOCKS( level,b,nb ) + #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum) + #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax) +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#warning operators.old.c represents an older, lower performance, less performance portable approach to smoothers/residual calculation. It is strongly suggested you use the default operator file. 
+//------------------------------------------------------------------------------------------------------------------------------ +void apply_BCs(level_type * level, int x_id, int shape){apply_BCs_p1(level,x_id,shape);} +//------------------------------------------------------------------------------------------------------------------------------ +#define Dinv_ijk() Dinv[ijk] // simply retrieve it rather than recalculating it +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef STENCIL_VARIABLE_COEFFICIENT +#ifdef USE_HELMHOLTZ // variable coefficient Helmholtz... + #define apply_op_ijk(x) \ + ( \ + a*alpha[ijk]*x[ijk] \ + -b*h2inv*( \ + + beta_i[ijk+1 ]*( x[ijk+1 ] - x[ijk] ) \ + + beta_i[ijk ]*( x[ijk-1 ] - x[ijk] ) \ + + beta_j[ijk+jStride]*( x[ijk+jStride] - x[ijk] ) \ + + beta_j[ijk ]*( x[ijk-jStride] - x[ijk] ) \ + + beta_k[ijk+kStride]*( x[ijk+kStride] - x[ijk] ) \ + + beta_k[ijk ]*( x[ijk-kStride] - x[ijk] ) \ + ) \ + ) +#else // variable coefficient Poisson... + #define apply_op_ijk(x) \ + ( \ + -b*h2inv*( \ + + beta_i[ijk+1 ]*( x[ijk+1 ] - x[ijk] ) \ + + beta_i[ijk ]*( x[ijk-1 ] - x[ijk] ) \ + + beta_j[ijk+jStride]*( x[ijk+jStride] - x[ijk] ) \ + + beta_j[ijk ]*( x[ijk-jStride] - x[ijk] ) \ + + beta_k[ijk+kStride]*( x[ijk+kStride] - x[ijk] ) \ + + beta_k[ijk ]*( x[ijk-kStride] - x[ijk] ) \ + ) \ + ) +#endif +#else // constant coefficient case... 
+ #define apply_op_ijk(x) \ + ( \ + a*x[ijk] - b*h2inv*( \ + + x[ijk+1 ] \ + + x[ijk-1 ] \ + + x[ijk+jStride] \ + + x[ijk-jStride] \ + + x[ijk+kStride] \ + + x[ijk-kStride] \ + - x[ijk ]*6.0 \ + ) \ + ) +#endif // variable/constant coefficient + + +//------------------------------------------------------------------------------------------------------------------------------ +int stencil_get_radius(){return(1);} // 7pt reaches out 1 point +int stencil_get_shape(){return(STENCIL_SHAPE_STAR);} // needs just faces +//------------------------------------------------------------------------------------------------------------------------------ +void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ + if(level->my_rank==0){fprintf(stdout," rebuilding operator for level... h=%e ",level->h);} + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // form restriction of alpha[], beta_*[] coefficients from fromLevel + if(fromLevel != NULL){ + restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL ); + restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I); + restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J); + restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K); + } // else case assumes alpha/beta have been set + + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // exchange alpha/beta/... 
(must be done before calculating Dinv) + exchange_boundary(level,VECTOR_ALPHA ,STENCIL_SHAPE_BOX); // safe + exchange_boundary(level,VECTOR_BETA_I,STENCIL_SHAPE_BOX); + exchange_boundary(level,VECTOR_BETA_J,STENCIL_SHAPE_BOX); + exchange_boundary(level,VECTOR_BETA_K,STENCIL_SHAPE_BOX); + + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // calculate Dinv, L1inv, and estimate the dominant Eigenvalue + double _timeStart = getTime(); + int block; + + double dominant_eigenvalue = -1e9; + + PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,block,level->num_my_blocks,dominant_eigenvalue) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + double h2inv = 1.0/(level->h*level->h); + double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); + double * __restrict__ L1inv = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); + double block_eigenvalue = -1e9; + + for(k=klo;kboundary_condition.type != 
BC_PERIODIC){ + if(level->my_boxes[box].low.i+i-1 < 0)ilo_is_valid = 0.0; + if(level->my_boxes[box].low.j+j-1 < 0)jlo_is_valid = 0.0; + if(level->my_boxes[box].low.k+k-1 < 0)klo_is_valid = 0.0; + if(level->my_boxes[box].low.i+i+1 >= level->dim.i)ihi_is_valid = 0.0; + if(level->my_boxes[box].low.j+j+1 >= level->dim.j)jhi_is_valid = 0.0; + if(level->my_boxes[box].low.k+k+1 >= level->dim.k)khi_is_valid = 0.0; + } + + #ifdef STENCIL_VARIABLE_COEFFICIENT + // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements... + double sumAbsAij = fabs(b*h2inv) * ( + fabs( beta_i[ijk ]*ilo_is_valid )+ + fabs( beta_j[ijk ]*jlo_is_valid )+ + fabs( beta_k[ijk ]*klo_is_valid )+ + fabs( beta_i[ijk+1 ]*ihi_is_valid )+ + fabs( beta_j[ijk+jStride]*jhi_is_valid )+ + fabs( beta_k[ijk+kStride]*khi_is_valid ) + ); + + // center of Gershgorin disc is the diagonal element... + double Aii = a*alpha[ijk] - b*h2inv*( + beta_i[ijk ]*( ilo_is_valid-2.0 )+ + beta_j[ijk ]*( jlo_is_valid-2.0 )+ + beta_k[ijk ]*( klo_is_valid-2.0 )+ + beta_i[ijk+1 ]*( ihi_is_valid-2.0 )+ + beta_j[ijk+jStride]*( jhi_is_valid-2.0 )+ + beta_k[ijk+kStride]*( khi_is_valid-2.0 ) + ); + #else // Constant coefficient versions with fused BC's... + // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements... + double sumAbsAij = fabs(b*h2inv) * ( + ilo_is_valid + + jlo_is_valid + + klo_is_valid + + ihi_is_valid + + jhi_is_valid + + khi_is_valid + ); + + // center of Gershgorin disc is the diagonal element... + double Aii = a - b*h2inv*( + ilo_is_valid + + jlo_is_valid + + klo_is_valid + + ihi_is_valid + + jhi_is_valid + + khi_is_valid - 12.0 + ); + #endif + + // calculate Dinv = D^{-1}, L1inv = ( D+D^{L1} )^{-1}, and the dominant eigenvalue... + Dinv[ijk] = 1.0/Aii; // inverse of the diagonal Aii + //L1inv[ijk] = 1.0/(Aii+sumAbsAij); // inverse of the L1 row norm... 
L1inv = ( D+D^{L1} )^{-1} + if(Aii>=1.5*sumAbsAij)L1inv[ijk] = 1.0/(Aii ); // as suggested by eq 6.5 in Baker et al, "Multigrid smoothers for ultra-parallel computing: additional theory and discussion"... + else L1inv[ijk] = 1.0/(Aii+0.5*sumAbsAij); // + double Di = (Aii + sumAbsAij)/Aii;if(Di>block_eigenvalue)block_eigenvalue=Di; // upper limit to Gershgorin disc == bound on dominant eigenvalue + }}} + if(block_eigenvalue>dominant_eigenvalue){dominant_eigenvalue = block_eigenvalue;} + } + level->timers.blas1 += (double)(getTime()-_timeStart); + + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Reduce the local estimates dominant eigenvalue to a global estimate + #ifdef USE_MPI + double _timeStartAllReduce = getTime(); + double send = dominant_eigenvalue; + MPI_Allreduce(&send,&dominant_eigenvalue,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD); + double _timeEndAllReduce = getTime(); + level->timers.collectives += (double)(_timeEndAllReduce-_timeStartAllReduce); + #endif + if(level->my_rank==0){fprintf(stdout,"eigenvalue_max<%e\n",dominant_eigenvalue);} + level->dominant_eigenvalue_of_DinvA = dominant_eigenvalue; + + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // exchange Dinv/L1inv/... + exchange_boundary(level,VECTOR_DINV ,STENCIL_SHAPE_BOX); // safe + exchange_boundary(level,VECTOR_L1INV,STENCIL_SHAPE_BOX); + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +} + + +//------------------------------------------------------------------------------------------------------------------------------ +#include "operators.old/iterators.c" +#ifdef USE_GSRB +#define NUM_SMOOTHS 2 // RBRB +#include "operators.old/gsrb.c" +#elif USE_CHEBY +#define NUM_SMOOTHS 1 +#define CHEBYSHEV_DEGREE 4 // i.e. 
one degree-4 polynomial smoother +#include "operators.old/chebyshev.c" +#elif USE_JACOBI +#define NUM_SMOOTHS 6 +#include "operators.old/jacobi.c" +#elif USE_L1JACOBI +#define NUM_SMOOTHS 6 +#include "operators.old/jacobi.c" +#elif USE_SYMGS +#define NUM_SMOOTHS 1 +#include "operators.old/symgs.c" +#else +#error You must compile with either -DUSE_GSRB, -DUSE_CHEBY, -DUSE_JACOBI, -DUSE_L1JACOBI, or -DUSE_SYMGS +#endif +#include "operators.old/residual.c" +#include "operators.old/apply_op.c" +//------------------------------------------------------------------------------------------------------------------------------ +#include "operators/blockCopy.c" +#include "operators/misc.c" +#include "operators/exchange_boundary.c" +#include "operators/boundary_fd.c" +#include "operators/restriction.c" +#include "operators/interpolation_p0.c" +#include "operators/interpolation_p1.c" +//------------------------------------------------------------------------------------------------------------------------------ +void interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_p0(level_f,id_f,prescale_f,level_c,id_c);} +void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_p1(level_f,id_f,prescale_f,level_c,id_c);} +//------------------------------------------------------------------------------------------------------------------------------ +#include "operators/problem.p6.c" +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/chebyshev.c b/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/chebyshev.c new file mode 100644 index 00000000..1d42e08b --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/chebyshev.c @@ -0,0 +1,96 @@ 
+//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +// Based on Yousef Saad's Iterative Methods for Sparse Linear Algebra, Algorithm 12.1, page 399 +//------------------------------------------------------------------------------------------------------------------------------ +void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ + if( (level->dominant_eigenvalue_of_DinvA<=0.0) && (level->my_rank==0) )printf("dominant_eigenvalue_of_DinvA <= 0.0 !\n"); + if((CHEBYSHEV_DEGREE*NUM_SMOOTHS)&1){ + printf("error... CHEBYSHEV_DEGREE*NUM_SMOOTHS must be even for the chebyshev smoother...\n"); + exit(0); + } + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + int box,s; + int ghosts = level->box_ghosts; + int communicationAvoiding = ghosts > stencil_get_radius(); + + + // compute the Chebyshev coefficients... + double beta = 1.000*level->dominant_eigenvalue_of_DinvA; +//double alpha = 0.300000*beta; +//double alpha = 0.250000*beta; +//double alpha = 0.166666*beta; + double alpha = 0.125000*beta; + double theta = 0.5*(beta+alpha); // center of the spectral ellipse + double delta = 0.5*(beta-alpha); // major axis? 
+ double sigma = theta/delta; + double rho_n = 1/sigma; // rho_0 + double chebyshev_c1[CHEBYSHEV_DEGREE]; // + c1*(x_n-x_nm1) == rho_n*rho_nm1 + double chebyshev_c2[CHEBYSHEV_DEGREE]; // + c2*(b-Ax_n) + chebyshev_c1[0] = 0.0; + chebyshev_c2[0] = 1/theta; + for(s=1;snum_my_boxes;box++){ + int i,j,k,ss; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int dim = level->my_boxes[box].dim; + const double h2inv = 1.0/(level->h*level->h); + const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); + const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain + + int ghostsToOperateOn=ghosts-1; + for(ss=s;ssmy_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); + x_nm1 = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride);} + else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride); + x_nm1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} + const double c1 = chebyshev_c1[ss%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE. 
+ const double c2 = chebyshev_c2[ss%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE. + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=0-ghostsToOperateOn;kcycles.smooth += (uint64_t)(CycleTime()-_timeStart); + } // s-loop +} diff --git a/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/gsrb.c b/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/gsrb.c new file mode 100644 index 00000000..cbb9e754 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/gsrb.c @@ -0,0 +1,90 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +//#define GSRB_STRIDE2 +//#define GSRB_FP +//------------------------------------------------------------------------------------------------------------------------------ +void smooth(level_type * level, int phi_id, int rhs_id, double a, double b){ + int box,s; + int ghosts = level->box_ghosts; + int communicationAvoiding = ghosts > stencil_get_radius(); + + // if communication-avoiding, need updated RHS for stencils in ghost zones + if(communicationAvoiding)exchange_boundary(level,rhs_id,0); + + for(s=0;s<2*NUM_SMOOTHS;s+=ghosts){ // there are two sweeps per GSRB smooth + exchange_boundary(level,phi_id,stencil_is_star_shaped() && !communicationAvoiding); + apply_BCs(level,phi_id); + + // now do ghosts communication-avoiding smooths on each box... + uint64_t _timeStart = CycleTime(); + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k,ss; + int color000 = (level->my_boxes[box].low.i^level->my_boxes[box].low.j^level->my_boxes[box].low.k)&1; // is element 000 red or black ??? 
(should only be an issue if box dimension is odd) + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int dim = level->my_boxes[box].dim; + const double h2inv = 1.0/(level->h*level->h); + const double * __restrict__ phi = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + double * __restrict__ phi_new = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); + const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain + const double * __restrict__ RedBlack[2] = {level->RedBlack_FP[0] + ghosts*(1+jStride), + level->RedBlack_FP[1] + ghosts*(1+jStride)}; + + + int ghostsToOperateOn=ghosts-1; + for(ss=s;sscycles.smooth += (uint64_t)(CycleTime()-_timeStart); + } // s-loop +} + + +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/jacobi.c b/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/jacobi.c new file mode 100644 index 00000000..4b09da9a --- /dev/null +++ 
b/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/jacobi.c @@ -0,0 +1,79 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include "../timer.h" +//------------------------------------------------------------------------------------------------------------------------------ +void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ + if(NUM_SMOOTHS&1){ + printf("error - NUM_SMOOTHS must be even...\n"); + exit(0); + } + + + int box,s; + int ghosts = level->box_ghosts; + int starShaped = stencil_is_star_shaped(); + int communicationAvoiding = ghosts > stencil_get_radius(); + + #ifdef USE_L1JACOBI + double weight = 1.0; + #else + double weight = 2.0/3.0; + #endif + + + // if communication-avoiding, need updated RHS for stencils in ghost zones + if(communicationAvoiding)exchange_boundary(level,rhs_id,0); + + for(s=0;snum_my_boxes;box++){ + int i,j,k,ss; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int dim = level->my_boxes[box].dim; + const double h2inv = 1.0/(level->h*level->h); + const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + const double * __restrict__ valid = 
level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain + #ifdef USE_L1JACOBI + const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); + #else + const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); + #endif + int ghostsToOperateOn=ghosts-1; + for(ss=s;ssmy_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride);} + else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=0-ghostsToOperateOn;kcycles.smooth += (uint64_t)(CycleTime()-_timeStart); + } // s-loop +} + +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators.old/apply_op.c b/Util/hpgmg/finite-volume/source/operators.old/apply_op.c new file mode 100644 index 00000000..0f7aeab9 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.old/apply_op.c @@ -0,0 +1,40 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){ // y=Ax + // exchange the boundary of x in preparation for Ax + exchange_boundary(level,x_id,stencil_get_shape()); + apply_BCs(level,x_id,stencil_get_shape()); + + // now do Ax proper... 
+ double _timeStart = getTime(); + const int ghosts = level->box_ghosts; + const int jStride = level->box_jStride; + const int kStride = level->box_kStride; + const int dim = level->box_dim; + const double h2inv = 1.0/(level->h*level->h); + int box; + + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + double * __restrict__ Ax = level->my_boxes[box].vectors[ Ax_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=0;ktimers.apply_op += (double)(getTime()-_timeStart); +} +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators.old/chebyshev.c b/Util/hpgmg/finite-volume/source/operators.old/chebyshev.c new file mode 100644 index 00000000..c86be4cf --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.old/chebyshev.c @@ -0,0 +1,93 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +// Based on Yousef Saad's Iterative Methods for Sparse Linear Algebra, Algorithm 12.1, page 399 
+//------------------------------------------------------------------------------------------------------------------------------ +void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ + if((CHEBYSHEV_DEGREE*NUM_SMOOTHS)&1){ + fprintf(stderr,"error... CHEBYSHEV_DEGREE*NUM_SMOOTHS must be even for the chebyshev smoother...\n"); + exit(0); + } + if( (level->dominant_eigenvalue_of_DinvA<=0.0) && (level->my_rank==0) )fprintf(stderr,"dominant_eigenvalue_of_DinvA <= 0.0 !\n"); + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + int box,s; + + + + // compute the Chebyshev coefficients... + double beta = 1.000*level->dominant_eigenvalue_of_DinvA; +//double alpha = 0.300000*beta; +//double alpha = 0.250000*beta; +//double alpha = 0.166666*beta; + double alpha = 0.125000*beta; + double theta = 0.5*(beta+alpha); // center of the spectral ellipse + double delta = 0.5*(beta-alpha); // major axis? + double sigma = theta/delta; + double rho_n = 1/sigma; // rho_0 + double chebyshev_c1[CHEBYSHEV_DEGREE]; // + c1*(x_n-x_nm1) == rho_n*rho_nm1 + double chebyshev_c2[CHEBYSHEV_DEGREE]; // + c2*(b-Ax_n) + chebyshev_c1[0] = 0.0; + chebyshev_c2[0] = 1/theta; + for(s=1;sbox_ghosts; + const int jStride = level->box_jStride; + const int kStride = level->box_kStride; + const int dim = level->box_dim; + const double h2inv = 1.0/(level->h*level->h); + + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = 
level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); + + double * __restrict__ x_np1; + const double * __restrict__ x_n; + const double * __restrict__ x_nm1; + if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); + x_nm1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} + else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); + x_nm1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} + const double c1 = chebyshev_c1[s%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE. + const double c2 = chebyshev_c2[s%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE. + + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=0;ktimers.smooth += (double)(getTime()-_timeStart); + } // s-loop +} diff --git a/Util/hpgmg/finite-volume/source/operators.old/gsrb.c b/Util/hpgmg/finite-volume/source/operators.old/gsrb.c new file mode 100644 index 00000000..bcf54475 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.old/gsrb.c @@ -0,0 +1,133 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#if defined(GSRB_FP) + #warning Overriding default GSRB implementation and using pre-computed 1.0/0.0 FP array for Red-Black to facilitate vectorization... 
+#elif defined(GSRB_STRIDE2) + #if defined(GSRB_OOP) + #warning Overriding default GSRB implementation and using out-of-place and stride-2 accesses to minimize the number of flops + #else + #warning Overriding default GSRB implementation and using stride-2 accesses to minimize the number of flops + #endif +#elif defined(GSRB_BRANCH) + #if defined(GSRB_OOP) + #warning Overriding default GSRB implementation and using out-of-place implementation with an if-then-else on loop indices... + #else + #warning Overriding default GSRB implementation and using if-then-else on loop indices... + #endif +#else +#define GSRB_STRIDE2 // default implementation +#endif +//------------------------------------------------------------------------------------------------------------------------------ +void smooth(level_type * level, int phi_id, int rhs_id, double a, double b){ + int box,s; + for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps per GSRB smooth + + // exchange the ghost zone... + #ifdef GSRB_OOP // out-of-place GSRB ping pongs between x and VECTOR_TEMP + if((s&1)==0){exchange_boundary(level, phi_id,stencil_get_shape());apply_BCs(level, phi_id,stencil_get_shape());} + else{exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());apply_BCs(level,VECTOR_TEMP,stencil_get_shape());} + #else // in-place GSRB only operates on x + exchange_boundary(level, phi_id,stencil_get_shape());apply_BCs(level, phi_id,stencil_get_shape()); + #endif + + + // apply the smoother... 
+ double _timeStart = getTime(); + const int ghosts = level->box_ghosts; + const int jStride = level->box_jStride; + const int kStride = level->box_kStride; + const int dim = level->box_dim; + const double h2inv = 1.0/(level->h*level->h); + + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int color000 = (level->my_boxes[box].low.i^level->my_boxes[box].low.j^level->my_boxes[box].low.k^s)&1; // is element 000 red or black on *THIS* sweep + + const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); + #ifdef GSRB_OOP + const double * __restrict__ phi; + double * __restrict__ phi_new; + if((s&1)==0){phi = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride); + phi_new = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} + else{phi = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); + phi_new = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride);} + #else + const double * __restrict__ phi = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + double * __restrict__ phi_new = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point + #endif + + + #if defined(GSRB_FP) + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=0;kRedBlack_FP + ghosts*(1+jStride) + kStride*((k^color000)&0x1); + for(i=0;itimers.smooth += (double)(getTime()-_timeStart); + } // s-loop +} + + +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators.old/iterators.c b/Util/hpgmg/finite-volume/source/operators.old/iterators.c new file mode 100644 index 00000000..3ea9637c --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.old/iterators.c @@ -0,0 +1,53 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#if 0 +#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions... 
+ #define PRAGMA_THREAD_ACROSS_BOXES( level,box) MyPragma(omp parallel for private(box) if(level->concurrent_boxes>1) num_threads(level->concurrent_boxes) ) + #define PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,level_sum) MyPragma(omp parallel for private(box) if(level->concurrent_boxes>1) num_threads(level->concurrent_boxes) reduction( +:level_sum) schedule(static) ) + #define PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,level_max) MyPragma(omp parallel for private(box) if(level->concurrent_boxes>1) num_threads(level->concurrent_boxes) reduction(max:level_max) schedule(static) ) + #define PRAGMA_THREAD_WITHIN_A_BOX( level,i,j,k) MyPragma(omp parallel for private(i,j,k) if(level->threads_per_box >1) num_threads(level->threads_per_box ) collapse(2) ) + #define PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,box_sum) MyPragma(omp parallel for private(i,j,k) if(level->threads_per_box >1) num_threads(level->threads_per_box ) collapse(2) reduction( +: box_sum) schedule(static) ) + #define PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_max) MyPragma(omp parallel for private(i,j,k) if(level->threads_per_box >1) num_threads(level->threads_per_box ) collapse(2) reduction(max: box_max) schedule(static) ) +#elif _OPENMP // older OpenMP versions don't support the max reduction clause + #define PRAGMA_THREAD_ACROSS_BOXES( level,box) MyPragma(omp parallel for private(box) if(level->concurrent_boxes>1) num_threads(level->concurrent_boxes) ) + #define PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,level_sum) MyPragma(omp parallel for private(box) if(level->concurrent_boxes>1) num_threads(level->concurrent_boxes) reduction( +:level_sum) schedule(static) ) + #define PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,level_max) #warning Threading max reductions requires OpenMP 3.1 (July 2011). Please upgrade your compiler. 
+ #define PRAGMA_THREAD_WITHIN_A_BOX( level,i,j,k) MyPragma(omp parallel for private(i,j,k) if(level->threads_per_box >1) num_threads(level->threads_per_box ) collapse(2) ) + #define PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,box_sum) MyPragma(omp parallel for private(i,j,k) if(level->threads_per_box >1) num_threads(level->threads_per_box ) collapse(2) reduction( +: box_sum) schedule(static) ) + #define PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_max) #warning Threading max reductions requires OpenMP 3.1 (July 2011). Please upgrade your compiler. +#else // flat MPI should not define any threading... + #define PRAGMA_THREAD_ACROSS_BOXES( level,box) + #define PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,level_sum) + #define PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,level_max) + #define PRAGMA_THREAD_WITHIN_A_BOX( level,i,j,k) + #define PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,box_sum) + #define PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_max) +#endif +#else +#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions... + #define PRAGMA_THREAD_ACROSS_BOXES( level,box) + #define PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,level_sum) + #define PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,level_max) + #define PRAGMA_THREAD_WITHIN_A_BOX( level,i,j,k) MyPragma(omp parallel for private(i,j,k) collapse(2) ) + #define PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,box_sum) MyPragma(omp parallel for private(i,j,k) collapse(2) reduction( +: box_sum) schedule(static) ) + #define PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_max) MyPragma(omp parallel for private(i,j,k) collapse(2) reduction(max: box_max) schedule(static) ) +#elif _OPENMP // older OpenMP versions don't support the max reduction clause + #define PRAGMA_THREAD_ACROSS_BOXES( level,box) + #define PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,level_sum) + #define PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,level_max) #warning Threading max reductions requires OpenMP 3.1 (July 2011). Please upgrade your compiler. 
+ #define PRAGMA_THREAD_WITHIN_A_BOX( level,i,j,k) MyPragma(omp parallel for private(i,j,k) collapse(2) ) + #define PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,box_sum) MyPragma(omp parallel for private(i,j,k) collapse(2) reduction( +: box_sum) schedule(static) ) + #define PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_max) #warning Threading max reductions requires OpenMP 3.1 (July 2011). Please upgrade your compiler. +#else // flat MPI should not define any threading... + #define PRAGMA_THREAD_ACROSS_BOXES( level,box) + #define PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,level_sum) + #define PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,level_max) + #define PRAGMA_THREAD_WITHIN_A_BOX( level,i,j,k) + #define PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,box_sum) + #define PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_max) +#endif +#endif +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators.old/jacobi.c b/Util/hpgmg/finite-volume/source/operators.old/jacobi.c new file mode 100644 index 00000000..c930f4e3 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.old/jacobi.c @@ -0,0 +1,66 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +//------------------------------------------------------------------------------------------------------------------------------ +void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ + if(NUM_SMOOTHS&1){ + fprintf(stderr,"error - NUM_SMOOTHS must be even...\n"); + exit(0); + } + + #ifdef USE_L1JACOBI + double weight = 1.0; + #else + double weight = 2.0/3.0; + #endif + + int box,s; + 
for(s=0;sbox_ghosts; + const int jStride = level->box_jStride; + const int kStride = level->box_kStride; + const int dim = level->box_dim; + const double h2inv = 1.0/(level->h*level->h); + + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + #ifdef USE_L1JACOBI + const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); + #else + const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); + #endif + const double * __restrict__ x_n; + double * __restrict__ x_np1; + if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} + else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=0;ktimers.smooth += (double)(getTime()-_timeStart); + } // s-loop +} + +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators.old/misc.c b/Util/hpgmg/finite-volume/source/operators.old/misc.c new file mode 100644 index 00000000..fecc3f73 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.old/misc.c @@ -0,0 
+1,373 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +void zero_vector(level_type * level, int component_id){ + // zero's the entire grid INCLUDING ghost zones... + double _timeStart = getTime(); + int box; + + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + double * __restrict__ grid = level->my_boxes[box].vectors[component_id] + ghosts*(1+jStride+kStride); + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=-ghosts;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void init_vector(level_type * level, int component_id, double scalar){ + // initializes the grid to a scalar while zero'ing the ghost zones... + double _timeStart = getTime(); + int box; + + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + double * __restrict__ grid = level->my_boxes[box].vectors[component_id] + ghosts*(1+jStride+kStride); + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=-ghosts;k=dim) || (j>=dim) || (k>=dim); + grid[ijk] = ghostZone ? 
0.0 : scalar; + }}} + } + level->timers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void add_vectors(level_type * level, int id_c, double scale_a, int id_a, double scale_b, int id_b){ // c=scale_a*id_a + scale_b*id_b + double _timeStart = getTime(); + + int box; + + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + ghosts*(1+jStride+kStride); + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); + double * __restrict__ grid_b = level->my_boxes[box].vectors[id_b] + ghosts*(1+jStride+kStride); + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=0;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void mul_vectors(level_type * level, int id_c, double scale, int id_a, int id_b){ // id_c=scale*id_a*id_b + double _timeStart = getTime(); + + int box; + + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + ghosts*(1+jStride+kStride); + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); + double * __restrict__ grid_b = level->my_boxes[box].vectors[id_b] + ghosts*(1+jStride+kStride); + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + 
for(k=0;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void invert_vector(level_type * level, int id_c, double scale_a, int id_a){ // c[]=scale_a/a[] + double _timeStart = getTime(); + + int box; + + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + ghosts*(1+jStride+kStride); + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=0;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void scale_vector(level_type * level, int id_c, double scale_a, int id_a){ // c[]=scale_a*a[] + double _timeStart = getTime(); + + int box; + + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + ghosts*(1+jStride+kStride); + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=0;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +double dot(level_type * level, int id_a, int id_b){ + double _timeStart = 
getTime(); + + + int box; + double a_dot_b_level = 0.0; + // FIX, schedule(static) is a stand in to guarantee reproducibility... + PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,a_dot_b_level) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + double * __restrict__ grid_b = level->my_boxes[box].vectors[id_b] + ghosts*(1+jStride+kStride); + double a_dot_b_box = 0.0; + PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,a_dot_b_box) + for(k=0;ktimers.blas1 += (double)(getTime()-_timeStart); + + #ifdef USE_MPI + double _timeStartAllReduce = getTime(); + double send = a_dot_b_level; + MPI_Allreduce(&send,&a_dot_b_level,1,MPI_DOUBLE,MPI_SUM,level->MPI_COMM_ALLREDUCE); + double _timeEndAllReduce = getTime(); + level->timers.collectives += (double)(_timeEndAllReduce-_timeStartAllReduce); + #endif + + return(a_dot_b_level); +} + +//------------------------------------------------------------------------------------------------------------------------------ +double norm(level_type * level, int component_id){ // implements the max norm + double _timeStart = getTime(); + + int box; + double max_norm = 0.0; + // FIX, schedule(static) is a stand in to guarantee reproducibility... + PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,max_norm) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + double * __restrict__ grid = level->my_boxes[box].vectors[component_id] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point + double box_norm = 0.0; + PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_norm) + for(k=0;kbox_norm){box_norm=fabs_grid_ijk;} // max norm + }}} + if(box_norm>max_norm){max_norm = box_norm;} + } // box list + level->timers.blas1 += (double)(getTime()-_timeStart); + + #ifdef USE_MPI + double _timeStartAllReduce = getTime(); + double send = max_norm; + MPI_Allreduce(&send,&max_norm,1,MPI_DOUBLE,MPI_MAX,level->MPI_COMM_ALLREDUCE); + double _timeEndAllReduce = getTime(); + level->timers.collectives += (double)(_timeEndAllReduce-_timeStartAllReduce); + #endif + return(max_norm); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +double mean(level_type * level, int id_a){ + double _timeStart = getTime(); + + + int box; + double sum_level = 0.0; + PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,sum_level) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point + double sum_box = 0.0; + PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,sum_box) + for(k=0;ktimers.blas1 += (double)(getTime()-_timeStart); + double ncells_level = (double)level->dim.i*(double)level->dim.j*(double)level->dim.k; + + #ifdef USE_MPI + double _timeStartAllReduce = getTime(); + double send = sum_level; + MPI_Allreduce(&send,&sum_level,1,MPI_DOUBLE,MPI_SUM,level->MPI_COMM_ALLREDUCE); + double _timeEndAllReduce = getTime(); + level->timers.collectives += (double)(_timeEndAllReduce-_timeStartAllReduce); + #endif + + double mean_level = sum_level / ncells_level; + return(mean_level); +} + + +void shift_vector(level_type * level, int id_c, int id_a, double shift_a){ + double _timeStart = getTime(); + + + int box; + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=0;ktimers.blas1 += (double)(getTime()-_timeStart); +} + +//------------------------------------------------------------------------------------------------------------------------------ +double error(level_type * level, int id_a, int id_b){ + double h3 = level->h * level->h * level->h; + add_vectors(level,VECTOR_TEMP,1.0,id_a,-1.0,id_b); // VECTOR_TEMP = id_a - id_b + double max = norm(level,VECTOR_TEMP); return(max); // max norm of error function + double L2 = sqrt( dot(level,VECTOR_TEMP,VECTOR_TEMP)*h3);return( L2); // normalized L2 error ? 
+} + + +//------------------------------------------------------------------------------------------------------------------------------ +void color_vector(level_type * level, int id, int colors_in_each_dim, int icolor, int jcolor, int kcolor){ + double _timeStart = getTime(); + int box; + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + double * __restrict__ grid = level->my_boxes[box].vectors[id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=0;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void random_vector(level_type * level, int id){ + double _timeStart = getTime(); + int box; + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + double * __restrict__ grid = level->my_boxes[box].vectors[id] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point + + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=0;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators.old/residual.c b/Util/hpgmg/finite-volume/source/operators.old/residual.c new file mode 100644 index 00000000..03f75f27 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.old/residual.c @@ -0,0 +1,44 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +// calculate res_id = rhs_id - A(x_id) + +void residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b){ + // exchange the boundary for x in prep for Ax... + exchange_boundary(level,x_id,stencil_get_shape()); + apply_BCs(level,x_id,stencil_get_shape()); + + // now do residual/restriction proper... + double _timeStart = getTime(); + const int ghosts = level->box_ghosts; + const int jStride = level->box_jStride; + const int kStride = level->box_kStride; + const int dim = level->box_dim; + const double h2inv = 1.0/(level->h*level->h); + int box; + + PRAGMA_THREAD_ACROSS_BOXES(level,box) + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point + const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + double * __restrict__ res = level->my_boxes[box].vectors[ res_id] + ghosts*(1+jStride+kStride); + + PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) + for(k=0;ktimers.residual += (double)(getTime()-_timeStart); +} + diff --git a/Util/hpgmg/finite-volume/source/operators.old/symgs.c b/Util/hpgmg/finite-volume/source/operators.old/symgs.c new file mode 100644 index 00000000..51eebeeb --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators.old/symgs.c @@ -0,0 +1,59 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +void smooth(level_type * level, int phi_id, int rhs_id, double a, double b){ + int box,s; + + for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps (forward/backward) per GS smooth + exchange_boundary(level,phi_id,stencil_get_shape()); + apply_BCs(level,phi_id,stencil_get_shape()); + + // now do ghosts communication-avoiding smooths on each box... 
+ double _timeStart = getTime(); + const int ghosts = level->box_ghosts; + const int jStride = level->box_jStride; + const int kStride = level->box_kStride; + const int dim = level->box_dim; + const double h2inv = 1.0/(level->h*level->h); + + #ifdef _OPENMP + #pragma omp parallel for + #endif + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + double * __restrict__ phi = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); + + + if( (s&0x1)==0 ){ // forward sweep... 
hard to thread + for(k=0;k=0;k--){ + for(j=dim-1;j>=0;j--){ + for(i=dim-1;i>=0;i--){ + int ijk = i + j*jStride + k*kStride; + double Ax = apply_op_ijk(phi); + phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax); + }}} + } + + } // boxes + level->timers.smooth += (double)(getTime()-_timeStart); + } // s-loop +} + + +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators/apply_op.c b/Util/hpgmg/finite-volume/source/operators/apply_op.c new file mode 100644 index 00000000..bc3a1c98 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/apply_op.c @@ -0,0 +1,48 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +// Applies the linear operator specified in the apply_op_ijk macro to vector x_id and stores the result in Ax_id +// This requires exchanging a ghost zone and/or enforcing a boundary condition. +// NOTE, Ax_id and x_id must be distinct +void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){ + // exchange the boundary of x in preparation for Ax + exchange_boundary(level,x_id,stencil_get_shape()); + apply_BCs(level,x_id,stencil_get_shape()); + + // now do Ax proper... 
+ double _timeStart = getTime(); + int block; + + PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const double h2inv = 1.0/(level->h*level->h); + const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + double * __restrict__ Ax = level->my_boxes[box].vectors[ Ax_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + + for(k=klo;ktimers.apply_op += (double)(getTime()-_timeStart); +} +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators/blockCopy.c b/Util/hpgmg/finite-volume/source/operators/blockCopy.c new file mode 100644 index 00000000..bd387c2c --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/blockCopy.c @@ -0,0 +1,136 @@ 
+//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +static inline void CopyBlock(level_type *level, int id, blockCopy_type *block){ + // copy 3D array from read_i,j,k of read[] to write_i,j,k in write[] + int dim_i = block->dim.i; + int dim_j = block->dim.j; + int dim_k = block->dim.k; + + int read_i = block->read.i; + int read_j = block->read.j; + int read_k = block->read.k; + int read_jStride = block->read.jStride; + int read_kStride = block->read.kStride; + + int write_i = block->write.i; + int write_j = block->write.j; + int write_k = block->write.k; + int write_jStride = block->write.jStride; + int write_kStride = block->write.kStride; + + const double * __restrict__ read = block->read.ptr; + double * __restrict__ write = block->write.ptr; + + if(block->read.box >=0){ + read_jStride = level->my_boxes[block->read.box ].jStride; + read_kStride = level->my_boxes[block->read.box ].kStride; + read = level->my_boxes[ block->read.box].vectors[id] + level->box_ghosts*(1+ read_jStride+ read_kStride); + } + if(block->write.box>=0){ + write_jStride = level->my_boxes[block->write.box].jStride; + write_kStride = level->my_boxes[block->write.box].kStride; + write = level->my_boxes[block->write.box].vectors[id] + level->box_ghosts*(1+write_jStride+write_kStride); + } + + + int i,j,k; + if(dim_i==1){ // be smart and don't have an inner loop from 0 to 0 + for(k=0;kdim.i; + int dim_j = block->dim.j; + int dim_k = block->dim.k; + + int read_i = block->read.i; + int read_j = block->read.j; + int read_k = block->read.k; + int read_jStride = block->read.jStride; + int read_kStride = block->read.kStride; + + int write_i = block->write.i; + int write_j = block->write.j; + int write_k = block->write.k; 
+ int write_jStride = block->write.jStride; + int write_kStride = block->write.kStride; + + const double * __restrict__ read = block->read.ptr; + double * __restrict__ write = block->write.ptr; + + if(block->read.box >=0){ + read_jStride = level->my_boxes[block->read.box ].jStride; + read_kStride = level->my_boxes[block->read.box ].kStride; + read = level->my_boxes[ block->read.box].vectors[id] + level->box_ghosts*(1+ read_jStride+ read_kStride); + } + if(block->write.box>=0){ + write_jStride = level->my_boxes[block->write.box].jStride; + write_kStride = level->my_boxes[block->write.box].kStride; + write = level->my_boxes[block->write.box].vectors[id] + level->box_ghosts*(1+write_jStride+write_kStride); + } + + int i,j,k; + for(k=0;k=STENCIL_MAX_SHAPES)shape=STENCIL_SHAPE_BOX; // shape must be < STENCIL_MAX_SHAPES in order to safely index into boundary_condition.blocks[] + if(level->boundary_condition.type == BC_PERIODIC)return; // no BC's to apply ! + + const int faces[27] = {0,0,0,0,1,0,0,0,0, 0,1,0,1,0,1,0,1,0, 0,0,0,0,1,0,0,0,0}; + const int edges[27] = {0,1,0,1,0,1,0,1,0, 1,0,1,0,0,0,1,0,1, 0,1,0,1,0,1,0,1,0}; + const int corners[27] = {1,0,1,0,0,0,1,0,1, 0,0,0,0,0,0,0,0,0, 1,0,1,0,0,0,1,0,1}; + + int buffer; + double _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->boundary_condition.num_blocks[shape]) + for(buffer=0;bufferboundary_condition.num_blocks[shape];buffer++){ + double scale = 1.0; + if( faces[level->boundary_condition.blocks[shape][buffer].subtype])scale=-1.0; + if( edges[level->boundary_condition.blocks[shape][buffer].subtype])scale= 1.0; + if(corners[level->boundary_condition.blocks[shape][buffer].subtype])scale=-1.0; + + int i,j,k; + const int box = level->boundary_condition.blocks[shape][buffer].read.box; + const int dim_i = level->boundary_condition.blocks[shape][buffer].dim.i; + const int dim_j = level->boundary_condition.blocks[shape][buffer].dim.j; + const int dim_k = 
level->boundary_condition.blocks[shape][buffer].dim.k; + const int ilo = level->boundary_condition.blocks[shape][buffer].read.i; + const int jlo = level->boundary_condition.blocks[shape][buffer].read.j; + const int klo = level->boundary_condition.blocks[shape][buffer].read.k; + const int normal = 26-level->boundary_condition.blocks[shape][buffer].subtype; // invert the normal vector + + // hard code for box to box BC's + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + double * __restrict__ x = level->my_boxes[box].vectors[x_id] + level->my_boxes[box].ghosts*(1+jStride+kStride); + + // convert normal vector into pointer offsets... + const int di = (((normal % 3) )-1); + const int dj = (((normal % 9)/3)-1); + const int dk = (((normal / 9) )-1); + const int stride = di + dj*jStride + dk*kStride; + + if(dim_i==1){ + for(k=0;ktimers.boundary_conditions += (double)(getTime()-_timeStart); +} + +//------------------------------------------------------------------------------------------------------------------------------ +void apply_BCs_p2(level_type * level, int x_id, int shape){ + // For cell-centered, we need to fill in the ghost zones to apply any BC's + // This code does a simple piecewise quadratic interpolation for homogeneous dirichlet (0 on boundary) + // Nominally, this is first performed across faces, then to edges, then to corners. + // + if(shape>=STENCIL_MAX_SHAPES)shape=STENCIL_SHAPE_BOX; // shape must be < STENCIL_MAX_SHAPES in order to safely index into boundary_condition.blocks[] + if(level->boundary_condition.type == BC_PERIODIC)return; // no BC's to apply ! 
+ if(level->box_dim<2){apply_BCs_p1(level,x_id,shape);return;} + + const int faces[27] = {0,0,0,0,1,0,0,0,0, 0,1,0,1,0,1,0,1,0, 0,0,0,0,1,0,0,0,0}; + const int edges[27] = {0,1,0,1,0,1,0,1,0, 1,0,1,0,0,0,1,0,1, 0,1,0,1,0,1,0,1,0}; + const int corners[27] = {1,0,1,0,0,0,1,0,1, 0,0,0,0,0,0,0,0,0, 1,0,1,0,0,0,1,0,1}; + + int buffer; + double _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->boundary_condition.num_blocks[shape]) + for(buffer=0;bufferboundary_condition.num_blocks[shape];buffer++){ + int i,j,k; + const int box = level->boundary_condition.blocks[shape][buffer].read.box; + const int dim_i = level->boundary_condition.blocks[shape][buffer].dim.i; + const int dim_j = level->boundary_condition.blocks[shape][buffer].dim.j; + const int dim_k = level->boundary_condition.blocks[shape][buffer].dim.k; + const int ilo = level->boundary_condition.blocks[shape][buffer].read.i; + const int jlo = level->boundary_condition.blocks[shape][buffer].read.j; + const int klo = level->boundary_condition.blocks[shape][buffer].read.k; + const int normal = 26-level->boundary_condition.blocks[shape][buffer].subtype; // invert the normal vector + + // hard code for box to box BC's + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + double * __restrict__ x = level->my_boxes[box].vectors[x_id] + level->my_boxes[box].ghosts*(1+jStride+kStride); + + // convert normal vector into pointer offsets... + const int di = (((normal % 3) )-1)*1; + const int dj = (((normal % 9)/3)-1)*jStride; + const int dk = (((normal / 9) )-1)*kStride; + + if(faces[normal]){ + // + // /------/------/------/ + // / ?? 
/ -2 / 1/3 / + // /------/------/------/ + // + const int stride = di+dj+dk; + const int stride2 = stride*2; + for(k=0;ktimers.boundary_conditions += (double)(getTime()-_timeStart); +} diff --git a/Util/hpgmg/finite-volume/source/operators/boundary_fv.c b/Util/hpgmg/finite-volume/source/operators/boundary_fv.c new file mode 100644 index 00000000..180aba93 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/boundary_fv.c @@ -0,0 +1,683 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +void apply_BCs_v1(level_type * level, int x_id, int shape){ + // For cell-centered, we need to fill in the ghost zones to apply any BC's + // This code does a simple linear interpolation for homogeneous dirichlet (0 on boundary) + // Nominally, this is first performed across faces, then to edges, then to corners. + // In this implementation, these three steps are fused + // + // . . . . . . . . . . . . . . . . . . . . + // . . . . . . + // . ? . ? . .+x(0,0).-x(0,0). + // . . . . . . + // . . . . +---0---+-- . . . . +-------+-- + // . | | . | | + // . ? 0 x(0,0)| .-x(0,0)| x(0,0)| + // . | | . | | + // . . . . +-------+-- . . . . +-------+-- + // . | | . | | + // + // + if(shape>=STENCIL_MAX_SHAPES)shape=STENCIL_SHAPE_BOX; // shape must be < STENCIL_MAX_SHAPES in order to safely index into boundary_condition.blocks[] + if(level->boundary_condition.type == BC_PERIODIC)return; // no BC's to apply ! 
+ + const int faces[27] = {0,0,0,0,1,0,0,0,0, 0,1,0,1,0,1,0,1,0, 0,0,0,0,1,0,0,0,0}; + const int edges[27] = {0,1,0,1,0,1,0,1,0, 1,0,1,0,0,0,1,0,1, 0,1,0,1,0,1,0,1,0}; + const int corners[27] = {1,0,1,0,0,0,1,0,1, 0,0,0,0,0,0,0,0,0, 1,0,1,0,0,0,1,0,1}; + + int buffer; + double _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->boundary_condition.num_blocks[shape]) + for(buffer=0;bufferboundary_condition.num_blocks[shape];buffer++){ + double scale = 1.0; + if( faces[level->boundary_condition.blocks[shape][buffer].subtype])scale=-1.0; + if( edges[level->boundary_condition.blocks[shape][buffer].subtype])scale= 1.0; + if(corners[level->boundary_condition.blocks[shape][buffer].subtype])scale=-1.0; + + int i,j,k; + const int box = level->boundary_condition.blocks[shape][buffer].read.box; + const int dim_i = level->boundary_condition.blocks[shape][buffer].dim.i; + const int dim_j = level->boundary_condition.blocks[shape][buffer].dim.j; + const int dim_k = level->boundary_condition.blocks[shape][buffer].dim.k; + const int ilo = level->boundary_condition.blocks[shape][buffer].read.i; + const int jlo = level->boundary_condition.blocks[shape][buffer].read.j; + const int klo = level->boundary_condition.blocks[shape][buffer].read.k; + const int normal = 26-level->boundary_condition.blocks[shape][buffer].subtype; // invert the normal vector + + // hard code for box to box BC's + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + double * __restrict__ x = level->my_boxes[box].vectors[x_id] + level->box_ghosts*(1+jStride+kStride); + + // convert normal vector into pointer offsets... 
+ const int di = (((normal % 3) )-1); + const int dj = (((normal % 9)/3)-1); + const int dk = (((normal / 9) )-1); + const int stride = di + dj*jStride + dk*kStride; + + if(dim_i==1){ + for(k=0;ktimers.boundary_conditions += (double)(getTime()-_timeStart); +} + +//------------------------------------------------------------------------------------------------------------------------------ +// For cell-centered/averaged, one must fill in a ghost zone in order to affect a boundary condition +// The argument shape indicates on which regions of the domain (not the individual boxes) must the boundary condition be enforced. +// If shape exceeds the range of defined shapes, the boundary condition will be applied to all faces, edges, and corners +// This code performs a simple quadratic volume averages extrapolation for homogeneous dirichlet (0 on boundary) +// Nominally, this is first performed across faces, then to edges, then to corners. +// In this implementation, these three steps are fused +// This code will apply the BC only to the first ghost zone. Subsequent (2nd, 3rd, ...) ghost zones will be zero'd +// This code will drop order if one attempts to apply quadratic BC's to boxes of less than 2^3 +void apply_BCs_v2(level_type * level, int x_id, int shape){ + const int box_dim = level->box_dim; + const int box_ghosts = level->box_ghosts; + if(shape>=STENCIL_MAX_SHAPES)shape=STENCIL_SHAPE_BOX; // shape must be < STENCIL_MAX_SHAPES in order to safely index into boundary_condition.blocks[] + if(level->boundary_condition.type == BC_PERIODIC)return; // no BC's to apply ! 
+ if(level->box_dim<2){apply_BCs_v1(level,x_id,shape);return;} + + const int faces[27] = {0,0,0,0,1,0,0,0,0, 0,1,0,1,0,1,0,1,0, 0,0,0,0,1,0,0,0,0}; + const int edges[27] = {0,1,0,1,0,1,0,1,0, 1,0,1,0,0,0,1,0,1, 0,1,0,1,0,1,0,1,0}; + const int corners[27] = {1,0,1,0,0,0,1,0,1, 0,0,0,0,0,0,0,0,0, 1,0,1,0,0,0,1,0,1}; + + int buffer; + double _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->boundary_condition.num_blocks[shape]) + for(buffer=0;bufferboundary_condition.num_blocks[shape];buffer++){ + int i,j,k; + const int box = level->boundary_condition.blocks[shape][buffer].read.box; + const int dim_i = level->boundary_condition.blocks[shape][buffer].dim.i; + const int dim_j = level->boundary_condition.blocks[shape][buffer].dim.j; + const int dim_k = level->boundary_condition.blocks[shape][buffer].dim.k; + const int ilo = level->boundary_condition.blocks[shape][buffer].read.i; + const int jlo = level->boundary_condition.blocks[shape][buffer].read.j; + const int klo = level->boundary_condition.blocks[shape][buffer].read.k; + const int subtype = level->boundary_condition.blocks[shape][buffer].subtype; + //const int normal = 26-subtype; + + // hard code for box to box BC's + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const double * __restrict__ x = level->my_boxes[box].vectors[x_id] + level->box_ghosts*(1+jStride+kStride); + double * __restrict__ xn = level->my_boxes[box].vectors[x_id] + level->box_ghosts*(1+jStride+kStride); // physically the same, but use different pointers for read/write + + // zero out entire ghost region when not all points will be updated... + if(box_ghosts>1){ + for(k=0;k ds + // | |/ + // +---+ + // + int r=-1,rStride=-1,dim_r=-1,rlo=-1; + int s=-1,sStride=-1,ds=-1; + int t=-1,tStride=-1,dt=-1; + // the four 16-point stencils (symmetry allows you to view it as 12 4-point) can point in 12 different directions... 
+ switch(subtype){ + case 1:rlo=ilo;dim_r=dim_i;rStride= 1;s= -1;sStride=jStride;t= -1;tStride=kStride;ds= sStride;dt= tStride;break; // i-edge, low j, low k + case 3:rlo=jlo;dim_r=dim_j;rStride=jStride;s= -1;sStride= 1;t= -1;tStride=kStride;ds= sStride;dt= tStride;break; // j-edge, low i, low k + case 5:rlo=jlo;dim_r=dim_j;rStride=jStride;s=box_dim;sStride= 1;t= -1;tStride=kStride;ds=-sStride;dt= tStride;break; // j-edge, high i, low k + case 7:rlo=ilo;dim_r=dim_i;rStride= 1;s=box_dim;sStride=jStride;t= -1;tStride=kStride;ds=-sStride;dt= tStride;break; // i-edge, high j, low k + case 9:rlo=klo;dim_r=dim_k;rStride=kStride;s= -1;sStride= 1;t= -1;tStride=jStride;ds= sStride;dt= tStride;break; // k-edge, low i, low j + case 11:rlo=klo;dim_r=dim_k;rStride=kStride;s=box_dim;sStride= 1;t= -1;tStride=jStride;ds=-sStride;dt= tStride;break; // k-edge, high i, low j + case 15:rlo=klo;dim_r=dim_k;rStride=kStride;s= -1;sStride= 1;t=box_dim;tStride=jStride;ds= sStride;dt=-tStride;break; // k-edge, low i, high j + case 17:rlo=klo;dim_r=dim_k;rStride=kStride;s=box_dim;sStride= 1;t=box_dim;tStride=jStride;ds=-sStride;dt=-tStride;break; // k-edge, high i, high j + case 19:rlo=ilo;dim_r=dim_i;rStride= 1;s= -1;sStride=jStride;t=box_dim;tStride=kStride;ds= sStride;dt=-tStride;break; // i-edge, low j, high k + case 21:rlo=jlo;dim_r=dim_j;rStride=jStride;s= -1;sStride= 1;t=box_dim;tStride=kStride;ds= sStride;dt=-tStride;break; // j-edge, low i, high k + case 23:rlo=jlo;dim_r=dim_j;rStride=jStride;s=box_dim;sStride= 1;t=box_dim;tStride=kStride;ds=-sStride;dt=-tStride;break; // j-edge, high i, high k + case 25:rlo=ilo;dim_r=dim_i;rStride= 1;s=box_dim;sStride=jStride;t=box_dim;tStride=kStride;ds=-sStride;dt=-tStride;break; // i-edge, high j, high k + } + // FIX... 
optimize for rStride==1 (unit-stride) + for(r=0;rtimers.boundary_conditions += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// For cell-centered/averaged, one must fill in a ghost zone in order to affect a boundary condition +// The argument shape indicates on which regions of the domain (not the individual boxes) must the boundary condition be enforced. +// If shape exceeds the range of defined shapes, the boundary condition will be applied to all faces, edges, and corners +// This code performs a simple quartic volume averages extrapolation for homogeneous dirichlet (0 on boundary) +// Nominally, this is first performed across faces, then to edges, then to corners. +// In this implementation, these three steps are fused +// It is considered an error to call this routine if the domain has less that two ghost zones +// This code will drop order if one attempts to apply quartic BC's to boxes of less than 4^3 +void apply_BCs_v4(level_type * level, int x_id, int shape){ + const int box_dim = level->box_dim; + const int box_ghosts = level->box_ghosts; + if(shape>=STENCIL_MAX_SHAPES)shape=STENCIL_SHAPE_BOX; // shape must be < STENCIL_MAX_SHAPES in order to safely index into boundary_condition.blocks[] + if(level->boundary_condition.type == BC_PERIODIC)return; // no BC's to apply ! + if(box_ghosts<2){fprintf(stderr,"called quartic BC's with only 1 ghost zone!!!\n");exit(0);} +//if(box_dim <4){fprintf(stderr,"called quartic BC's with boxes < 4^3 \n");exit(0);} + if(box_dim <4){apply_BCs_v2(level,x_id,shape);return;} // FIX... is it safe to drop order on the boundary on coarse grids ?? 
+ + const int faces[27] = {0,0,0,0,1,0,0,0,0, 0,1,0,1,0,1,0,1,0, 0,0,0,0,1,0,0,0,0}; + const int edges[27] = {0,1,0,1,0,1,0,1,0, 1,0,1,0,0,0,1,0,1, 0,1,0,1,0,1,0,1,0}; + const int corners[27] = {1,0,1,0,0,0,1,0,1, 0,0,0,0,0,0,0,0,0, 1,0,1,0,0,0,1,0,1}; + + int buffer; + double _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->boundary_condition.num_blocks[shape]) + for(buffer=0;bufferboundary_condition.num_blocks[shape];buffer++){ + int i,j,k; + const int box = level->boundary_condition.blocks[shape][buffer].read.box; + const int dim_i = level->boundary_condition.blocks[shape][buffer].dim.i; + const int dim_j = level->boundary_condition.blocks[shape][buffer].dim.j; + const int dim_k = level->boundary_condition.blocks[shape][buffer].dim.k; + const int ilo = level->boundary_condition.blocks[shape][buffer].read.i; + const int jlo = level->boundary_condition.blocks[shape][buffer].read.j; + const int klo = level->boundary_condition.blocks[shape][buffer].read.k; + const int subtype = level->boundary_condition.blocks[shape][buffer].subtype; + //const int normal = 26-subtype; + + // hard code for box to box BC's + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const double * __restrict__ x = level->my_boxes[box].vectors[x_id] + level->box_ghosts*(1+jStride+kStride); + double * __restrict__ xn = level->my_boxes[box].vectors[x_id] + level->box_ghosts*(1+jStride+kStride); // physically the same, but use different pointers for read/write + + double OneTwelfth = 1.0/12.0; + + // zero out entire ghost region when not all points will be updated... + if(box_ghosts>2){ + for(k=0;k ds + // / / /|/ + // +---+---+ | + // | | |/ + // +---+---+ + // + // ^ dt + // | + // :....:....|....:....:....:....:. + // : f4 : n4 | 14 : 24 : 34 : 44 : + // :....:....|....:....:....:....:. + // : f3 : n3 | 13 : 23 : 33 : 43 : + // :....:....|....:....:....:....:. 
+ // : f2 : n2 | 12 : 22 : 32 : 42 : + // :....:....|....:....:....:....:. + // : f1 : n1 | 11 : 21 : 31 : 41 : + // ----------+---------------------> ds + // : ?? : ?? | + // :....:....| + // : ?? : ?? | + // :....:....| + // + int r=-1,rStride=-1,dim_r=-1,rlo=-1; + int s=-1,sStride=-1,ds=-1; + int t=-1,tStride=-1,dt=-1; + // the four 16-point stencils (symmetry allows you to view it as 12 4-point) can point in 12 different directions... + switch(subtype){ + case 1:rlo=ilo;dim_r=dim_i;rStride= 1;s= -1;sStride=jStride;t= -1;tStride=kStride;ds= sStride;dt= tStride;break; // i-edge, low j, low k + case 3:rlo=jlo;dim_r=dim_j;rStride=jStride;s= -1;sStride= 1;t= -1;tStride=kStride;ds= sStride;dt= tStride;break; // j-edge, low i, low k + case 5:rlo=jlo;dim_r=dim_j;rStride=jStride;s=box_dim;sStride= 1;t= -1;tStride=kStride;ds=-sStride;dt= tStride;break; // j-edge, high i, low k + case 7:rlo=ilo;dim_r=dim_i;rStride= 1;s=box_dim;sStride=jStride;t= -1;tStride=kStride;ds=-sStride;dt= tStride;break; // i-edge, high j, low k + case 9:rlo=klo;dim_r=dim_k;rStride=kStride;s= -1;sStride= 1;t= -1;tStride=jStride;ds= sStride;dt= tStride;break; // k-edge, low i, low j + case 11:rlo=klo;dim_r=dim_k;rStride=kStride;s=box_dim;sStride= 1;t= -1;tStride=jStride;ds=-sStride;dt= tStride;break; // k-edge, high i, low j + case 15:rlo=klo;dim_r=dim_k;rStride=kStride;s= -1;sStride= 1;t=box_dim;tStride=jStride;ds= sStride;dt=-tStride;break; // k-edge, low i, high j + case 17:rlo=klo;dim_r=dim_k;rStride=kStride;s=box_dim;sStride= 1;t=box_dim;tStride=jStride;ds=-sStride;dt=-tStride;break; // k-edge, high i, high j + case 19:rlo=ilo;dim_r=dim_i;rStride= 1;s= -1;sStride=jStride;t=box_dim;tStride=kStride;ds= sStride;dt=-tStride;break; // i-edge, low j, high k + case 21:rlo=jlo;dim_r=dim_j;rStride=jStride;s= -1;sStride= 1;t=box_dim;tStride=kStride;ds= sStride;dt=-tStride;break; // j-edge, low i, high k + case 23:rlo=jlo;dim_r=dim_j;rStride=jStride;s=box_dim;sStride= 
1;t=box_dim;tStride=kStride;ds=-sStride;dt=-tStride;break; // j-edge, high i, high k + case 25:rlo=ilo;dim_r=dim_i;rStride= 1;s=box_dim;sStride=jStride;t=box_dim;tStride=kStride;ds=-sStride;dt=-tStride;break; // i-edge, high j, high k + } + // FIX... optimize for rStride==1 (unit-stride) + // FIX... optimize for ds==+/-1 + double * __restrict__ ghost00 = (double * __restrict__)(x ); // convince the compiler that read (box) & write (ghost zone) are disjoint + double * __restrict__ ghost01 = (double * __restrict__)(x -dt); // convince the compiler that read (box) & write (ghost zone) are disjoint + double * __restrict__ ghost10 = (double * __restrict__)(x-ds ); // convince the compiler that read (box) & write (ghost zone) are disjoint + double * __restrict__ ghost11 = (double * __restrict__)(x-ds-dt); // convince the compiler that read (box) & write (ghost zone) are disjoint + for(r=0;rtimers.boundary_conditions += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void extrapolate_betas(level_type * level){ + if(level->boundary_condition.type == BC_PERIODIC)return; // no BC's to apply ! 
+ int shape=0; + + int buffer; + double _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->boundary_condition.num_blocks[shape]) + for(buffer=0;bufferboundary_condition.num_blocks[shape];buffer++){ + int i,j,k; + const int box = level->boundary_condition.blocks[shape][buffer].read.box; + const int dim_i = level->boundary_condition.blocks[shape][buffer].dim.i; + const int dim_j = level->boundary_condition.blocks[shape][buffer].dim.j; + const int dim_k = level->boundary_condition.blocks[shape][buffer].dim.k; + const int ilo = level->boundary_condition.blocks[shape][buffer].read.i; + const int jlo = level->boundary_condition.blocks[shape][buffer].read.j; + const int klo = level->boundary_condition.blocks[shape][buffer].read.k; + + // total hack/reuse of the existing boundary list... + // however, whereas boundary subtype represents the normal to the domain at that point, + // one needs the box-relative (not domain-relative) normal when extending the face averaged beta's into the ghost zones + // Thus, I reuse the list to tell me which areas are beyond the domain boundary, but must calculate their normals here + int subtype = 13; + if(ilo < 0)subtype-=1; + if(jlo < 0)subtype-=3; + if(klo < 0)subtype-=9; + if(ilo >= level->box_dim)subtype+=1; + if(jlo >= level->box_dim)subtype+=3; + if(klo >= level->box_dim)subtype+=9; + const int normal = 26-subtype; // invert the normal vector + + // hard code for box to box BC's + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + level->box_ghosts*(1+jStride+kStride); + double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + level->box_ghosts*(1+jStride+kStride); + double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + level->box_ghosts*(1+jStride+kStride); + + // convert normal vector into pointer offsets... 
+ const int di = (((normal % 3) )-1); + const int dj = (((normal % 9)/3)-1); + const int dk = (((normal / 9) )-1); + + // beta_i should be extrapolated in the j- and k-directions, but not i + // beta_j should be extrapolated in the i- and k-directions, but not j + // beta_k should be extrapolated in the i- and j-directions, but not k + // e.g. + // ................................. + // . . . . . + // . . ??? . ??? . . + // . . . . . + // ........+-------+-------+........ + // . / / / . + // . ??? /// ??? . + // . / / / . + // ........+-------+-------+........ + // . / / / . + // . ??? /// ??? . + // . / / / . + // ........+-------+-------+........ k j + // . . . . . ^ ^ + // . . ??? . ??? . . | / + // . . . . . |/ + // ................................. +-----> i + // + const int biStride = dj*jStride + dk*kStride; + const int bjStride = di + dk*kStride; + const int bkStride = di + dj*jStride ; + + // note, + // the face values normal to i should have been filled via RESTRICT_I (skip them) + // the face values normal to j should have been filled via RESTRICT_J (skip them) + // the face values normal to k should have been filled via RESTRICT_K (skip them) + if(level->box_dim>=5){ + // quartic extrapolation... + for(k=0;kbox_dim>=4){ + // cubic extrapolation... + for(k=0;kbox_dim>=2){ + // linear extrapolation... 
+ for(k=0;ktimers.boundary_conditions += (double)(getTime()-_timeStart); +} + +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators/chebyshev.c b/Util/hpgmg/finite-volume/source/operators/chebyshev.c new file mode 100644 index 00000000..311ebf40 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/chebyshev.c @@ -0,0 +1,99 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +// Based on Yousef Saad's Iterative Methods for Sparse Linear Algebra, Algorithm 12.1, page 399 +//------------------------------------------------------------------------------------------------------------------------------ +void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ + if((CHEBYSHEV_DEGREE*NUM_SMOOTHS)&1){ + fprintf(stderr,"error... CHEBYSHEV_DEGREE*NUM_SMOOTHS must be even for the chebyshev smoother...\n"); + exit(0); + } + if( (level->dominant_eigenvalue_of_DinvA<=0.0) && (level->my_rank==0) )fprintf(stderr,"dominant_eigenvalue_of_DinvA <= 0.0 !\n"); + + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + int s; + int block; + + + // compute the Chebyshev coefficients... + double beta = 1.000*level->dominant_eigenvalue_of_DinvA; +//double alpha = 0.300000*beta; +//double alpha = 0.250000*beta; +//double alpha = 0.166666*beta; + double alpha = 0.125000*beta; + double theta = 0.5*(beta+alpha); // center of the spectral ellipse + double delta = 0.5*(beta-alpha); // major axis? 
+ double sigma = theta/delta; + double rho_n = 1/sigma; // rho_0 + double chebyshev_c1[CHEBYSHEV_DEGREE]; // + c1*(x_n-x_nm1) == rho_n*rho_nm1 + double chebyshev_c2[CHEBYSHEV_DEGREE]; // + c2*(b-Ax_n) + chebyshev_c1[0] = 0.0; + chebyshev_c2[0] = 1/theta; + for(s=1;snum_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int ghosts = level->box_ghosts; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const double h2inv = 1.0/(level->h*level->h); + const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); + + double * __restrict__ x_np1; + const double * __restrict__ x_n; + const double * __restrict__ x_nm1; + if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); + x_nm1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} + else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + 
ghosts*(1+jStride+kStride); + x_nm1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} + const double c1 = chebyshev_c1[s%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE. + const double c2 = chebyshev_c2[s%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE. + + for(k=klo;ktimers.smooth += (double)(getTime()-_timeStart); + } // s-loop +} diff --git a/Util/hpgmg/finite-volume/source/operators/exchange_boundary.c b/Util/hpgmg/finite-volume/source/operators/exchange_boundary.c new file mode 100644 index 00000000..d2884739 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/exchange_boundary.c @@ -0,0 +1,117 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +// perform a (intra-level) ghost zone exchange on vector id +// NOTE exchange_boundary() only exchanges the boundary. +// It will not enforce any boundary conditions +// BC's are either the responsibility of a separate function or should be fused into the stencil +// The argument shape indicates which of faces, edges, and corners on each box must be exchanged +// If the specified shape exceeds the range of defined shapes, the code will default to STENCIL_SHAPE_BOX (i.e. 
exchange faces, edges, and corners) +void exchange_boundary(level_type * level, int id, int shape){ + double _timeCommunicationStart = getTime(); + double _timeStart,_timeEnd; + + if(shape>=STENCIL_MAX_SHAPES)shape=STENCIL_SHAPE_BOX; // shape must be < STENCIL_MAX_SHAPES in order to safely index into exchange_ghosts[] + int my_tag = (level->tag<<4) | shape; + int buffer=0; + int n; + + #ifdef USE_MPI + int nMessages = level->exchange_ghosts[shape].num_recvs + level->exchange_ghosts[shape].num_sends; + MPI_Request *recv_requests = level->exchange_ghosts[shape].requests; + MPI_Request *send_requests = level->exchange_ghosts[shape].requests + level->exchange_ghosts[shape].num_recvs; + + // loop through packed list of MPI receives and prepost Irecv's... + if(level->exchange_ghosts[shape].num_recvs>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;nexchange_ghosts[shape].num_recvs;n++){ + MPI_Irecv(level->exchange_ghosts[shape].recv_buffers[n], + level->exchange_ghosts[shape].recv_sizes[n], + MPI_DOUBLE, + level->exchange_ghosts[shape].recv_ranks[n], + my_tag, + MPI_COMM_WORLD, + &recv_requests[n] + ); + } + _timeEnd = getTime(); + level->timers.ghostZone_recv += (_timeEnd-_timeStart); + } + + + // pack MPI send buffers... + if(level->exchange_ghosts[shape].num_blocks[0]){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->exchange_ghosts[shape].num_blocks[0]) + for(buffer=0;bufferexchange_ghosts[shape].num_blocks[0];buffer++){ + CopyBlock(level,id,&level->exchange_ghosts[shape].blocks[0][buffer]); + } + _timeEnd = getTime(); + level->timers.ghostZone_pack += (_timeEnd-_timeStart); + } + + + // loop through MPI send buffers and post Isend's... 
+ if(level->exchange_ghosts[shape].num_sends>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;nexchange_ghosts[shape].num_sends;n++){ + MPI_Isend(level->exchange_ghosts[shape].send_buffers[n], + level->exchange_ghosts[shape].send_sizes[n], + MPI_DOUBLE, + level->exchange_ghosts[shape].send_ranks[n], + my_tag, + MPI_COMM_WORLD, + &send_requests[n] + ); + } + _timeEnd = getTime(); + level->timers.ghostZone_send += (_timeEnd-_timeStart); + } + #endif + + + // exchange locally... try and hide within Isend latency... + if(level->exchange_ghosts[shape].num_blocks[1]){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->exchange_ghosts[shape].num_blocks[1]) + for(buffer=0;bufferexchange_ghosts[shape].num_blocks[1];buffer++){ + CopyBlock(level,id,&level->exchange_ghosts[shape].blocks[1][buffer]); + } + _timeEnd = getTime(); + level->timers.ghostZone_local += (_timeEnd-_timeStart); + } + + + // wait for MPI to finish... 
+ #ifdef USE_MPI + if(nMessages){ + _timeStart = getTime(); + MPI_Waitall(nMessages,level->exchange_ghosts[shape].requests,level->exchange_ghosts[shape].status); + _timeEnd = getTime(); + level->timers.ghostZone_wait += (_timeEnd-_timeStart); + } + + + // unpack MPI receive buffers + if(level->exchange_ghosts[shape].num_blocks[2]){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->exchange_ghosts[shape].num_blocks[2]) + for(buffer=0;bufferexchange_ghosts[shape].num_blocks[2];buffer++){ + CopyBlock(level,id,&level->exchange_ghosts[shape].blocks[2][buffer]); + } + _timeEnd = getTime(); + level->timers.ghostZone_unpack += (_timeEnd-_timeStart); + } + #endif + + + level->timers.ghostZone_total += (double)(getTime()-_timeCommunicationStart); +} diff --git a/Util/hpgmg/finite-volume/source/operators/gsrb.c b/Util/hpgmg/finite-volume/source/operators/gsrb.c new file mode 100644 index 00000000..aad48371 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/gsrb.c @@ -0,0 +1,136 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#if defined(GSRB_FP) + #warning Overriding default GSRB implementation and using pre-computed 1.0/0.0 FP array for Red-Black to facilitate vectorization... 
+#elif defined(GSRB_STRIDE2) + #if defined(GSRB_OOP) + #warning Overriding default GSRB implementation and using out-of-place and stride-2 accesses to minimize the number of flops + #else + #warning Overriding default GSRB implementation and using stride-2 accesses to minimize the number of flops + #endif +#elif defined(GSRB_BRANCH) + #if defined(GSRB_OOP) + #warning Overriding default GSRB implementation and using out-of-place implementation with an if-then-else on loop indices... + #else + #warning Overriding default GSRB implementation and using if-then-else on loop indices... + #endif +#else +#define GSRB_STRIDE2 // default implementation +#endif +//------------------------------------------------------------------------------------------------------------------------------ +void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ + int block,s; + for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps per GSRB smooth + + // exchange the ghost zone... + #ifdef GSRB_OOP // out-of-place GSRB ping pongs between x and VECTOR_TEMP + if((s&1)==0){exchange_boundary(level, x_id,stencil_get_shape());apply_BCs(level, x_id,stencil_get_shape());} + else{exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());apply_BCs(level,VECTOR_TEMP,stencil_get_shape());} + #else // in-place GSRB only operates on x + exchange_boundary(level, x_id,stencil_get_shape());apply_BCs(level, x_id,stencil_get_shape()); + #endif + + // apply the smoother... + double _timeStart = getTime(); + + // loop over all block/tiles this process owns... 
+ PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + + int i,j,k; + const double h2inv = 1.0/(level->h*level->h); + const int ghosts = level->box_ghosts; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int color000 = (level->my_boxes[box].low.i^level->my_boxes[box].low.j^level->my_boxes[box].low.k^s)&1; // is element 000 red or black on *THIS* sweep + + const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); + #ifdef GSRB_OOP + const double * __restrict__ x_n; + double * __restrict__ x_np1; + if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} + else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} + #else + const double * __restrict__ x_n = level->my_boxes[box].vectors[ 
x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + double * __restrict__ x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + #endif + + + #if defined(GSRB_FP) + for(k=klo;kRedBlack_FP + ghosts*(1+jStride) + kStride*((k^color000)&0x1); + for(j=jlo;jtimers.smooth += (double)(getTime()-_timeStart); + } // s-loop +} + + +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators/interpolation_p0.c b/Util/hpgmg/finite-volume/source/operators/interpolation_p0.c new file mode 100644 index 00000000..51f7bfd0 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/interpolation_p0.c @@ -0,0 +1,159 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +static inline void interpolation_p0_block(level_type *level_f, int id_f, double prescale_f, level_type *level_c, int id_c, blockCopy_type *block){ + // interpolate 3D array from read_i,j,k of read[] to write_i,j,k in write[] + int dim_i = block->dim.i<<1; // calculate the dimensions of the resultant fine block + int dim_j = block->dim.j<<1; + int dim_k = block->dim.k<<1; + + int read_i = block->read.i; + int read_j = block->read.j; + int read_k = block->read.k; + int read_jStride = block->read.jStride; + int read_kStride = block->read.kStride; + + int write_i = block->write.i; + int write_j = block->write.j; + int write_k = block->write.k; + int write_jStride = block->write.jStride; + int write_kStride = block->write.kStride; + + double * __restrict__ read = block->read.ptr; + double * __restrict__ write = block->write.ptr; + 
if(block->read.box >=0){ + read = level_c->my_boxes[ block->read.box].vectors[id_c] + level_c->my_boxes[ block->read.box].ghosts*(1+level_c->my_boxes[ block->read.box].jStride+level_c->my_boxes[ block->read.box].kStride); + read_jStride = level_c->my_boxes[block->read.box ].jStride; + read_kStride = level_c->my_boxes[block->read.box ].kStride; + } + if(block->write.box>=0){ + write = level_f->my_boxes[block->write.box].vectors[id_f] + level_f->my_boxes[block->write.box].ghosts*(1+level_f->my_boxes[block->write.box].jStride+level_f->my_boxes[block->write.box].kStride); + write_jStride = level_f->my_boxes[block->write.box].jStride; + write_kStride = level_f->my_boxes[block->write.box].kStride; + } + + + int i,j,k; + for(k=0;k>1)+ read_i) + (((j>>1)+ read_j)* read_jStride) + (((k>>1)+ read_k)* read_kStride); + write[write_ijk] = prescale_f*write[write_ijk] + read[read_ijk]; // CAREFUL !!! you must guarantee you zero'd the MPI buffers(write[]) and destination boxes at some point to avoid 0.0*NaN or 0.0*inf + }}} + +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// perform a (inter-level) piecewise constant interpolation +void interpolation_p0(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){ + double _timeCommunicationStart = getTime(); + double _timeStart,_timeEnd; + int my_tag = (level_f->tag<<4) | 0x6; + int buffer=0; + int n; + + + #ifdef USE_MPI + // by convention, level_f allocates a combined array of requests for both level_f recvs and level_c sends... + int nMessages = level_c->interpolation.num_sends + level_f->interpolation.num_recvs; + MPI_Request *recv_requests = level_f->interpolation.requests; + MPI_Request *send_requests = level_f->interpolation.requests + level_f->interpolation.num_recvs; + + + // loop through packed list of MPI receives and prepost Irecv's... 
+ if(level_f->interpolation.num_recvs>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;ninterpolation.num_recvs;n++){ + MPI_Irecv(level_f->interpolation.recv_buffers[n], + level_f->interpolation.recv_sizes[n], + MPI_DOUBLE, + level_f->interpolation.recv_ranks[n], + my_tag, + MPI_COMM_WORLD, + &recv_requests[n] + ); + } + _timeEnd = getTime(); + level_f->timers.interpolation_recv += (_timeEnd-_timeStart); + } + + + // pack MPI send buffers... + if(level_c->interpolation.num_blocks[0]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[0]) + for(buffer=0;bufferinterpolation.num_blocks[0];buffer++){ + // !!! prescale==0 because you don't want to increment the MPI buffer + interpolation_p0_block(level_f,id_f,0.0,level_c,id_c,&level_c->interpolation.blocks[0][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_pack += (_timeEnd-_timeStart); + } + + + // loop through MPI send buffers and post Isend's... + if(level_c->interpolation.num_sends>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;ninterpolation.num_sends;n++){ + MPI_Isend(level_c->interpolation.send_buffers[n], + level_c->interpolation.send_sizes[n], + MPI_DOUBLE, + level_c->interpolation.send_ranks[n], + my_tag, + MPI_COMM_WORLD, + &send_requests[n] + ); + } + _timeEnd = getTime(); + level_f->timers.interpolation_send += (_timeEnd-_timeStart); + } + #endif + + + // perform local interpolation... try and hide within Isend latency... 
+ if(level_c->interpolation.num_blocks[1]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[1]) + for(buffer=0;bufferinterpolation.num_blocks[1];buffer++){ + interpolation_p0_block(level_f,id_f,prescale_f,level_c,id_c,&level_c->interpolation.blocks[1][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_local += (_timeEnd-_timeStart); + } + + + // wait for MPI to finish... + #ifdef USE_MPI + if(nMessages>0){ + _timeStart = getTime(); + MPI_Waitall(nMessages,level_f->interpolation.requests,level_f->interpolation.status); + _timeEnd = getTime(); + level_f->timers.interpolation_wait += (_timeEnd-_timeStart); + } + + + // unpack MPI receive buffers + if(level_f->interpolation.num_blocks[2]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->interpolation.num_blocks[2]) + for(buffer=0;bufferinterpolation.num_blocks[2];buffer++){ + IncrementBlock(level_f,id_f,prescale_f,&level_f->interpolation.blocks[2][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_unpack += (_timeEnd-_timeStart); + } + #endif + + + level_f->timers.interpolation_total += (double)(getTime()-_timeCommunicationStart); +} diff --git a/Util/hpgmg/finite-volume/source/operators/interpolation_p1.c b/Util/hpgmg/finite-volume/source/operators/interpolation_p1.c new file mode 100644 index 00000000..9a05232b --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/interpolation_p1.c @@ -0,0 +1,179 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +//------------------------------------------------------------------------------------------------------------------------------ +static 
inline void interpolation_p1_block(level_type *level_f, int id_f, double prescale_f, level_type *level_c, int id_c, blockCopy_type *block){ + // interpolate 3D array from read_i,j,k of read[] to write_i,j,k in write[] + int write_dim_i = block->dim.i<<1; // calculate the dimensions of the resultant fine block + int write_dim_j = block->dim.j<<1; + int write_dim_k = block->dim.k<<1; + + int read_i = block->read.i; + int read_j = block->read.j; + int read_k = block->read.k; + int read_jStride = block->read.jStride; + int read_kStride = block->read.kStride; + + int write_i = block->write.i; + int write_j = block->write.j; + int write_k = block->write.k; + int write_jStride = block->write.jStride; + int write_kStride = block->write.kStride; + + double * __restrict__ read = block->read.ptr; + double * __restrict__ write = block->write.ptr; + if(block->read.box >=0){ + read = level_c->my_boxes[ block->read.box].vectors[id_c] + level_c->my_boxes[ block->read.box].ghosts*(1+level_c->my_boxes[ block->read.box].jStride+level_c->my_boxes[ block->read.box].kStride); + read_jStride = level_c->my_boxes[block->read.box ].jStride; + read_kStride = level_c->my_boxes[block->read.box ].kStride; + } + if(block->write.box>=0){ + write = level_f->my_boxes[block->write.box].vectors[id_f] + level_f->my_boxes[block->write.box].ghosts*(1+level_f->my_boxes[block->write.box].jStride+level_f->my_boxes[block->write.box].kStride); + write_jStride = level_f->my_boxes[block->write.box].jStride; + write_kStride = level_f->my_boxes[block->write.box].kStride; + } + + + int i,j,k; + for(k=0;k>1)+ read_i) + (((j>>1)+ read_j)* read_jStride) + (((k>>1)+ read_k)* read_kStride); + // + // | o | o | + // +---+---+---+---+ + // | | x | x | | + // + // CAREFUL !!! you must guarantee you zero'd the MPI buffers(write[]) and destination boxes at some point to avoid 0.0*NaN or 0.0*inf + // piecewise linear interpolation... 
NOTE, BC's must have been previously applied + write[write_ijk] = prescale_f*write[write_ijk] + + 0.421875*read[read_ijk ] + + 0.140625*read[read_ijk +delta_k] + + 0.140625*read[read_ijk +delta_j ] + + 0.046875*read[read_ijk +delta_j+delta_k] + + 0.140625*read[read_ijk+delta_i ] + + 0.046875*read[read_ijk+delta_i +delta_k] + + 0.046875*read[read_ijk+delta_i+delta_j ] + + 0.015625*read[read_ijk+delta_i+delta_j+delta_k]; + }}} + +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// perform a (inter-level) piecewise linear interpolation +void interpolation_p1(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){ + exchange_boundary(level_c,id_c,STENCIL_SHAPE_BOX); + apply_BCs_p1(level_c,id_c,STENCIL_SHAPE_BOX); + + double _timeCommunicationStart = getTime(); + double _timeStart,_timeEnd; + int buffer=0; + int n; + int my_tag = (level_f->tag<<4) | 0x7; + + + #ifdef USE_MPI + // by convention, level_f allocates a combined array of requests for both level_f recvs and level_c sends... + int nMessages = level_c->interpolation.num_sends + level_f->interpolation.num_recvs; + MPI_Request *recv_requests = level_f->interpolation.requests; + MPI_Request *send_requests = level_f->interpolation.requests + level_f->interpolation.num_recvs; + + + // loop through packed list of MPI receives and prepost Irecv's... + if(level_f->interpolation.num_recvs>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;ninterpolation.num_recvs;n++){ + MPI_Irecv(level_f->interpolation.recv_buffers[n], + level_f->interpolation.recv_sizes[n], + MPI_DOUBLE, + level_f->interpolation.recv_ranks[n], + my_tag, + MPI_COMM_WORLD, + &recv_requests[n] + ); + } + _timeEnd = getTime(); + level_f->timers.interpolation_recv += (_timeEnd-_timeStart); + } + + + // pack MPI send buffers... 
+ if(level_c->interpolation.num_blocks[0]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[0]) + for(buffer=0;bufferinterpolation.num_blocks[0];buffer++){ + // !!! prescale==0 because you don't want to increment the MPI buffer + interpolation_p1_block(level_f,id_f,0.0,level_c,id_c,&level_c->interpolation.blocks[0][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_pack += (_timeEnd-_timeStart); + } + + + // loop through MPI send buffers and post Isend's... + if(level_c->interpolation.num_sends>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;ninterpolation.num_sends;n++){ + MPI_Isend(level_c->interpolation.send_buffers[n], + level_c->interpolation.send_sizes[n], + MPI_DOUBLE, + level_c->interpolation.send_ranks[n], + my_tag, + MPI_COMM_WORLD, + &send_requests[n] + ); + } + _timeEnd = getTime(); + level_f->timers.interpolation_send += (_timeEnd-_timeStart); + } + #endif + + + // perform local interpolation... try and hide within Isend latency... + if(level_c->interpolation.num_blocks[1]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[1]) + for(buffer=0;bufferinterpolation.num_blocks[1];buffer++){ + interpolation_p1_block(level_f,id_f,prescale_f,level_c,id_c,&level_c->interpolation.blocks[1][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_local += (_timeEnd-_timeStart); + } + + + // wait for MPI to finish... 
+ #ifdef USE_MPI + if(nMessages>0){ + _timeStart = getTime(); + MPI_Waitall(nMessages,level_f->interpolation.requests,level_f->interpolation.status); + _timeEnd = getTime(); + level_f->timers.interpolation_wait += (_timeEnd-_timeStart); + } + + + // unpack MPI receive buffers + if(level_f->interpolation.num_blocks[2]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->interpolation.num_blocks[2]) + for(buffer=0;bufferinterpolation.num_blocks[2];buffer++){ + IncrementBlock(level_f,id_f,prescale_f,&level_f->interpolation.blocks[2][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_unpack += (_timeEnd-_timeStart); + } + #endif + + + level_f->timers.interpolation_total += (double)(getTime()-_timeCommunicationStart); +} diff --git a/Util/hpgmg/finite-volume/source/operators/interpolation_p2.c b/Util/hpgmg/finite-volume/source/operators/interpolation_p2.c new file mode 100644 index 00000000..6ad1fa62 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/interpolation_p2.c @@ -0,0 +1,338 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +//------------------------------------------------------------------------------------------------------------------------------ +static inline void interpolation_p2_block(level_type *level_f, int id_f, double prescale_f, level_type *level_c, int id_c, blockCopy_type *block){ + // interpolate 3D array from read_i,j,k of read[] to write_i,j,k in write[] + int write_dim_i = block->dim.i<<1; // calculate the dimensions of the resultant fine block + int write_dim_j = block->dim.j<<1; + int write_dim_k = block->dim.k<<1; + + int read_i = block->read.i; + int read_j = block->read.j; + int 
read_k = block->read.k; + int read_jStride = block->read.jStride; + int read_kStride = block->read.kStride; + + int write_i = block->write.i; + int write_j = block->write.j; + int write_k = block->write.k; + int write_jStride = block->write.jStride; + int write_kStride = block->write.kStride; + + const double * __restrict__ read = block->read.ptr; + double * __restrict__ write = block->write.ptr; + + if(block->read.box >=0){ + read_jStride = level_c->my_boxes[block->read.box ].jStride; + read_kStride = level_c->my_boxes[block->read.box ].kStride; + read = level_c->my_boxes[ block->read.box].vectors[id_c] + level_c->box_ghosts*(1+ read_jStride+ read_kStride); + } + if(block->write.box>=0){ + write_jStride = level_f->my_boxes[block->write.box].jStride; + write_kStride = level_f->my_boxes[block->write.box].kStride; + write = level_f->my_boxes[block->write.box].vectors[id_f] + level_f->box_ghosts*(1+write_jStride+write_kStride); + } + + + #ifdef USE_NAIVE_INTERP + int i,j,k; + double OneOver32Cubed = 1.0/32768.0; + for(k=0;k>1)+ read_i) + (((j>>1)+ read_j)* read_jStride) + (((k>>1)+ read_k)* read_kStride); + // + // | -3/32 | 30/32 | 5/32 | + // |---+---|---+---|---+---| + // | | | | x | | | + // + write[write_ijk] = prescale_f*write[write_ijk] + + OneOver32Cubed*( + -27.0*read[read_ijk-delta_i-delta_j-delta_k] + + 270.0*read[read_ijk -delta_j-delta_k] + + 45.0*read[read_ijk+delta_i-delta_j-delta_k] + + 270.0*read[read_ijk-delta_i -delta_k] + + -2700.0*read[read_ijk -delta_k] + + -450.0*read[read_ijk+delta_i -delta_k] + + 45.0*read[read_ijk-delta_i+delta_j-delta_k] + + -450.0*read[read_ijk +delta_j-delta_k] + + -75.0*read[read_ijk+delta_i+delta_j-delta_k] + + + 270.0*read[read_ijk-delta_i-delta_j ] + + -2700.0*read[read_ijk -delta_j ] + + -450.0*read[read_ijk+delta_i-delta_j ] + + -2700.0*read[read_ijk-delta_i ] + + 27000.0*read[read_ijk ] + + 4500.0*read[read_ijk+delta_i ] + + -450.0*read[read_ijk-delta_i+delta_j ] + + 4500.0*read[read_ijk +delta_j ] + + 
750.0*read[read_ijk+delta_i+delta_j ] + + + 45.0*read[read_ijk-delta_i-delta_j+delta_k] + + -450.0*read[read_ijk -delta_j+delta_k] + + -75.0*read[read_ijk+delta_i-delta_j+delta_k] + + -450.0*read[read_ijk-delta_i +delta_k] + + 4500.0*read[read_ijk +delta_k] + + 750.0*read[read_ijk+delta_i +delta_k] + + -75.0*read[read_ijk-delta_i+delta_j+delta_k] + + 750.0*read[read_ijk +delta_j+delta_k] + + 125.0*read[read_ijk+delta_i+delta_j+delta_k] + ); + + }}} + #else + int i,j,k; + int ii,jj,kk; + double w0 = 5.0/32.0; + double w1 = 30.0/32.0; + double w2 = -3.0/32.0; + for(k=0,kk=0;k : | f | f | : + // | | | | : | | | : + // +-------+-------+-------+ :.......+---+---+.......: + // | | | | : | | | : + // | c | c | c | : | f | f | : + // | | | | : | | | : + // +-------+-------+-------+ :.......+---+---+.......: + // + const double f0c00 = ( w1*c100 + w0*c000 + w2*c200 ); + const double f1c00 = ( w1*c100 + w2*c000 + w0*c200 ); + const double f0c10 = ( w1*c110 + w0*c010 + w2*c210 ); + const double f1c10 = ( w1*c110 + w2*c010 + w0*c210 ); + const double f0c20 = ( w1*c120 + w0*c020 + w2*c220 ); + const double f1c20 = ( w1*c120 + w2*c020 + w0*c220 ); + + const double f0c01 = ( w1*c101 + w0*c001 + w2*c201 ); + const double f1c01 = ( w1*c101 + w2*c001 + w0*c201 ); + const double f0c11 = ( w1*c111 + w0*c011 + w2*c211 ); + const double f1c11 = ( w1*c111 + w2*c011 + w0*c211 ); + const double f0c21 = ( w1*c121 + w0*c021 + w2*c221 ); + const double f1c21 = ( w1*c121 + w2*c021 + w0*c221 ); + + const double f0c02 = ( w1*c102 + w0*c002 + w2*c202 ); + const double f1c02 = ( w1*c102 + w2*c002 + w0*c202 ); + const double f0c12 = ( w1*c112 + w0*c012 + w2*c212 ); + const double f1c12 = ( w1*c112 + w2*c012 + w0*c212 ); + const double f0c22 = ( w1*c122 + w0*c022 + w2*c222 ); + const double f1c22 = ( w1*c122 + w2*c022 + w0*c222 ); + + // interpolate in j to create fine ij / coarse k points... 
+ // + // :.......+---+---+.......: :.......:.......:.......: + // : | | | : : : : : + // : | | | : : : : : + // : | | | : : : : : + // :.......+---+---+.......: :.......+---+---+.......: + // : | | | : : | | | : + // : | | | : -> : +---+---+ : + // : | | | : : | | | : + // :.......+---+---+.......: :.......+---+---+.......: + // : | | | : : : : : + // : | | | : : : : : + // : | | | : : : : : + // :.......+---+---+.......: :.......:.......:.......: + // + const double f00c0 = ( w1*f0c10 + w0*f0c00 + w2*f0c20 ); + const double f10c0 = ( w1*f1c10 + w0*f1c00 + w2*f1c20 ); + const double f01c0 = ( w1*f0c10 + w2*f0c00 + w0*f0c20 ); + const double f11c0 = ( w1*f1c10 + w2*f1c00 + w0*f1c20 ); + + const double f00c1 = ( w1*f0c11 + w0*f0c01 + w2*f0c21 ); + const double f10c1 = ( w1*f1c11 + w0*f1c01 + w2*f1c21 ); + const double f01c1 = ( w1*f0c11 + w2*f0c01 + w0*f0c21 ); + const double f11c1 = ( w1*f1c11 + w2*f1c01 + w0*f1c21 ); + + const double f00c2 = ( w1*f0c12 + w0*f0c02 + w2*f0c22 ); + const double f10c2 = ( w1*f1c12 + w0*f1c02 + w2*f1c22 ); + const double f01c2 = ( w1*f0c12 + w2*f0c02 + w0*f0c22 ); + const double f11c2 = ( w1*f1c12 + w2*f1c02 + w0*f1c22 ); + + // interpolate in k to create fine ijk points... + const double f000 = ( w1*f00c1 + w0*f00c0 + w2*f00c2 ); + const double f100 = ( w1*f10c1 + w0*f10c0 + w2*f10c2 ); + const double f010 = ( w1*f01c1 + w0*f01c0 + w2*f01c2 ); + const double f110 = ( w1*f11c1 + w0*f11c0 + w2*f11c2 ); + const double f001 = ( w1*f00c1 + w2*f00c0 + w0*f00c2 ); + const double f101 = ( w1*f10c1 + w2*f10c0 + w0*f10c2 ); + const double f011 = ( w1*f01c1 + w2*f01c0 + w0*f01c2 ); + const double f111 = ( w1*f11c1 + w2*f11c0 + w0*f11c2 ); + + // commit to memory... + #if 0 // compiler cannot infer/speculate write[ijk+write_jStride] is disjoint from write[ijk], and thus cannot vectorize... 
+ write[write_ijk ] = prescale_f*write[write_ijk ] + f000; + write[write_ijk+1 ] = prescale_f*write[write_ijk+1 ] + f100; + write[write_ijk +write_jStride ] = prescale_f*write[write_ijk +write_jStride ] + f010; + write[write_ijk+1+write_jStride ] = prescale_f*write[write_ijk+1+write_jStride ] + f110; + write[write_ijk +write_kStride] = prescale_f*write[write_ijk +write_kStride] + f001; + write[write_ijk+1 +write_kStride] = prescale_f*write[write_ijk+1 +write_kStride] + f101; + write[write_ijk +write_jStride+write_kStride] = prescale_f*write[write_ijk +write_jStride+write_kStride] + f011; + write[write_ijk+1+write_jStride+write_kStride] = prescale_f*write[write_ijk+1+write_jStride+write_kStride] + f111; + #else // use a unique restrict pointer for each pencil... + write00[i ] = prescale_f*write00[i ] + f000; + write00[i+1] = prescale_f*write00[i+1] + f100; + write10[i ] = prescale_f*write10[i ] + f010; + write10[i+1] = prescale_f*write10[i+1] + f110; + write01[i ] = prescale_f*write01[i ] + f001; + write01[i+1] = prescale_f*write01[i+1] + f101; + write11[i ] = prescale_f*write11[i ] + f011; + write11[i+1] = prescale_f*write11[i+1] + f111; + #endif + + }}} + #endif + +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// perform a (inter-level) piecewise quadratic interpolation +void interpolation_p2(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){ + exchange_boundary(level_c,id_c,STENCIL_SHAPE_BOX); + apply_BCs_p2(level_c,id_c,STENCIL_SHAPE_BOX); + + double _timeCommunicationStart = getTime(); + double _timeStart,_timeEnd; + int buffer=0; + int n; + int my_tag = (level_f->tag<<4) | 0x7; + + + #ifdef USE_MPI + // by convention, level_f allocates a combined array of requests for both level_f recvs and level_c sends... 
+ int nMessages = level_c->interpolation.num_sends + level_f->interpolation.num_recvs; + MPI_Request *recv_requests = level_f->interpolation.requests; + MPI_Request *send_requests = level_f->interpolation.requests + level_f->interpolation.num_recvs; + + + // loop through packed list of MPI receives and prepost Irecv's... + if(level_f->interpolation.num_recvs>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;ninterpolation.num_recvs;n++){ + MPI_Irecv(level_f->interpolation.recv_buffers[n], + level_f->interpolation.recv_sizes[n], + MPI_DOUBLE, + level_f->interpolation.recv_ranks[n], + my_tag, + MPI_COMM_WORLD, + &recv_requests[n] + ); + } + _timeEnd = getTime(); + level_f->timers.interpolation_recv += (_timeEnd-_timeStart); + } + + + // pack MPI send buffers... + if(level_c->interpolation.num_blocks[0]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[0]) + for(buffer=0;bufferinterpolation.num_blocks[0];buffer++){ + // !!! prescale==0 because you don't want to increment the MPI buffer + interpolation_p2_block(level_f,id_f,0.0,level_c,id_c,&level_c->interpolation.blocks[0][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_pack += (_timeEnd-_timeStart); + } + + + // loop through MPI send buffers and post Isend's... + if(level_c->interpolation.num_sends>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;ninterpolation.num_sends;n++){ + MPI_Isend(level_c->interpolation.send_buffers[n], + level_c->interpolation.send_sizes[n], + MPI_DOUBLE, + level_c->interpolation.send_ranks[n], + my_tag, + MPI_COMM_WORLD, + &send_requests[n] + ); + } + _timeEnd = getTime(); + level_f->timers.interpolation_send += (_timeEnd-_timeStart); + } + #endif + + + // perform local interpolation... try and hide within Isend latency... 
+ if(level_c->interpolation.num_blocks[1]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[1]) + for(buffer=0;bufferinterpolation.num_blocks[1];buffer++){ + interpolation_p2_block(level_f,id_f,prescale_f,level_c,id_c,&level_c->interpolation.blocks[1][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_local += (_timeEnd-_timeStart); + } + + + // wait for MPI to finish... + #ifdef USE_MPI + if(nMessages>0){ + _timeStart = getTime(); + MPI_Waitall(nMessages,level_f->interpolation.requests,level_f->interpolation.status); + _timeEnd = getTime(); + level_f->timers.interpolation_wait += (_timeEnd-_timeStart); + } + + + // unpack MPI receive buffers + if(level_f->interpolation.num_blocks[2]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->interpolation.num_blocks[2]) + for(buffer=0;bufferinterpolation.num_blocks[2];buffer++){ + IncrementBlock(level_f,id_f,prescale_f,&level_f->interpolation.blocks[2][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_unpack += (_timeEnd-_timeStart); + } + #endif + + + level_f->timers.interpolation_total += (double)(getTime()-_timeCommunicationStart); +} diff --git a/Util/hpgmg/finite-volume/source/operators/interpolation_v2.c b/Util/hpgmg/finite-volume/source/operators/interpolation_v2.c new file mode 100644 index 00000000..9052d9c2 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/interpolation_v2.c @@ -0,0 +1,320 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +//------------------------------------------------------------------------------------------------------------------------------ +static 
inline void interpolation_v2_block(level_type *level_f, int id_f, double prescale_f, level_type *level_c, int id_c, blockCopy_type *block){ + // interpolate 3D array from read_i,j,k of read[] to write_i,j,k in write[] using volume averaged quadratic prolongation + int write_dim_i = block->dim.i<<1; // calculate the dimensions of the resultant fine block + int write_dim_j = block->dim.j<<1; + int write_dim_k = block->dim.k<<1; + + int read_i = block->read.i; + int read_j = block->read.j; + int read_k = block->read.k; + int read_jStride = block->read.jStride; + int read_kStride = block->read.kStride; + + int write_i = block->write.i; + int write_j = block->write.j; + int write_k = block->write.k; + int write_jStride = block->write.jStride; + int write_kStride = block->write.kStride; + + const double * __restrict__ read = block->read.ptr; + double * __restrict__ write = block->write.ptr; + + if(block->read.box >=0){ + read_jStride = level_c->my_boxes[block->read.box ].jStride; + read_kStride = level_c->my_boxes[block->read.box ].kStride; + read = level_c->my_boxes[ block->read.box].vectors[id_c] + level_c->box_ghosts*(1+ read_jStride+ read_kStride); + } + if(block->write.box>=0){ + write_jStride = level_f->my_boxes[block->write.box].jStride; + write_kStride = level_f->my_boxes[block->write.box].kStride; + write = level_f->my_boxes[block->write.box].vectors[id_f] + level_f->box_ghosts*(1+write_jStride+write_kStride); + } + + + #ifdef USE_NAIVE_INTERP + // naive 27pt per fine grid cell + int i,j,k; + double c1 = 1.0/8.0; + for(k=0;k>1)+ read_i) + (((j>>1)+ read_j)* read_jStride) + (((k>>1)+ read_k)* read_kStride); + // + // | 1/8 | 1.0 | -1/8 | coarse grid + // |---+---|---+---|---+---| + // | | |???| | | | fine grid + // + write[write_ijk] = prescale_f*write[write_ijk] + + + c1k*( + c1j*( c1i*read[read_ijk-1-read_jStride-read_kStride] + read[read_ijk-read_jStride-read_kStride] - c1i*read[read_ijk+1-read_jStride-read_kStride] ) + + ( c1i*read[read_ijk-1 -read_kStride] + 
read[read_ijk -read_kStride] - c1i*read[read_ijk+1 -read_kStride] ) + - c1j*( c1i*read[read_ijk-1+read_jStride-read_kStride] + read[read_ijk+read_jStride-read_kStride] - c1i*read[read_ijk+1+read_jStride-read_kStride] ) ) + + ( + c1j*( c1i*read[read_ijk-1-read_jStride ] + read[read_ijk-read_jStride ] - c1i*read[read_ijk+1-read_jStride ] ) + + ( c1i*read[read_ijk-1 ] + read[read_ijk ] - c1i*read[read_ijk+1 ] ) + - c1j*( c1i*read[read_ijk-1+read_jStride ] + read[read_ijk+read_jStride ] - c1i*read[read_ijk+1+read_jStride ] ) ) + - c1k*( + c1j*( c1i*read[read_ijk-1-read_jStride+read_kStride] + read[read_ijk-read_jStride+read_kStride] - c1i*read[read_ijk+1-read_jStride+read_kStride] ) + + ( c1i*read[read_ijk-1 +read_kStride] + read[read_ijk +read_kStride] - c1i*read[read_ijk+1 +read_kStride] ) + - c1j*( c1i*read[read_ijk-1+read_jStride+read_kStride] + read[read_ijk+read_jStride+read_kStride] - c1i*read[read_ijk+1+read_jStride+read_kStride] ) ); + }}} + #else + int i,j,k; + int ii,jj,kk; + double c1 = 1.0/8.0; + for(k=0,kk=0;k : | f | f | : + // | | | | : | | | : + // +-------+-------+-------+ :.......+---+---+.......: + // | | | | : | | | : + // | c | c | c | : | f | f | : + // | | | | : | | | : + // +-------+-------+-------+ :.......+---+---+.......: + // + const double f0c00 = ( c100 + c1*(c000-c200) ); // same as original 3pt stencil... 
f0c00 = ( c1*c000 + c100 - c1*c200 ); + const double f1c00 = ( c100 - c1*(c000-c200) ); + const double f0c10 = ( c110 + c1*(c010-c210) ); + const double f1c10 = ( c110 - c1*(c010-c210) ); + const double f0c20 = ( c120 + c1*(c020-c220) ); + const double f1c20 = ( c120 - c1*(c020-c220) ); + + const double f0c01 = ( c101 + c1*(c001-c201) ); + const double f1c01 = ( c101 - c1*(c001-c201) ); + const double f0c11 = ( c111 + c1*(c011-c211) ); + const double f1c11 = ( c111 - c1*(c011-c211) ); + const double f0c21 = ( c121 + c1*(c021-c221) ); + const double f1c21 = ( c121 - c1*(c021-c221) ); + + const double f0c02 = ( c102 + c1*(c002-c202) ); + const double f1c02 = ( c102 - c1*(c002-c202) ); + const double f0c12 = ( c112 + c1*(c012-c212) ); + const double f1c12 = ( c112 - c1*(c012-c212) ); + const double f0c22 = ( c122 + c1*(c022-c222) ); + const double f1c22 = ( c122 - c1*(c022-c222) ); + + // interpolate in j to create fine ij / coarse k points... + // + // :.......+---+---+.......: :.......:.......:.......: + // : | | | : : : : : + // : | | | : : : : : + // : | | | : : : : : + // :.......+---+---+.......: :.......+---+---+.......: + // : | | | : : | | | : + // : | | | : -> : +---+---+ : + // : | | | : : | | | : + // :.......+---+---+.......: :.......+---+---+.......: + // : | | | : : : : : + // : | | | : : : : : + // : | | | : : : : : + // :.......+---+---+.......: :.......:.......:.......: + // + const double f00c0 = ( f0c10 + c1*(f0c00-f0c20) ); + const double f10c0 = ( f1c10 + c1*(f1c00-f1c20) ); + const double f01c0 = ( f0c10 - c1*(f0c00-f0c20) ); + const double f11c0 = ( f1c10 - c1*(f1c00-f1c20) ); + + const double f00c1 = ( f0c11 + c1*(f0c01-f0c21) ); + const double f10c1 = ( f1c11 + c1*(f1c01-f1c21) ); + const double f01c1 = ( f0c11 - c1*(f0c01-f0c21) ); + const double f11c1 = ( f1c11 - c1*(f1c01-f1c21) ); + + const double f00c2 = ( f0c12 + c1*(f0c02-f0c22) ); + const double f10c2 = ( f1c12 + c1*(f1c02-f1c22) ); + const double f01c2 = ( f0c12 - c1*(f0c02-f0c22) ); 
+ const double f11c2 = ( f1c12 - c1*(f1c02-f1c22) ); + + // interpolate in k to create fine ijk points... + const double f000 = ( f00c1 + c1*(f00c0-f00c2) ); + const double f100 = ( f10c1 + c1*(f10c0-f10c2) ); + const double f010 = ( f01c1 + c1*(f01c0-f01c2) ); + const double f110 = ( f11c1 + c1*(f11c0-f11c2) ); + const double f001 = ( f00c1 - c1*(f00c0-f00c2) ); + const double f101 = ( f10c1 - c1*(f10c0-f10c2) ); + const double f011 = ( f01c1 - c1*(f01c0-f01c2) ); + const double f111 = ( f11c1 - c1*(f11c0-f11c2) ); + + // commit to memory... + #if 0 // compiler cannot infer/speculate write[ijk+write_jStride] is disjoint from write[ijk], and thus cannot vectorize... + write[write_ijk ] = prescale_f*write[write_ijk ] + f000; + write[write_ijk+1 ] = prescale_f*write[write_ijk+1 ] + f100; + write[write_ijk +write_jStride ] = prescale_f*write[write_ijk +write_jStride ] + f010; + write[write_ijk+1+write_jStride ] = prescale_f*write[write_ijk+1+write_jStride ] + f110; + write[write_ijk +write_kStride] = prescale_f*write[write_ijk +write_kStride] + f001; + write[write_ijk+1 +write_kStride] = prescale_f*write[write_ijk+1 +write_kStride] + f101; + write[write_ijk +write_jStride+write_kStride] = prescale_f*write[write_ijk +write_jStride+write_kStride] + f011; + write[write_ijk+1+write_jStride+write_kStride] = prescale_f*write[write_ijk+1+write_jStride+write_kStride] + f111; + #else // use a unique restrict pointer for each pencil... 
+ write00[i ] = prescale_f*write00[i ] + f000; + write00[i+1] = prescale_f*write00[i+1] + f100; + write10[i ] = prescale_f*write10[i ] + f010; + write10[i+1] = prescale_f*write10[i+1] + f110; + write01[i ] = prescale_f*write01[i ] + f001; + write01[i+1] = prescale_f*write01[i+1] + f101; + write11[i ] = prescale_f*write11[i ] + f011; + write11[i+1] = prescale_f*write11[i+1] + f111; + #endif + + }}} + #endif + +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// perform a (inter-level) volumetric quadratic interpolation on vector id_c of the coarse level and increments prescale_f*vector id_f on the fine level by the result +// i.e. id_f = prescale_f*id_f + P*id_c +// prescale_f is nominally 1.0 or 0.0 +// quadratic interpolation requires a full ghost zone exchange and boundary condition +// This is a rather bulk synchronous implementation which packs all MPI buffers before initiating any sends +// Similarly, it waits for all remote data before copying any into local boxes. +// It does however attempt to overlap local interpolation with MPI +void interpolation_v2(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){ + exchange_boundary(level_c,id_c,STENCIL_SHAPE_BOX); + apply_BCs_v2(level_c,id_c,STENCIL_SHAPE_BOX); + + double _timeCommunicationStart = getTime(); + double _timeStart,_timeEnd; + int buffer=0; + int n; + int my_tag = (level_f->tag<<4) | 0x7; + + + #ifdef USE_MPI + // by convention, level_f allocates a combined array of requests for both level_f recvs and level_c sends... + int nMessages = level_c->interpolation.num_sends + level_f->interpolation.num_recvs; + MPI_Request *recv_requests = level_f->interpolation.requests; + MPI_Request *send_requests = level_f->interpolation.requests + level_f->interpolation.num_recvs; + + + // loop through packed list of MPI receives and prepost Irecv's... 
+ if(level_f->interpolation.num_recvs>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;ninterpolation.num_recvs;n++){ + MPI_Irecv(level_f->interpolation.recv_buffers[n], + level_f->interpolation.recv_sizes[n], + MPI_DOUBLE, + level_f->interpolation.recv_ranks[n], + my_tag, + MPI_COMM_WORLD, + &recv_requests[n] + ); + } + _timeEnd = getTime(); + level_f->timers.interpolation_recv += (_timeEnd-_timeStart); + } + + + // pack MPI send buffers... + if(level_c->interpolation.num_blocks[0]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[0]) + for(buffer=0;bufferinterpolation.num_blocks[0];buffer++){ + // !!! prescale==0 because you don't want to increment the MPI buffer + interpolation_v2_block(level_f,id_f,0.0,level_c,id_c,&level_c->interpolation.blocks[0][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_pack += (_timeEnd-_timeStart); + } + + + // loop through MPI send buffers and post Isend's... + if(level_c->interpolation.num_sends>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;ninterpolation.num_sends;n++){ + MPI_Isend(level_c->interpolation.send_buffers[n], + level_c->interpolation.send_sizes[n], + MPI_DOUBLE, + level_c->interpolation.send_ranks[n], + my_tag, + MPI_COMM_WORLD, + &send_requests[n] + ); + } + _timeEnd = getTime(); + level_f->timers.interpolation_send += (_timeEnd-_timeStart); + } + #endif + + + // perform local interpolation... try and hide within Isend latency... 
+ if(level_c->interpolation.num_blocks[1]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[1]) + for(buffer=0;bufferinterpolation.num_blocks[1];buffer++){ + interpolation_v2_block(level_f,id_f,prescale_f,level_c,id_c,&level_c->interpolation.blocks[1][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_local += (_timeEnd-_timeStart); + } + + + // wait for MPI to finish... + #ifdef USE_MPI + if(nMessages>0){ + _timeStart = getTime(); + MPI_Waitall(nMessages,level_f->interpolation.requests,level_f->interpolation.status); + _timeEnd = getTime(); + level_f->timers.interpolation_wait += (_timeEnd-_timeStart); + } + + + // unpack MPI receive buffers + if(level_f->interpolation.num_blocks[2]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->interpolation.num_blocks[2]) + for(buffer=0;bufferinterpolation.num_blocks[2];buffer++){ + IncrementBlock(level_f,id_f,prescale_f,&level_f->interpolation.blocks[2][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_unpack += (_timeEnd-_timeStart); + } + #endif + + + level_f->timers.interpolation_total += (double)(getTime()-_timeCommunicationStart); +} diff --git a/Util/hpgmg/finite-volume/source/operators/interpolation_v4.c b/Util/hpgmg/finite-volume/source/operators/interpolation_v4.c new file mode 100644 index 00000000..8a0d6d89 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/interpolation_v4.c @@ -0,0 +1,386 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +//------------------------------------------------------------------------------------------------------------------------------ +static 
inline void interpolation_v4_block(level_type *level_f, int id_f, double prescale_f, level_type *level_c, int id_c, blockCopy_type *block){ + // interpolate 3D array from read_i,j,k of read[] to write_i,j,k in write[] using volume averaged quartic prolongation + int write_dim_i = block->dim.i<<1; // calculate the dimensions of the resultant fine block + int write_dim_j = block->dim.j<<1; + int write_dim_k = block->dim.k<<1; + + int read_i = block->read.i; + int read_j = block->read.j; + int read_k = block->read.k; + int read_jStride = block->read.jStride; + int read_kStride = block->read.kStride; + + int write_i = block->write.i; + int write_j = block->write.j; + int write_k = block->write.k; + int write_jStride = block->write.jStride; + int write_kStride = block->write.kStride; + + const double * __restrict__ read = block->read.ptr; + double * __restrict__ write = block->write.ptr; + + if(block->read.box >=0){ + read_jStride = level_c->my_boxes[block->read.box ].jStride; + read_kStride = level_c->my_boxes[block->read.box ].kStride; + read = level_c->my_boxes[ block->read.box].vectors[id_c] + level_c->box_ghosts*(1+ read_jStride+ read_kStride); + } + if(block->write.box>=0){ + write_jStride = level_f->my_boxes[block->write.box].jStride; + write_kStride = level_f->my_boxes[block->write.box].kStride; + write = level_f->my_boxes[block->write.box].vectors[id_f] + level_f->box_ghosts*(1+write_jStride+write_kStride); + } + + + #ifdef USE_NAIVE_INTERP + // naive 125pt per fine grid cell + int i,j,k; + double c2 = -3.0/128.0; + double c1 = 22.0/128.0; + int dj = read_jStride; + int dk = read_kStride; + int dj2 = 2*read_jStride; + int dk2 = 2*read_kStride; + for(k=0;k>1)+ read_i) + (((j>>1)+ read_j)* read_jStride) + (((k>>1)+ read_k)* read_kStride); + // + // | -3/128 | +22/128 | 1.0 | -22/128 | +3/128 | coarse grid + // |-----+-----|-----+-----|-----+-----|-----+-----|-----+-----| + // | | | | |?????| | | | | | fine grid + // + write[write_ijk] = 
prescale_f*write[write_ijk] + + + sk2*( + sj2*( si2*read[read_ijk-2-dj2-dk2] + si1*read[read_ijk-1-dj2-dk2] + read[read_ijk-dj2-dk2] - si1*read[read_ijk+1-dj2-dk2] - si2*read[read_ijk+2-dj2-dk2] ) + + sj1*( si2*read[read_ijk-2-dj -dk2] + si1*read[read_ijk-1-dj -dk2] + read[read_ijk-dj -dk2] - si1*read[read_ijk+1-dj -dk2] - si2*read[read_ijk+2-dj -dk2] ) + + ( si2*read[read_ijk-2 -dk2] + si1*read[read_ijk-1 -dk2] + read[read_ijk -dk2] - si1*read[read_ijk+1 -dk2] - si2*read[read_ijk+2 -dk2] ) + - sj1*( si2*read[read_ijk-2+dj -dk2] + si1*read[read_ijk-1+dj -dk2] + read[read_ijk+dj -dk2] - si1*read[read_ijk+1+dj -dk2] - si2*read[read_ijk+2+dj -dk2] ) + - sj2*( si2*read[read_ijk-2+dj2-dk2] + si1*read[read_ijk-1+dj2-dk2] + read[read_ijk+dj2-dk2] - si1*read[read_ijk+1+dj2-dk2] - si2*read[read_ijk+2+dj2-dk2] ) ) + + sk1*( + sj2*( si2*read[read_ijk-2-dj2-dk ] + si1*read[read_ijk-1-dj2-dk ] + read[read_ijk-dj2-dk ] - si1*read[read_ijk+1-dj2-dk ] - si2*read[read_ijk+2-dj2-dk ] ) + + sj1*( si2*read[read_ijk-2-dj -dk ] + si1*read[read_ijk-1-dj -dk ] + read[read_ijk-dj -dk ] - si1*read[read_ijk+1-dj -dk ] - si2*read[read_ijk+2-dj -dk ] ) + + ( si2*read[read_ijk-2 -dk ] + si1*read[read_ijk-1 -dk ] + read[read_ijk -dk ] - si1*read[read_ijk+1 -dk ] - si2*read[read_ijk+2 -dk ] ) + - sj1*( si2*read[read_ijk-2+dj -dk ] + si1*read[read_ijk-1+dj -dk ] + read[read_ijk+dj -dk ] - si1*read[read_ijk+1+dj -dk ] - si2*read[read_ijk+2+dj -dk ] ) + - sj2*( si2*read[read_ijk-2+dj2-dk ] + si1*read[read_ijk-1+dj2-dk ] + read[read_ijk+dj2-dk ] - si1*read[read_ijk+1+dj2-dk ] - si2*read[read_ijk+2+dj2-dk ] ) ) + + ( + sj2*( si2*read[read_ijk-2-dj2 ] + si1*read[read_ijk-1-dj2 ] + read[read_ijk-dj2 ] - si1*read[read_ijk+1-dj2 ] - si2*read[read_ijk+2-dj2 ] ) + + sj1*( si2*read[read_ijk-2-dj ] + si1*read[read_ijk-1-dj ] + read[read_ijk-dj ] - si1*read[read_ijk+1-dj ] - si2*read[read_ijk+2-dj ] ) + + ( si2*read[read_ijk-2 ] + si1*read[read_ijk-1 ] + read[read_ijk ] - si1*read[read_ijk+1 ] - 
si2*read[read_ijk+2 ] ) + - sj1*( si2*read[read_ijk-2+dj ] + si1*read[read_ijk-1+dj ] + read[read_ijk+dj ] - si1*read[read_ijk+1+dj ] - si2*read[read_ijk+2+dj ] ) + - sj2*( si2*read[read_ijk-2+dj2 ] + si1*read[read_ijk-1+dj2 ] + read[read_ijk+dj2 ] - si1*read[read_ijk+1+dj2 ] - si2*read[read_ijk+2+dj2 ] ) ) + - sk1*( + sj2*( si2*read[read_ijk-2-dj2+dk ] + si1*read[read_ijk-1-dj2+dk ] + read[read_ijk-dj2+dk ] - si1*read[read_ijk+1-dj2+dk ] - si2*read[read_ijk+2-dj2+dk ] ) + + sj1*( si2*read[read_ijk-2-dj +dk ] + si1*read[read_ijk-1-dj +dk ] + read[read_ijk-dj +dk ] - si1*read[read_ijk+1-dj +dk ] - si2*read[read_ijk+2-dj +dk ] ) + + ( si2*read[read_ijk-2 +dk ] + si1*read[read_ijk-1 +dk ] + read[read_ijk +dk ] - si1*read[read_ijk+1 +dk ] - si2*read[read_ijk+2 +dk ] ) + - sj1*( si2*read[read_ijk-2+dj +dk ] + si1*read[read_ijk-1+dj +dk ] + read[read_ijk+dj +dk ] - si1*read[read_ijk+1+dj +dk ] - si2*read[read_ijk+2+dj +dk ] ) + - sj2*( si2*read[read_ijk-2+dj2+dk ] + si1*read[read_ijk-1+dj2+dk ] + read[read_ijk+dj2+dk ] - si1*read[read_ijk+1+dj2+dk ] - si2*read[read_ijk+2+dj2+dk ] ) ) + - sk2*( + sj2*( si2*read[read_ijk-2-dj2+dk2] + si1*read[read_ijk-1-dj2+dk2] + read[read_ijk-dj2+dk2] - si1*read[read_ijk+1-dj2+dk2] - si2*read[read_ijk+2-dj2+dk2] ) + + sj1*( si2*read[read_ijk-2-dj +dk2] + si1*read[read_ijk-1-dj +dk2] + read[read_ijk-dj +dk2] - si1*read[read_ijk+1-dj +dk2] - si2*read[read_ijk+2-dj +dk2] ) + + ( si2*read[read_ijk-2 +dk2] + si1*read[read_ijk-1 +dk2] + read[read_ijk +dk2] - si1*read[read_ijk+1 +dk2] - si2*read[read_ijk+2 +dk2] ) + - sj1*( si2*read[read_ijk-2+dj +dk2] + si1*read[read_ijk-1+dj +dk2] + read[read_ijk+dj +dk2] - si1*read[read_ijk+1+dj +dk2] - si2*read[read_ijk+2+dj +dk2] ) + - sj2*( si2*read[read_ijk-2+dj2+dk2] + si1*read[read_ijk-1+dj2+dk2] + read[read_ijk+dj2+dk2] - si1*read[read_ijk+1+dj2+dk2] - si2*read[read_ijk+2+dj2+dk2] ) ); + }}} + #else + // exploit tensor product symmetry and perform 8 fine grid interpolations at a time... 
+ // 50 x 5pt for i + // 20 x 5pt for j + // 8 x 5pt for k + // ---------------- + // 78 x 5pt for 8 cells (vs 8x125pt = 200x5pt in naive) + int i,j,k; + int ii,jj,kk; + double c2 = -3.0/128.0; + double c1 = 22.0/128.0; + int dj = read_jStride; + int dk = read_kStride; + int dj2 = 2*read_jStride; + int dk2 = 2*read_kStride; + for(k=0,kk=0;ktag<<4) | 0x7; + + + #ifdef USE_MPI + // by convention, level_f allocates a combined array of requests for both level_f recvs and level_c sends... + int nMessages = level_c->interpolation.num_sends + level_f->interpolation.num_recvs; + MPI_Request *recv_requests = level_f->interpolation.requests; + MPI_Request *send_requests = level_f->interpolation.requests + level_f->interpolation.num_recvs; + + + // loop through packed list of MPI receives and prepost Irecv's... + if(level_f->interpolation.num_recvs>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;ninterpolation.num_recvs;n++){ + MPI_Irecv(level_f->interpolation.recv_buffers[n], + level_f->interpolation.recv_sizes[n], + MPI_DOUBLE, + level_f->interpolation.recv_ranks[n], + my_tag, + MPI_COMM_WORLD, + &recv_requests[n] + ); + } + _timeEnd = getTime(); + level_f->timers.interpolation_recv += (_timeEnd-_timeStart); + } + + + // pack MPI send buffers... + if(level_c->interpolation.num_blocks[0]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[0]) + for(buffer=0;bufferinterpolation.num_blocks[0];buffer++){ + // !!! prescale==0 because you don't want to increment the MPI buffer + interpolation_v4_block(level_f,id_f,0.0,level_c,id_c,&level_c->interpolation.blocks[0][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_pack += (_timeEnd-_timeStart); + } + + + // loop through MPI send buffers and post Isend's... 
+ if(level_c->interpolation.num_sends>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;ninterpolation.num_sends;n++){ + MPI_Isend(level_c->interpolation.send_buffers[n], + level_c->interpolation.send_sizes[n], + MPI_DOUBLE, + level_c->interpolation.send_ranks[n], + my_tag, + MPI_COMM_WORLD, + &send_requests[n] + ); + } + _timeEnd = getTime(); + level_f->timers.interpolation_send += (_timeEnd-_timeStart); + } + #endif + + + // perform local interpolation... try and hide within Isend latency... + if(level_c->interpolation.num_blocks[1]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[1]) + for(buffer=0;bufferinterpolation.num_blocks[1];buffer++){ + interpolation_v4_block(level_f,id_f,prescale_f,level_c,id_c,&level_c->interpolation.blocks[1][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_local += (_timeEnd-_timeStart); + } + + + // wait for MPI to finish... 
+ #ifdef USE_MPI + if(nMessages>0){ + _timeStart = getTime(); + MPI_Waitall(nMessages,level_f->interpolation.requests,level_f->interpolation.status); + _timeEnd = getTime(); + level_f->timers.interpolation_wait += (_timeEnd-_timeStart); + } + + + // unpack MPI receive buffers + if(level_f->interpolation.num_blocks[2]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->interpolation.num_blocks[2]) + for(buffer=0;bufferinterpolation.num_blocks[2];buffer++){ + IncrementBlock(level_f,id_f,prescale_f,&level_f->interpolation.blocks[2][buffer]); + } + _timeEnd = getTime(); + level_f->timers.interpolation_unpack += (_timeEnd-_timeStart); + } + #endif + + + level_f->timers.interpolation_total += (double)(getTime()-_timeCommunicationStart); +} diff --git a/Util/hpgmg/finite-volume/source/operators/jacobi.c b/Util/hpgmg/finite-volume/source/operators/jacobi.c new file mode 100644 index 00000000..30efce4a --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/jacobi.c @@ -0,0 +1,73 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +//------------------------------------------------------------------------------------------------------------------------------ +void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ + if(NUM_SMOOTHS&1){ + fprintf(stderr,"error - NUM_SMOOTHS must be even...\n"); + exit(0); + } + + #ifdef USE_L1JACOBI + double weight = 1.0; + #else + double weight = 2.0/3.0; + #endif + + int block,s; + for(s=0;snum_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = 
level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int ghosts = level->box_ghosts; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const double h2inv = 1.0/(level->h*level->h); + const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + #ifdef USE_L1JACOBI + const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); + #else + const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); + #endif + const double * __restrict__ x_n; + double * __restrict__ x_np1; + if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} + else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); + x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} + + for(k=klo;ktimers.smooth += (double)(getTime()-_timeStart); + } // s-loop +} + +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators/misc.c b/Util/hpgmg/finite-volume/source/operators/misc.c 
new file mode 100644 index 00000000..a90b12a7 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/misc.c @@ -0,0 +1,508 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +void zero_vector(level_type * level, int id_a){ + // zero's the entire grid INCLUDING ghost zones... + double _timeStart = getTime(); + int block; + + PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + int ilo = level->my_blocks[block].read.i; + int jlo = level->my_blocks[block].read.j; + int klo = level->my_blocks[block].read.k; + int ihi = level->my_blocks[block].dim.i + ilo; + int jhi = level->my_blocks[block].dim.j + jlo; + int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + + // expand the size of the block to include the ghost zones... + if(ilo<= 0)ilo-=ghosts; + if(jlo<= 0)jlo-=ghosts; + if(klo<= 0)klo-=ghosts; + if(ihi>=dim)ihi+=ghosts; + if(jhi>=dim)jhi+=ghosts; + if(khi>=dim)khi+=ghosts; + + double * __restrict__ grid = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); + + for(k=klo;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void init_vector(level_type * level, int id_a, double scalar){ + // initializes the grid to a scalar while zero'ing the ghost zones... 
+ double _timeStart = getTime(); + int block; + + PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + int ilo = level->my_blocks[block].read.i; + int jlo = level->my_blocks[block].read.j; + int klo = level->my_blocks[block].read.k; + int ihi = level->my_blocks[block].dim.i + ilo; + int jhi = level->my_blocks[block].dim.j + jlo; + int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + + // expand the size of the block to include the ghost zones... + if(ilo<= 0)ilo-=ghosts; + if(jlo<= 0)jlo-=ghosts; + if(klo<= 0)klo-=ghosts; + if(ihi>=dim)ihi+=ghosts; + if(jhi>=dim)jhi+=ghosts; + if(khi>=dim)khi+=ghosts; + + double * __restrict__ grid = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); + + for(k=klo;k=dim) || (j>=dim) || (k>=dim); + grid[ijk] = ghostZone ? 0.0 : scalar; + }}} + } + level->timers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// add vectors id_a (scaled by scale_a) and id_b (scaled by scale_b) and store the result in vector id_c +// i.e. 
c[] = scale_a*a[] + scale_b*b[] +// note, only non ghost zone values are included in this calculation +void add_vectors(level_type * level, int id_c, double scale_a, int id_a, double scale_b, int id_b){ + double _timeStart = getTime(); + + int block; + + PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + ghosts*(1+jStride+kStride); + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); + double * __restrict__ grid_b = level->my_boxes[box].vectors[id_b] + ghosts*(1+jStride+kStride); + + for(k=klo;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// multiply each element of vector id_a by vector id_b and scale, and place the result in vector id_c +// i.e. 
c[]=scale*a[]*b[] +// note, only non ghost zone values are included in this calculation +void mul_vectors(level_type * level, int id_c, double scale, int id_a, int id_b){ + double _timeStart = getTime(); + + int block; + + PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + ghosts*(1+jStride+kStride); + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); + double * __restrict__ grid_b = level->my_boxes[box].vectors[id_b] + ghosts*(1+jStride+kStride); + + for(k=klo;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// invert each element of vector id_a, scale by scale_a, and place the result in vector id_c +// i.e. 
c[]=scale_a/a[] +// note, only non ghost zone values are included in this calculation +void invert_vector(level_type * level, int id_c, double scale_a, int id_a){ + double _timeStart = getTime(); + + int block; + + PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + ghosts*(1+jStride+kStride); + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); + + for(k=klo;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// scale vector id_a by scale_a and place the result in vector id_c +// i.e. 
c[]=scale_a*a[] +// note, only non ghost zone values are included in this calculation +void scale_vector(level_type * level, int id_c, double scale_a, int id_a){ + double _timeStart = getTime(); + + int block; + + PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + ghosts*(1+jStride+kStride); + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); + + for(k=klo;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// return the dot product of vectors id_a and id_b +// note, only non ghost zone values are included in this calculation +double dot(level_type * level, int id_a, int id_b){ + double _timeStart = getTime(); + + + int block; + double a_dot_b_level = 0.0; + + PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,block,level->num_my_blocks,a_dot_b_level) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + 
int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + double * __restrict__ grid_b = level->my_boxes[box].vectors[id_b] + ghosts*(1+jStride+kStride); + double a_dot_b_block = 0.0; + + for(k=klo;ktimers.blas1 += (double)(getTime()-_timeStart); + + #ifdef USE_MPI + double _timeStartAllReduce = getTime(); + double send = a_dot_b_level; + MPI_Allreduce(&send,&a_dot_b_level,1,MPI_DOUBLE,MPI_SUM,level->MPI_COMM_ALLREDUCE); + double _timeEndAllReduce = getTime(); + level->timers.collectives += (double)(_timeEndAllReduce-_timeStartAllReduce); + #endif + + return(a_dot_b_level); +} + +//------------------------------------------------------------------------------------------------------------------------------ +// return the max (infinity) norm of the vector id_a. +// note, only non ghost zone values are included in this calculation +double norm(level_type * level, int id_a){ // implements the max norm + double _timeStart = getTime(); + + int block; + double max_norm = 0.0; + + PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,block,level->num_my_blocks,max_norm) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + double * __restrict__ grid = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point + double block_norm = 0.0; + + for(k=klo;kblock_norm){block_norm=fabs_grid_ijk;} // max norm + }}} + + if(block_norm>max_norm){max_norm = block_norm;} + } // block list + level->timers.blas1 += (double)(getTime()-_timeStart); + + #ifdef USE_MPI + double _timeStartAllReduce = getTime(); + double send = max_norm; + MPI_Allreduce(&send,&max_norm,1,MPI_DOUBLE,MPI_MAX,level->MPI_COMM_ALLREDUCE); + double _timeEndAllReduce = getTime(); + level->timers.collectives += (double)(_timeEndAllReduce-_timeStartAllReduce); + #endif + return(max_norm); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// return the mean (arithmetic average value) of vector id_a +// essentially, this is a l1 norm by a scaling by the inverse of the total (global) number of cells +// note, only non ghost zone values are included in this calculation +double mean(level_type * level, int id_a){ + double _timeStart = getTime(); + + + int block; + double sum_level = 0.0; + + PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,block,level->num_my_blocks,sum_level) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point + double sum_block = 0.0; + + for(k=klo;ktimers.blas1 += (double)(getTime()-_timeStart); + double ncells_level = (double)level->dim.i*(double)level->dim.j*(double)level->dim.k; + + #ifdef USE_MPI + double _timeStartAllReduce = getTime(); + double send = sum_level; + MPI_Allreduce(&send,&sum_level,1,MPI_DOUBLE,MPI_SUM,level->MPI_COMM_ALLREDUCE); + double _timeEndAllReduce = getTime(); + level->timers.collectives += (double)(_timeEndAllReduce-_timeStartAllReduce); + #endif + + double mean_level = sum_level / ncells_level; + return(mean_level); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// add the scalar value shift_a to each element of vector id_a and store the result in vector id_c +// note, only non ghost zone values are included in this calculation +void shift_vector(level_type * level, int id_c, int id_a, double shift_a){ + double _timeStart = getTime(); + int block; + + PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point + + + for(k=klo;ktimers.blas1 += (double)(getTime()-_timeStart); +} + +//------------------------------------------------------------------------------------------------------------------------------ +// calculate the error between two vectors (id_a and id_b) using either the max (infinity) norm or the L2 norm +// note, only non ghost zone values are included in this calculation +double error(level_type * level, int id_a, int id_b){ + double h3 = level->h * level->h * level->h; + add_vectors(level,VECTOR_TEMP,1.0,id_a,-1.0,id_b); // VECTOR_TEMP = id_a - id_b + double max = norm(level,VECTOR_TEMP); return(max); // max norm of error function + double L2 = sqrt( dot(level,VECTOR_TEMP,VECTOR_TEMP)*h3);return( L2); // normalized L2 error ? +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// Color the vector id_a with 1's and 0's +// The pattern is dictated by the number of colors in each dimension and the 'active' color (i,j,kcolor) +// note, only non ghost zone values are included in this calculation +// e.g. colors_in_each_dim=3, icolor=1, jcolor=2... 
+// -+---+---+---+- +// | 0 | 1 | 0 | +// -+---+---+---+- +// | 0 | 0 | 0 | +// -+---+---+---+- +// | 0 | 0 | 0 | +// -+---+---+---+- +// +void color_vector(level_type * level, int id_a, int colors_in_each_dim, int icolor, int jcolor, int kcolor){ + double _timeStart = getTime(); + int block; + + PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + const int boxlowi = level->my_boxes[box].low.i; + const int boxlowj = level->my_boxes[box].low.j; + const int boxlowk = level->my_boxes[box].low.k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + double * __restrict__ grid = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + int i,j,k; + + for(k=klo;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +// Initialize each element of vector id_a with a "random" value. 
+// For simplicity, random is defined as -1.0 or +1.0 and is based on whether the coordinates of the element are even or odd +// note, only non ghost zone values are included in this calculation +void random_vector(level_type * level, int id_a){ + double _timeStart = getTime(); + int block; + + PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + double * __restrict__ grid = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point + int i,j,k; + + for(k=klo;ktimers.blas1 += (double)(getTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators/problem.fv.c b/Util/hpgmg/finite-volume/source/operators/problem.fv.c new file mode 100644 index 00000000..e6ea7481 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/problem.fv.c @@ -0,0 +1,139 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef M_PI +#define M_PI 3.14159265358979323846 // in case math.h doesn't define it +#endif +double evaluateBeta(double x, double y, double z, double h, int add_Bxx, int add_Byy, int add_Bzz){ + double b = 0.25; + double a = 2.0*M_PI; // one period on [0,1]^3 + + double B = 1.0 + b*sin(a*x)*sin(a*y)*sin(a*z); +//double Bx = a*b*cos(a*x)*sin(a*y)*sin(a*z); +//double By = a*b*sin(a*x)*cos(a*y)*sin(a*z); +//double Bz = a*b*sin(a*x)*sin(a*y)*cos(a*z); + double Bxx = -a*a*b*sin(a*x)*sin(a*y)*sin(a*z); + double Byy = -a*a*b*sin(a*x)*sin(a*y)*sin(a*z); + double Bzz = -a*a*b*sin(a*x)*sin(a*y)*sin(a*z); + + // 4th order correction to approximate the conversion of cell-centered values to cell-averaged... + if(add_Bxx)B+=(h*h/24.0)*Bxx; + if(add_Byy)B+=(h*h/24.0)*Byy; + if(add_Bzz)B+=(h*h/24.0)*Bzz; + return(B); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +double evaluateF(double x, double y, double z, double h, int add_Fxx, int add_Fyy, int add_Fzz){ + #if 0 // harder problem... not sure I manually differentiated this right... 
+ // 8 'poles', one per octant + double cx = 0.75; + double cy = 0.75; + double cz = 0.75,sign = 1.0; + if(x<0.5){cx = 0.25;sign*=-1.0;} + if(y<0.5){cy = 0.25;sign*=-1.0;} + if(z<0.5){cz = 0.25;sign*=-1.0;} + + double r0 = 0.1; + double a = M_PI/2/r0; + double r = pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , 0.5); // euclidean distance + double rx = pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -0.5)*(x-cx); // dr/dx + double ry = pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -0.5)*(y-cy); + double rz = pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -0.5)*(z-cz); + double rxx = -pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -1.5)*(x-cx)*(x-cx) + pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -0.5); // d2r/dx2 + double ryy = -pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -1.5)*(y-cy)*(y-cy) + pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -0.5); + double rzz = -pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -1.5)*(z-cz)*(z-cz) + pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -0.5); + + double p = 6.0; + double F = sign*( pow(cos(a*r),p ) ); + double Fx = sign*( -a*p*pow(cos(a*r),p-1)*sin(a*r)*rx ); + double Fy = sign*( -a*p*pow(cos(a*r),p-1)*sin(a*r)*ry ); + double Fz = sign*( -a*p*pow(cos(a*r),p-1)*sin(a*r)*rz ); + double Fxx = sign*( -a*a*p*pow(cos(a*r),p )*rx*rx + a*a*p*(p-1)*pow(cos(a*r),p-2)*pow(sin(a*r),2)*rx*rx - a*p*pow(cos(a*r),p-1)*sin(a*r)*rxx ); + double Fyy = sign*( -a*a*p*pow(cos(a*r),p )*ry*ry + a*a*p*(p-1)*pow(cos(a*r),p-2)*pow(sin(a*r),2)*ry*ry - a*p*pow(cos(a*r),p-1)*sin(a*r)*ryy ); + double Fzz = sign*( -a*a*p*pow(cos(a*r),p )*rz*rz + a*a*p*(p-1)*pow(cos(a*r),p-2)*pow(sin(a*r),2)*rz*rz - a*p*pow(cos(a*r),p-1)*sin(a*r)*rzz ); + + if(r>=r0){ + F = 0.0; + Fx = 0.0; + Fy = 0.0; + Fz = 0.0; + Fxx = 0.0; + Fyy = 0.0; + Fzz = 0.0; + } + #else + double a = 2.0*M_PI; + double p = 7.0; + double F = pow(sin(a*x),p )*pow(sin(a*y),p )*pow(sin(a*z),p ); +//double Fx = 
a*p*pow(sin(a*x),p-1)*pow(sin(a*y),p )*pow(sin(a*z),p )*cos(a*x); +//double Fy = a*p*pow(sin(a*x),p )*pow(sin(a*y),p-1)*pow(sin(a*z),p )*cos(a*y); +//double Fz = a*p*pow(sin(a*x),p )*pow(sin(a*y),p )*pow(sin(a*z),p-1)*cos(a*z); + double Fxx = -a*a*p*pow(sin(a*x),p )*pow(sin(a*y),p )*pow(sin(a*z),p ) + a*a*p*(p-1)*pow(sin(a*x),p-2)*pow(sin(a*y),p )*pow(sin(a*z),p )*pow(cos(a*x),2); + double Fyy = -a*a*p*pow(sin(a*x),p )*pow(sin(a*y),p )*pow(sin(a*z),p ) + a*a*p*(p-1)*pow(sin(a*x),p )*pow(sin(a*y),p-2)*pow(sin(a*z),p )*pow(cos(a*y),2); + double Fzz = -a*a*p*pow(sin(a*x),p )*pow(sin(a*y),p )*pow(sin(a*z),p ) + a*a*p*(p-1)*pow(sin(a*x),p )*pow(sin(a*y),p )*pow(sin(a*z),p-2)*pow(cos(a*z),2); + #endif + + // 4th order correction to approximate the conversion of cell-centered values to cell-averaged... + if(add_Fxx)F+=(h*h/24.0)*Fxx; + if(add_Fyy)F+=(h*h/24.0)*Fyy; + if(add_Fzz)F+=(h*h/24.0)*Fzz; + + return(F); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void initialize_problem(level_type * level, double hLevel, double a, double b){ + level->h = hLevel; + + int box; + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim_i = level->my_boxes[box].dim; + const int dim_j = level->my_boxes[box].dim; + const int dim_k = level->my_boxes[box].dim; + #ifdef _OPENMP + #pragma omp parallel for private(k,j,i) collapse(3) + #endif + for(k=0;k<=dim_k;k++){ // include high face + for(j=0;j<=dim_j;j++){ // include high face + for(i=0;i<=dim_i;i++){ // include high face + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + int ijk = (i+ghosts) + (j+ghosts)*jStride + (k+ghosts)*kStride; + double x = hLevel*( (double)(i+level->my_boxes[box].low.i) + 0.5 ); // +0.5 to 
get to the center of cell + double y = hLevel*( (double)(j+level->my_boxes[box].low.j) + 0.5 ); + double z = hLevel*( (double)(k+level->my_boxes[box].low.k) + 0.5 ); + double A,Bi,Bj,Bk; + //double A,B,Bx,By,Bz,Bi,Bj,Bk; + //double U,Ux,Uy,Uz,Uxx,Uyy,Uzz; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + A = 1.0; + Bi = 1.0; + Bj = 1.0; + Bk = 1.0; + #ifdef STENCIL_VARIABLE_COEFFICIENT // variable coefficient problem... + Bi=evaluateBeta(x-hLevel*0.5,y ,z ,hLevel,0,1,1); // face-centered value of Beta for beta_i + Bj=evaluateBeta(x ,y-hLevel*0.5,z ,hLevel,1,0,1); // face-centered value of Beta for beta_j + Bk=evaluateBeta(x ,y ,z-hLevel*0.5,hLevel,1,1,0); // face-centered value of Beta for beta_k + #endif + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + double F=evaluateF(x,y,z,hLevel,1,1,1); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + level->my_boxes[box].vectors[VECTOR_ALPHA ][ijk] = A; + level->my_boxes[box].vectors[VECTOR_BETA_I][ijk] = Bi; + level->my_boxes[box].vectors[VECTOR_BETA_J][ijk] = Bj; + level->my_boxes[box].vectors[VECTOR_BETA_K][ijk] = Bk; + level->my_boxes[box].vectors[VECTOR_F ][ijk] = F; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + }}} + } + +} +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators/problem.p4.c b/Util/hpgmg/finite-volume/source/operators/problem.p4.c new file mode 100644 index 00000000..3f74ee1f --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/problem.p4.c @@ -0,0 +1,124 @@ 
+//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +void evaluateBeta(double x, double y, double z, double *B, double *Bx, double *By, double *Bz){ + double Bmin = 1.0; + double Bmax = 10.0; + double c2 = (Bmax-Bmin)/2; // coefficients to affect this transition + double c1 = (Bmax+Bmin)/2; + double c3 = 10.0; // how sharply (B)eta transitions + double xcenter = 0.50; + double ycenter = 0.50; + double zcenter = 0.50; + // calculate distance from center of the domain (0.5,0.5,0.5) + double r2 = pow((x-xcenter),2) + pow((y-ycenter),2) + pow((z-zcenter),2); + double r2x = 2.0*(x-xcenter); + double r2y = 2.0*(y-ycenter); + double r2z = 2.0*(z-zcenter); +//double r2xx = 2.0; +//double r2yy = 2.0; +//double r2zz = 2.0; + double r = pow(r2,0.5); + double rx = 0.5*r2x*pow(r2,-0.5); + double ry = 0.5*r2y*pow(r2,-0.5); + double rz = 0.5*r2z*pow(r2,-0.5); +//double rxx = 0.5*r2xx*pow(r2,-0.5) - 0.25*r2x*r2x*pow(r2,-1.5); +//double ryy = 0.5*r2yy*pow(r2,-0.5) - 0.25*r2y*r2y*pow(r2,-1.5); +//double rzz = 0.5*r2zz*pow(r2,-0.5) - 0.25*r2z*r2z*pow(r2,-1.5); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + *B = c1+c2*tanh( c3*(r-0.25) ); + *Bx = c2*c3*rx*(1-pow(tanh( c3*(r-0.25) ),2)); + *By = c2*c3*ry*(1-pow(tanh( c3*(r-0.25) ),2)); + *Bz = c2*c3*rz*(1-pow(tanh( c3*(r-0.25) ),2)); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void evaluateU(double x, double y, double z, double *U, double *Ux, double *Uy, double *Uz, double *Uxx, double *Uyy, double *Uzz, int isPeriodic){ + // should be continuous in u, u', and u'' + // v(w) = w^4 - 
2w^3 + w^2 + c + // u(x,y,z) = v(x)v(y)v(z) + // If Periodic, then the integral of the RHS should sum to zero. + // Setting shift=1/30 should ensure that the integrals of X, Y, or Z should sum to zero... + // That should(?) make the integrals of u,ux,uy,uz,uxx,uyy,uzz sum to zero and thus make the integral of f sum to zero + // If dirichlet, then w(0)=w(1) = 0.0 + // Setting shift to 0 should ensure that U(x,y,z) = 0 on boundary + double shift = 0.0;if(isPeriodic)shift= -1.0/30.0; + double X = 1.0*pow(x,4) - 2.0*pow(x,3) + 1.0*pow(x,2) + shift; + double Y = 1.0*pow(y,4) - 2.0*pow(y,3) + 1.0*pow(y,2) + shift; + double Z = 1.0*pow(z,4) - 2.0*pow(z,3) + 1.0*pow(z,2) + shift; + double Xx = 4.0*pow(x,3) - 6.0*pow(x,2) + 2.0*x; + double Yy = 4.0*pow(y,3) - 6.0*pow(y,2) + 2.0*y; + double Zz = 4.0*pow(z,3) - 6.0*pow(z,2) + 2.0*z; + double Xxx = 12.0*pow(x,2) - 12.0*x + 2.0; + double Yyy = 12.0*pow(y,2) - 12.0*y + 2.0; + double Zzz = 12.0*pow(z,2) - 12.0*z + 2.0; + *U = X*Y*Z; + *Ux = Xx*Y*Z; + *Uy = X*Yy*Z; + *Uz = X*Y*Zz; + *Uxx = Xxx*Y*Z; + *Uyy = X*Yyy*Z; + *Uzz = X*Y*Zzz; +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void initialize_problem(level_type * level, double hLevel, double a, double b){ + level->h = hLevel; + + int box; + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim_i = level->my_boxes[box].dim; + const int dim_j = level->my_boxes[box].dim; + const int dim_k = level->my_boxes[box].dim; + #ifdef _OPENMP + #pragma omp parallel for private(k,j,i) collapse(3) + #endif + for(k=0;k<=dim_k;k++){ // include high face + for(j=0;j<=dim_j;j++){ // include high face + for(i=0;i<=dim_i;i++){ // include high face + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - + int ijk = (i+ghosts) + (j+ghosts)*jStride + (k+ghosts)*kStride; + double x = hLevel*( (double)(i+level->my_boxes[box].low.i) + 0.5 ); // +0.5 to get to the center of cell + double y = hLevel*( (double)(j+level->my_boxes[box].low.j) + 0.5 ); + double z = hLevel*( (double)(k+level->my_boxes[box].low.k) + 0.5 ); + double A,B,Bx,By,Bz,Bi,Bj,Bk; + double U,Ux,Uy,Uz,Uxx,Uyy,Uzz; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + A = 1.0; + B = 1.0; + Bx = 0.0; + By = 0.0; + Bz = 0.0; + Bi = 1.0; + Bj = 1.0; + Bk = 1.0; + #ifdef STENCIL_VARIABLE_COEFFICIENT // variable coefficient problem... + evaluateBeta(x-hLevel*0.5,y ,z ,&Bi,&Bx,&By,&Bz); // face-centered value of Beta for beta_i + evaluateBeta(x ,y-hLevel*0.5,z ,&Bj,&Bx,&By,&Bz); // face-centered value of Beta for beta_j + evaluateBeta(x ,y ,z-hLevel*0.5,&Bk,&Bx,&By,&Bz); // face-centered value of Beta for beta_k + evaluateBeta(x ,y ,z ,&B ,&Bx,&By,&Bz); // cell-centered value of Beta + #endif + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + evaluateU(x,y,z,&U,&Ux,&Uy,&Uz,&Uxx,&Uyy,&Uzz, (level->boundary_condition.type == BC_PERIODIC) ); + double F = a*A*U - b*( (Bx*Ux + By*Uy + Bz*Uz) + B*(Uxx + Uyy + Uzz) ); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + level->my_boxes[box].vectors[VECTOR_BETA_I][ijk] = Bi; + level->my_boxes[box].vectors[VECTOR_BETA_J][ijk] = Bj; + level->my_boxes[box].vectors[VECTOR_BETA_K][ijk] = Bk; + level->my_boxes[box].vectors[VECTOR_ALPHA ][ijk] = A; + //level->my_boxes[box].vectors[VECTOR_UTRUE ][ijk] = U; // obviated by Richardson analysis + level->my_boxes[box].vectors[VECTOR_F ][ijk] = F; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + }}} + } + +} 
+//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators/problem.p6.c b/Util/hpgmg/finite-volume/source/operators/problem.p6.c new file mode 100644 index 00000000..49cca26b --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/problem.p6.c @@ -0,0 +1,134 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +void evaluateBeta(double x, double y, double z, double *B, double *Bx, double *By, double *Bz){ + double Bmin = 1.0; + double Bmax = 10.0; + double c2 = (Bmax-Bmin)/2; // coefficients to affect this transition + double c1 = (Bmax+Bmin)/2; + double c3 = 10.0; // how sharply (B)eta transitions + double xcenter = 0.50; + double ycenter = 0.50; + double zcenter = 0.50; + // calculate distance from center of the domain (0.5,0.5,0.5) + double r2 = pow((x-xcenter),2) + pow((y-ycenter),2) + pow((z-zcenter),2); + double r2x = 2.0*(x-xcenter); + double r2y = 2.0*(y-ycenter); + double r2z = 2.0*(z-zcenter); +//double r2xx = 2.0; +//double r2yy = 2.0; +//double r2zz = 2.0; + double r = pow(r2,0.5); + double rx = 0.5*r2x*pow(r2,-0.5); + double ry = 0.5*r2y*pow(r2,-0.5); + double rz = 0.5*r2z*pow(r2,-0.5); +//double rxx = 0.5*r2xx*pow(r2,-0.5) - 0.25*r2x*r2x*pow(r2,-1.5); +//double ryy = 0.5*r2yy*pow(r2,-0.5) - 0.25*r2y*r2y*pow(r2,-1.5); +//double rzz = 0.5*r2zz*pow(r2,-0.5) - 0.25*r2z*r2z*pow(r2,-1.5); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + *B = c1+c2*tanh( c3*(r-0.25) ); + *Bx = c2*c3*rx*(1-pow(tanh( c3*(r-0.25) ),2)); + *By = c2*c3*ry*(1-pow(tanh( c3*(r-0.25) ),2)); + *Bz 
= c2*c3*rz*(1-pow(tanh( c3*(r-0.25) ),2)); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void evaluateU(double x, double y, double z, double *U, double *Ux, double *Uy, double *Uz, double *Uxx, double *Uyy, double *Uzz, int isPeriodic){ + // should be continuous in u, u', u'', u''', and u'''' to guarantee high order and periodic boundaries + // v(w) = ??? + // u(x,y,z) = v(x)v(y)v(z) + // If Periodic, then the integral of the RHS should sum to zero. + // Setting shift=1.0 should ensure that the integrals of X, Y, or Z should sum to zero... + // That should(?) make the integrals of u,ux,uy,uz,uxx,uyy,uzz sum to zero and thus make the integral of f sum to zero + // If dirichlet, then w(0)=w(1) = 0.0 + // Setting shift to 0 should ensure that U(x,y,z) = 0 on boundary + // u = ax^6 + bx^5 + cx^4 + dx^3 + ex^2 + fx + g + // ux = 6ax^5 + 5bx^4 + 4cx^3 + 3dx^2 + 2ex + f + // uxx = 30ax^4 + 20bx^3 + 12cx^2 + 6dx + 2e + // a = 42.0 + // b = -126.0 + // c = 105.0 + // d = 0.0 + // e = -21.0 + // f = 0.0 + // g = 1.0 + double shift = 0.0;if(isPeriodic)shift= 1.0/21.0; + double X = 2.0*pow(x,6) - 6.0*pow(x,5) + 5.0*pow(x,4) - 1.0*pow(x,2) + shift; + double Y = 2.0*pow(y,6) - 6.0*pow(y,5) + 5.0*pow(y,4) - 1.0*pow(y,2) + shift; + double Z = 2.0*pow(z,6) - 6.0*pow(z,5) + 5.0*pow(z,4) - 1.0*pow(z,2) + shift; + double Xx = 12.0*pow(x,5) - 30.0*pow(x,4) + 20.0*pow(x,3) - 2.0*x; + double Yy = 12.0*pow(y,5) - 30.0*pow(y,4) + 20.0*pow(y,3) - 2.0*y; + double Zz = 12.0*pow(z,5) - 30.0*pow(z,4) + 20.0*pow(z,3) - 2.0*z; + double Xxx = 60.0*pow(x,4) - 120.0*pow(x,3) + 60.0*pow(x,2) - 2.0; + double Yyy = 60.0*pow(y,4) - 120.0*pow(y,3) + 60.0*pow(y,2) - 2.0; + double Zzz = 60.0*pow(z,4) - 120.0*pow(z,3) + 60.0*pow(z,2) - 2.0; + *U = X * Y * Z; + *Ux = Xx * Y * Z; + *Uy = X * Yy * Z; + *Uz = X * Y * Zz; + *Uxx = Xxx * Y * Z; + *Uyy = X * Yyy * Z; + *Uzz = X * Y * Zzz; +} + + 
+//------------------------------------------------------------------------------------------------------------------------------ +void initialize_problem(level_type * level, double hLevel, double a, double b){ + level->h = hLevel; + + int box; + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim_i = level->my_boxes[box].dim; + const int dim_j = level->my_boxes[box].dim; + const int dim_k = level->my_boxes[box].dim; + #ifdef _OPENMP + #pragma omp parallel for private(k,j,i) collapse(3) + #endif + for(k=0;k<=dim_k;k++){ // include high face + for(j=0;j<=dim_j;j++){ // include high face + for(i=0;i<=dim_i;i++){ // include high face + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + int ijk = (i+ghosts) + (j+ghosts)*jStride + (k+ghosts)*kStride; + double x = hLevel*( (double)(i+level->my_boxes[box].low.i) + 0.5 ); // +0.5 to get to the center of cell + double y = hLevel*( (double)(j+level->my_boxes[box].low.j) + 0.5 ); + double z = hLevel*( (double)(k+level->my_boxes[box].low.k) + 0.5 ); + double A,B,Bx,By,Bz,Bi,Bj,Bk; + double U,Ux,Uy,Uz,Uxx,Uyy,Uzz; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + A = 1.0; + B = 1.0; + Bx = 0.0; + By = 0.0; + Bz = 0.0; + Bi = 1.0; + Bj = 1.0; + Bk = 1.0; + #ifdef STENCIL_VARIABLE_COEFFICIENT // variable coefficient problem... 
+ evaluateBeta(x-hLevel*0.5,y ,z ,&Bi,&Bx,&By,&Bz); // face-centered value of Beta for beta_i + evaluateBeta(x ,y-hLevel*0.5,z ,&Bj,&Bx,&By,&Bz); // face-centered value of Beta for beta_j + evaluateBeta(x ,y ,z-hLevel*0.5,&Bk,&Bx,&By,&Bz); // face-centered value of Beta for beta_k + evaluateBeta(x ,y ,z ,&B ,&Bx,&By,&Bz); // cell-centered value of Beta + #endif + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + evaluateU(x,y,z,&U,&Ux,&Uy,&Uz,&Uxx,&Uyy,&Uzz, (level->boundary_condition.type == BC_PERIODIC) ); + double F = a*A*U - b*( (Bx*Ux + By*Uy + Bz*Uz) + B*(Uxx + Uyy + Uzz) ); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + level->my_boxes[box].vectors[VECTOR_BETA_I][ijk] = Bi; + level->my_boxes[box].vectors[VECTOR_BETA_J][ijk] = Bj; + level->my_boxes[box].vectors[VECTOR_BETA_K][ijk] = Bk; + level->my_boxes[box].vectors[VECTOR_ALPHA ][ijk] = A; + //level->my_boxes[box].vectors[VECTOR_UTRUE ][ijk] = U; // obviated by Richardson analysis + level->my_boxes[box].vectors[VECTOR_F ][ijk] = F; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + }}} + } + +} +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators/problem.sine.c b/Util/hpgmg/finite-volume/source/operators/problem.sine.c new file mode 100644 index 00000000..caa67f67 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/problem.sine.c @@ -0,0 +1,120 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab 
//------------------------------------------------------------------------------------------------------------------------------
#ifndef M_PI
#define M_PI 3.14159265358979323846 // in case math.h doesn't define it
#endif
//------------------------------------------------------------------------------------------------------------------------------
// evaluateBeta
//   Evaluates the variable coefficient beta and its gradient at the point (x,y,z).
//   beta depends only on the distance r from (0.5,0.5,0.5):
//       B(r) = c1 + c2*tanh( c3*(r-0.25) )
//   i.e. it moves from about Bmin (r well below 0.25) to about Bmax (r well above 0.25); c3 sets how sharp the transition is.
//
//   x,y,z     : point at which to evaluate
//   B         : out, value of beta
//   Bx,By,Bz  : out, partial derivatives of beta with respect to x, y and z
//
//   At r==0 the direction of the gradient is undefined (the original expression evaluated 0*inf and produced NaN).
//   By convention this routine returns a zero gradient there so a NaN cannot leak into the right-hand side.
void evaluateBeta(double x, double y, double z, double *B, double *Bx, double *By, double *Bz){
  const double Bmin    =  1.0;
  const double Bmax    = 10.0;
  const double c2      = (Bmax-Bmin)/2; // half the jump in beta
  const double c1      = (Bmax+Bmin)/2; // mid-point value of beta
  const double c3      = 10.0;          // how sharply (B)eta transitions
  const double xcenter = 0.50;
  const double ycenter = 0.50;
  const double zcenter = 0.50;

  // offset and distance from the center of the transition sphere
  const double dx = x-xcenter;
  const double dy = y-ycenter;
  const double dz = z-zcenter;
  const double r  = sqrt(dx*dx + dy*dy + dz*dz);

  // tanh is needed by both the value and the derivative... evaluate it once
  const double t    = tanh( c3*(r-0.25) );
  const double dBdr = c2*c3*(1.0-t*t);  // dB/dr = c2*c3*sech^2( c3*(r-0.25) )

  *B = c1+c2*t;
  if(r>0.0){
    // chain rule: dB/dx = dB/dr * dr/dx with dr/dx = dx/r (same for y and z)
    *Bx = dBdr*(dx/r);
    *By = dBdr*(dy/r);
    *Bz = dBdr*(dz/r);
  }else{
    *Bx = 0.0;
    *By = 0.0;
    *Bz = 0.0;
  }
}


//------------------------------------------------------------------------------------------------------------------------------
// Adds one separable sine mode
//       u(x,y,z) = sin^p(c*x) * sin^p(c*y) * sin^p(c*z)
// and its first and (unmixed) second partial derivatives to the running totals passed in.
// The per-axis factors are computed once and reused by every output.
static void accumulateSineMode(double c, double p, double x, double y, double z,
                               double *U, double *Ux, double *Uy, double *Uz, double *Uxx, double *Uyy, double *Uzz){
  const double sx = sin(c*x), cx = cos(c*x);
  const double sy = sin(c*y), cy = cos(c*y);
  const double sz = sin(c*z), cz = cos(c*z);

  // f(t) = sin^p(c*t)
  const double Xp = pow(sx,p);
  const double Yp = pow(sy,p);
  const double Zp = pow(sz,p);

  // f'(t) = c*p*cos(c*t)*sin^(p-1)(c*t)
  const double dX = c*p*cx*pow(sx,p-1);
  const double dY = c*p*cy*pow(sy,p-1);
  const double dZ = c*p*cz*pow(sz,p-1);

  // f''(t) = c^2*p*( (p-1)*sin^(p-2)(c*t)*cos^2(c*t) - sin^p(c*t) )
  const double ddX = c*c*p*( (p-1)*pow(sx,p-2)*cx*cx - Xp );
  const double ddY = c*c*p*( (p-1)*pow(sy,p-2)*cy*cy - Yp );
  const double ddZ = c*c*p*( (p-1)*pow(sz,p-2)*cz*cz - Zp );

  *U   +=  Xp*Yp*Zp;
  *Ux  +=  dX*Yp*Zp;
  *Uy  +=  dY*Xp*Zp;
  *Uz  +=  dZ*Xp*Yp;
  *Uxx += ddX*Yp*Zp;
  *Uyy += ddY*Xp*Zp;
  *Uzz += ddZ*Xp*Yp;
}


//------------------------------------------------------------------------------------------------------------------------------
// evaluateU
//   Evaluates the analytic function U and its derivatives at (x,y,z), where U is the sum of two sine modes
//       U = sin^p(2*pi*x)*sin^p(2*pi*y)*sin^p(2*pi*z) + sin^p(6*pi*x)*sin^p(6*pi*y)*sin^p(6*pi*z)
//   with p = 13.
//
//   U            : out, value of U
//   Ux,Uy,Uz     : out, first partial derivatives
//   Uxx,Uyy,Uzz  : out, second partial derivatives (unmixed)
//   isPeriodic   : accepted for interface compatibility; not used by this problem
void evaluateU(double x, double y, double z, double *U, double *Ux, double *Uy, double *Uz, double *Uxx, double *Uyy, double *Uzz, int isPeriodic){
  const double c1 = 2.0*M_PI;
  const double c2 = 6.0*M_PI;
  const double p  = 13; // must be odd(?) and allows up to p-2 order MG
  (void)isPeriodic;     // unused

  *U   = 0.0;
  *Ux  = 0.0;
  *Uy  = 0.0;
  *Uz  = 0.0;
  *Uxx = 0.0;
  *Uyy = 0.0;
  *Uzz = 0.0;

  accumulateSineMode(c1,p,x,y,z,U,Ux,Uy,Uz,Uxx,Uyy,Uzz);
  accumulateSineMode(c2,p,x,y,z,U,Ux,Uy,Uz,Uxx,Uyy,Uzz);
}
for(k=0;k<=dim_k;k++){ // include high face + for(j=0;j<=dim_j;j++){ // include high face + for(i=0;i<=dim_i;i++){ // include high face + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + int ijk = (i+ghosts) + (j+ghosts)*jStride + (k+ghosts)*kStride; + double x = hLevel*( (double)(i+level->my_boxes[box].low.i) + 0.5 ); // +0.5 to get to the center of cell + double y = hLevel*( (double)(j+level->my_boxes[box].low.j) + 0.5 ); + double z = hLevel*( (double)(k+level->my_boxes[box].low.k) + 0.5 ); + double A,B,Bx,By,Bz,Bi,Bj,Bk; + double U,Ux,Uy,Uz,Uxx,Uyy,Uzz; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + A = 1.0; + B = 1.0; + Bx = 0.0; + By = 0.0; + Bz = 0.0; + Bi = 1.0; + Bj = 1.0; + Bk = 1.0; + #ifdef STENCIL_VARIABLE_COEFFICIENT // variable coefficient problem... + evaluateBeta(x-hLevel*0.5,y ,z ,&Bi,&Bx,&By,&Bz); // face-centered value of Beta for beta_i + evaluateBeta(x ,y-hLevel*0.5,z ,&Bj,&Bx,&By,&Bz); // face-centered value of Beta for beta_j + evaluateBeta(x ,y ,z-hLevel*0.5,&Bk,&Bx,&By,&Bz); // face-centered value of Beta for beta_k + evaluateBeta(x ,y ,z ,&B ,&Bx,&By,&Bz); // cell-centered value of Beta + #endif + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + evaluateU(x,y,z,&U,&Ux,&Uy,&Uz,&Uxx,&Uyy,&Uzz, (level->boundary_condition.type == BC_PERIODIC) ); + double F = a*A*U - b*( (Bx*Ux + By*Uy + Bz*Uz) + B*(Uxx + Uyy + Uzz) ); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + level->my_boxes[box].vectors[VECTOR_BETA_I][ijk] = Bi; + level->my_boxes[box].vectors[VECTOR_BETA_J][ijk] = Bj; + level->my_boxes[box].vectors[VECTOR_BETA_K][ijk] = Bk; + level->my_boxes[box].vectors[VECTOR_ALPHA ][ijk] = A; + //level->my_boxes[box].vectors[VECTOR_UTRUE ][ijk] = U; 
// obviated by Richardson analysis + level->my_boxes[box].vectors[VECTOR_F ][ijk] = F; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + }}} + } + +} +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators/rebuild.c b/Util/hpgmg/finite-volume/source/operators/rebuild.c new file mode 100644 index 00000000..04fc7978 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/rebuild.c @@ -0,0 +1,202 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +/* +// power method for calculating the dominant eigenvalue of D^{-1}A +double power_method(level_type * level, double a, double b, int max_iterations){ + int i; + int x_id = VECTOR_U; + int Ax_id = VECTOR_TEMP; + double lambda_max = 0; + + #ifdef USE_MPI + double lmax_start = MPI_Wtime(); + #endif + if(level->my_rank==0){fprintf(stdout," calculating lambda_max...");fflush(stdout);} + + random_vector(level,x_id); + for(i=0;imy_rank==0){fprintf(stdout," %1.15e (%0.6f seconds)\n",lambda_max,MPI_Wtime()-lmax_start);} + #else + if(level->my_rank==0){fprintf(stdout," %1.15e\n",lambda_max);} + #endif + return(lambda_max); +} +*/ + + +//------------------------------------------------------------------------------------------------------------------------------ +// Accurate estimates of D^{-1} are essential in realizing high-performance and stable smoothers. 
+// Unfortunately, complex boundary conditions can make it difficult to express D^{-1} analytically +// As such, this black-box routine will calculate D^{-1}, l1 norm, the dominant eigenvalue using only the apply_op_ijk macro +// colors_in_each_dim should be sufficiently large as to decouple the boundary condition from the operator +// e.g. with quartic BC's, colors_in_each_dim==4 (total of 64 colors in 3D) +// If using periodic BCs, one should be able to set colors_in_each_dim to stencil_get_radius(); +// NOTE, as this function is not timed, it has not been optimized for performance. +void rebuild_operator_blackbox(level_type * level, double a, double b, int colors_in_each_dim){ + + // trying to color a 1^3 grid with 8 colors won't work... reduce the number of colors... + if(level->dim.idim.i; + if(level->dim.jdim.j; + if(level->dim.kdim.k; + + if(level->my_rank==0){fprintf(stdout," calculating D^{-1} exactly for level h=%e using %d colors... ",level->h,colors_in_each_dim*colors_in_each_dim*colors_in_each_dim);fflush(stdout);} + #ifdef USE_MPI + double dinv_start = MPI_Wtime(); + #endif + + #if 0 // naive version using existing routines. 
Doesn't calculate l1inv or estimate the dominant eigenvalue + int x_id = VECTOR_U; + int Ax_id = VECTOR_TEMP; + int icolor,jcolor,kcolor; + zero_vector(level,VECTOR_DINV); + zero_vector(level,VECTOR_L1INV); + for(kcolor=0;kcolornum_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const double h2inv = 1.0/(level->h*level->h); + const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + double * __restrict__ Aii = level->my_boxes[box].vectors[ Aii_id] + ghosts*(1+jStride+kStride); + double * __restrict__ sumAbsAij = level->my_boxes[box].vectors[ sumAbsAij_id] + ghosts*(1+jStride+kStride); + + int i,j,k; + for(k=klo;knum_my_blocks,dominant_eigenvalue) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = 
level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const double h2inv = 1.0/(level->h*level->h); + double * __restrict__ Aii = level->my_boxes[box].vectors[ Aii_id] + ghosts*(1+jStride+kStride); + double * __restrict__ sumAbsAij = level->my_boxes[box].vectors[sumAbsAij_id] + ghosts*(1+jStride+kStride); + + double block_eigenvalue = -1e9; + int i,j,k; + for(k=klo;kmy_boxes[box].low.i,j+level->my_boxes[box].low.j,k+level->my_boxes[box].low.k); + Aii[ijk] = a+b*h2inv; // FIX !!! + } + + // upper limit to Gershgorin disc == bound on dominant eigenvalue + double Di = (Aii[ijk] + sumAbsAij[ijk])/Aii[ijk];if(Di>block_eigenvalue)block_eigenvalue=Di; + + // inverse of the L1 row norm... L1inv = ( D+D^{L1} )^{-1} + // sumAbsAij[ijk] = 1.0/(Aii[ijk]+sumAbsAij[ijk]); + // alternately, as suggested by eq 6.5 in Baker et al, "Multigrid smoothers for ultra-parallel computing: additional theory and discussion"... + if(Aii[ijk]>=1.5*sumAbsAij[ijk])sumAbsAij[ijk] = 1.0/(Aii[ijk] ); // VECTOR_L1INV = ... + else sumAbsAij[ijk] = 1.0/(Aii[ijk]+0.5*sumAbsAij[ijk]); // VECTOR_L1INV = ... + + // inverse of the diagonal... + Aii[ijk] = 1.0/Aii[ijk]; // VECTOR_DINV = ... 
+ + }}} + if(block_eigenvalue>dominant_eigenvalue){dominant_eigenvalue = block_eigenvalue;} + } + #ifdef USE_MPI + if(level->my_rank==0){fprintf(stdout,"done (%0.6f seconds)\n",MPI_Wtime()-dinv_start);} + #else + if(level->my_rank==0){fprintf(stdout,"done\n");} + #endif + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Reduce the local estimate of the dominant eigenvalue to a global estimate + #ifdef USE_MPI + double _timeStartAllReduce = getTime(); + double send = dominant_eigenvalue; + MPI_Allreduce(&send,&dominant_eigenvalue,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD); + double _timeEndAllReduce = getTime(); + level->timers.collectives += (double)(_timeEndAllReduce-_timeStartAllReduce); + #endif + if(level->my_rank==0){fprintf(stdout," estimating lambda_max... <%1.15e\n",dominant_eigenvalue);fflush(stdout);} + level->dominant_eigenvalue_of_DinvA = dominant_eigenvalue; + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + //level->dominant_eigenvalue_of_DinvA = power_method(level,a,b,10); + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #endif +} +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/operators/residual.c b/Util/hpgmg/finite-volume/source/operators/residual.c new file mode 100644 index 00000000..36a50f27 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/residual.c @@ -0,0 +1,50 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab 
+//------------------------------------------------------------------------------------------------------------------------------ +// This routines calculates the residual (res=rhs-Ax) using the linear operator specified in the apply_op_ijk macro +// This requires exchanging a ghost zone and/or enforcing a boundary condition. +// NOTE, x_id must be distinct from rhs_id and res_id +void residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b){ + // exchange the boundary for x in prep for Ax... + exchange_boundary(level,x_id,stencil_get_shape()); + apply_BCs(level,x_id,stencil_get_shape()); + + // now do residual/restriction proper... + double _timeStart = getTime(); + int block; + + PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) + for(block=0;blocknum_my_blocks;block++){ + const int box = level->my_blocks[block].read.box; + const int ilo = level->my_blocks[block].read.i; + const int jlo = level->my_blocks[block].read.j; + const int klo = level->my_blocks[block].read.k; + const int ihi = level->my_blocks[block].dim.i + ilo; + const int jhi = level->my_blocks[block].dim.j + jlo; + const int khi = level->my_blocks[block].dim.k + klo; + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const double h2inv = 1.0/(level->h*level->h); + const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point + const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + double * __restrict__ res = level->my_boxes[box].vectors[ res_id] + ghosts*(1+jStride+kStride); + + for(k=klo;ktimers.residual += (double)(getTime()-_timeStart); +} + diff --git a/Util/hpgmg/finite-volume/source/operators/restriction.c b/Util/hpgmg/finite-volume/source/operators/restriction.c new file mode 100644 index 00000000..d6e94659 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/restriction.c @@ -0,0 +1,206 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +static inline void restriction_pc_block(level_type *level_c, int id_c, level_type *level_f, int id_f, blockCopy_type *block, int restrictionType){ + // restrict 3D array from read_i,j,k of read[] to write_i,j,k in write[] using piecewise constant restriction (cell averaged) + int dim_i = block->dim.i; // calculate the dimensions of the resultant coarse block + int dim_j = block->dim.j; + int dim_k = block->dim.k; + + int read_i = block->read.i; + int read_j = block->read.j; + int read_k = block->read.k; + int read_jStride = block->read.jStride; + int read_kStride = block->read.kStride; + + int write_i = block->write.i; + int 
write_j = block->write.j; + int write_k = block->write.k; + int write_jStride = block->write.jStride; + int write_kStride = block->write.kStride; + + double * __restrict__ read = block->read.ptr; + double * __restrict__ write = block->write.ptr; + if(block->read.box >=0){ + read_jStride = level_f->my_boxes[block->read.box ].jStride; + read_kStride = level_f->my_boxes[block->read.box ].kStride; + read = level_f->my_boxes[ block->read.box].vectors[id_f] + level_f->my_boxes[ block->read.box].ghosts*(1+ read_jStride+ read_kStride); + } + if(block->write.box>=0){ + write_jStride = level_c->my_boxes[block->write.box].jStride; + write_kStride = level_c->my_boxes[block->write.box].kStride; + write = level_c->my_boxes[block->write.box].vectors[id_c] + level_c->my_boxes[block->write.box].ghosts*(1+write_jStride+write_kStride); + } + + + + int i,j,k; + int ii,jj,kk; + switch(restrictionType){ + case RESTRICT_CELL: + for(k=0,kk=0;ktag<<4) | 0x5; + + + + + #ifdef USE_MPI + // by convention, level_f allocates a combined array of requests for both level_f sends and level_c recvs... + int nMessages = level_c->restriction[restrictionType].num_recvs + level_f->restriction[restrictionType].num_sends; + MPI_Request *recv_requests = level_f->restriction[restrictionType].requests; + MPI_Request *send_requests = level_f->restriction[restrictionType].requests + level_c->restriction[restrictionType].num_recvs; + + + // loop through packed list of MPI receives and prepost Irecv's... 
+ if(level_c->restriction[restrictionType].num_recvs>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;nrestriction[restrictionType].num_recvs;n++){ + MPI_Irecv(level_c->restriction[restrictionType].recv_buffers[n], + level_c->restriction[restrictionType].recv_sizes[n], + MPI_DOUBLE, + level_c->restriction[restrictionType].recv_ranks[n], + my_tag, + MPI_COMM_WORLD, + &recv_requests[n] + ); + } + _timeEnd = getTime(); + level_f->timers.restriction_recv += (_timeEnd-_timeStart); + } + + + // pack MPI send buffers... + if(level_f->restriction[restrictionType].num_blocks[0]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->restriction[restrictionType].num_blocks[0]) + for(buffer=0;bufferrestriction[restrictionType].num_blocks[0];buffer++){ + restriction_pc_block(level_c,id_c,level_f,id_f,&level_f->restriction[restrictionType].blocks[0][buffer],restrictionType); + } + _timeEnd = getTime(); + level_f->timers.restriction_pack += (_timeEnd-_timeStart); + } + + + // loop through MPI send buffers and post Isend's... + if(level_f->restriction[restrictionType].num_sends>0){ + _timeStart = getTime(); + #ifdef USE_MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;nrestriction[restrictionType].num_sends;n++){ + MPI_Isend(level_f->restriction[restrictionType].send_buffers[n], + level_f->restriction[restrictionType].send_sizes[n], + MPI_DOUBLE, + level_f->restriction[restrictionType].send_ranks[n], + my_tag, + MPI_COMM_WORLD, + &send_requests[n] + ); + } + _timeEnd = getTime(); + level_f->timers.restriction_send += (_timeEnd-_timeStart); + } + #endif + + + // perform local restriction[restrictionType]... try and hide within Isend latency... 
+ if(level_f->restriction[restrictionType].num_blocks[1]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->restriction[restrictionType].num_blocks[1]) + for(buffer=0;bufferrestriction[restrictionType].num_blocks[1];buffer++){ + restriction_pc_block(level_c,id_c,level_f,id_f,&level_f->restriction[restrictionType].blocks[1][buffer],restrictionType); + } + _timeEnd = getTime(); + level_f->timers.restriction_local += (_timeEnd-_timeStart); + } + + + // wait for MPI to finish... + #ifdef USE_MPI + if(nMessages){ + _timeStart = getTime(); + MPI_Waitall(nMessages,level_f->restriction[restrictionType].requests,level_f->restriction[restrictionType].status); + _timeEnd = getTime(); + level_f->timers.restriction_wait += (_timeEnd-_timeStart); + } + + + // unpack MPI receive buffers + if(level_c->restriction[restrictionType].num_blocks[2]>0){ + _timeStart = getTime(); + PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->restriction[restrictionType].num_blocks[2]) + for(buffer=0;bufferrestriction[restrictionType].num_blocks[2];buffer++){ + CopyBlock(level_c,id_c,&level_c->restriction[restrictionType].blocks[2][buffer]); + } + _timeEnd = getTime(); + level_f->timers.restriction_unpack += (_timeEnd-_timeStart); + } + #endif + + + level_f->timers.restriction_total += (double)(getTime()-_timeCommunicationStart); +} diff --git a/Util/hpgmg/finite-volume/source/operators/symgs.c b/Util/hpgmg/finite-volume/source/operators/symgs.c new file mode 100644 index 00000000..e554006a --- /dev/null +++ b/Util/hpgmg/finite-volume/source/operators/symgs.c @@ -0,0 +1,57 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +void smooth(level_type * level, int phi_id, int rhs_id, 
double a, double b){ + int box,s; + + for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps (forward/backward) per GS smooth + exchange_boundary(level,phi_id,stencil_get_shape()); + apply_BCs(level,phi_id,stencil_get_shape()); + + double _timeStart = getTime(); + #ifdef _OPENMP + #pragma omp parallel for private(box) + #endif + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int ghosts = level->box_ghosts; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int dim = level->my_boxes[box].dim; + const double h2inv = 1.0/(level->h*level->h); + double * __restrict__ phi = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); + const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); + const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); + const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); + + + if( (s&0x1)==0 ){ // forward sweep... 
hard to thread + for(k=0;k=0;k--){ + for(j=dim-1;j>=0;j--){ + for(i=dim-1;i>=0;i--){ + int ijk = i + j*jStride + k*kStride; + double Ax = apply_op_ijk(phi); + phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax); + }}} + } + + } // boxes + level->timers.smooth += (double)(getTime()-_timeStart); + } // s-loop +} + + +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/solvers.c b/Util/hpgmg/finite-volume/source/solvers.c new file mode 100644 index 00000000..158970c4 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/solvers.c @@ -0,0 +1,101 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#include "timers.h" +#include "defines.h" +#include "level.h" +#include "operators.h" +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef USE_BICGSTAB +#include "solvers/bicgstab.c" +#elif USE_CG +#include "solvers/cg.c" +#elif USE_CABICGSTAB +#include "solvers/cabicgstab.c" +#elif USE_CACG +#include "solvers/cacg.c" +#endif +//------------------------------------------------------------------------------------------------------------------------------ +void IterativeSolver(level_type * level, int u_id, int f_id, double a, double b, double desired_reduction_in_norm){ + if(!level->active)return; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(level->must_subtract_mean==-1){ + 
level->must_subtract_mean=0; + int alpha_is_zero = (dot(level,VECTOR_ALPHA,VECTOR_ALPHA) == 0.0); + if( (level->boundary_condition.type==BC_PERIODIC) && ((a==0) || (alpha_is_zero)) )level->must_subtract_mean = 1; // Poisson with Periodic BCs + } + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #if 0 + if( (level->dim.i==1)&&(level->dim.j==1)&&(level->dim.k==1) ){ + // I have reduced the system to 1 equation and 1 unknown and know D^{-1} exactly + // therefore A^{-1} == D^{-1} = 1/a00 + // u = A^{-1}f == D^{-1}f + mul_vectors(level,u_id,1.0,VECTOR_DINV,f_id); // u = A^{-1}f = D^{-1}f + if(level->must_subtract_mean == 1){ + double mean_of_u = mean(level,u_id); + shift_vector(level,u_id,u_id,-mean_of_u); + } + return; + } + #endif + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef USE_BICGSTAB + BiCGStab(level,u_id,f_id,a,b,desired_reduction_in_norm); + #elif USE_CG + CG(level,u_id,f_id,a,b,desired_reduction_in_norm); + #elif USE_CABICGSTAB + CABiCGStab(level,u_id,f_id,a,b,desired_reduction_in_norm); + #elif USE_CACG + CACG(level,u_id,f_id,a,b,desired_reduction_in_norm); + #else + // just point relaxation via multiple smooth()'s + if(level->must_subtract_mean == 1){ + double mean_of_u = mean(level,u_id); + shift_vector(level,u_id,u_id,-mean_of_u); + } + residual(level,VECTOR_TEMP,u_id,f_id,a,b); + //mul_vectors(level,VECTOR_TEMP,1.0,VECTOR_TEMP,VECTOR_DINV); // Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria... + double norm_of_r0 = norm(level,VECTOR_TEMP); + int s=0,maxSmoothsBottom=200,converged=0; + while( (sKrylov_iterations++; + smooth(level,u_id,f_id,a,b); + if(level->must_subtract_mean == 1){ + double mean_of_u = mean(level,u_id); + shift_vector(level,u_id,u_id,-mean_of_u); + } + residual(level,VECTOR_TEMP,u_id,f_id,a,b); + //mul_vectors(level,VECTOR_TEMP,1.0,VECTOR_TEMP,VECTOR_DINV); // Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria... 
//------------------------------------------------------------------------------------------------------------------------------
// IterativeSolver_NumVectors
//   Returns the number of additional vectors required by the iterative solver selected at compile time
//   (USE_BICGSTAB, USE_CG, USE_CABICGSTAB or USE_CACG).  When none of them is selected the solver just
//   performs multiple smooths, which needs no extra vectors, and 0 is returned.
int IterativeSolver_NumVectors(){
  #ifdef USE_BICGSTAB
  return(8);                  // BiCGStab requires additional vectors r0,r,p,q,s,t,Ap,As
  #elif USE_CG
  return(5);                  // CG requires extra vectors r0,r,p,Ap,z
  #elif USE_CABICGSTAB
  return(4+4*CA_KRYLOV_S);    // CABiCGStab requires additional vectors rt,p,r,P[2s+1],R[2s].
  #elif USE_CACG
  return(4+2*CA_KRYLOV_S);    // CACG requires additional vectors r0,p,r,P[s+1],R[s].
  #else
  return(0);                  // simply doing multiple smooths requires no extra vectors
  #endif
}
+//------------------------------------------------------------------------------------------------------------------------------ +#endif diff --git a/Util/hpgmg/finite-volume/source/solvers/bicgstab.c b/Util/hpgmg/finite-volume/source/solvers/bicgstab.c new file mode 100644 index 00000000..38b4f063 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/solvers/bicgstab.c @@ -0,0 +1,97 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#define KRYLOV_DIAGONAL_PRECONDITION +//------------------------------------------------------------------------------------------------------------------------------ +void BiCGStab(level_type * level, int x_id, int R_id, double a, double b, double desired_reduction_in_norm){ + // Algorithm 7.7 in Iterative Methods for Sparse Linear Systems(Yousef Saad) + // Algorithm 1 in Analysis and Practical use of Flexible BiCGStab (Jie Chen) + int r0_id = VECTORS_RESERVED+0; + int r_id = VECTORS_RESERVED+1; + int p_id = VECTORS_RESERVED+2; + int q_id = VECTORS_RESERVED+3; // q = D^{-1}p + int s_id = VECTORS_RESERVED+4; + int t_id = VECTORS_RESERVED+5; // t = D^{-1}s + int Ap_id = VECTORS_RESERVED+6; + int As_id = VECTORS_RESERVED+7; + + int jMax=200; + int j=0; + int BiCGStabFailed = 0; + int BiCGStabConverged = 0; + residual(level,r0_id,x_id,R_id,a,b); // r0[] = R_id[] - A(x_id) + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(level->must_subtract_mean == 1){ + double mean_of_r0 = mean(level,r0_id); + 
shift_vector(level,r0_id,r0_id,-mean_of_r0); + } + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + scale_vector(level,r_id,1.0,r0_id); // r[] = r0[] + scale_vector(level,p_id,1.0,r0_id); // p[] = r0[] + double r_dot_r0 = dot(level,r_id,r0_id); // r_dot_r0 = dot(r,r0) + double norm_of_r0 = norm(level,r_id); // the norm of the initial residual... + if(r_dot_r0 == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution + if(norm_of_r0 == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution + while( (jKrylov_iterations++; // + #ifdef KRYLOV_DIAGONAL_PRECONDITION // + mul_vectors(level,q_id,1.0,VECTOR_DINV,p_id); // q[] = Dinv[]*p[] + #else // + scale_vector(level,q_id,1.0,p_id); // q[] = p[] + #endif // + apply_op(level,Ap_id,q_id,a,b); // Ap[] = AM^{-1}(p) + double Ap_dot_r0 = dot(level,Ap_id,r0_id); // Ap_dot_r0 = dot(Ap,r0) + if(Ap_dot_r0 == 0.0){BiCGStabFailed=1;break;} // pivot breakdown ??? + double alpha = r_dot_r0 / Ap_dot_r0; // alpha = r_dot_r0 / Ap_dot_r0 + if(isinf(alpha)){BiCGStabFailed=2;break;} // pivot breakdown ??? + add_vectors(level,x_id,1.0,x_id, alpha, q_id); // x_id[] = x_id[] + alpha*q[] + add_vectors(level,s_id,1.0,r_id,-alpha,Ap_id); // s[] = r[] - alpha*Ap[] (intermediate residual?) + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(level->must_subtract_mean == 1){ + double mean_of_s = mean(level,s_id); + shift_vector(level,s_id,s_id,-mean_of_s); + } + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + double norm_of_s = norm(level,s_id); // FIX - redundant?? norm of intermediate residual + if(norm_of_s == 0.0){BiCGStabConverged=1;break;} // FIX - redundant?? 
if As_dot_As==0, then As must be 0 which implies s==0 + if(norm_of_s < desired_reduction_in_norm*norm_of_r0){BiCGStabConverged=1;break;} + #ifdef KRYLOV_DIAGONAL_PRECONDITION // + mul_vectors(level,t_id,1.0,VECTOR_DINV,s_id); // t[] = Dinv[]*s[] + #else // + scale_vector(level,t_id,1.0,s_id); // t[] = s[] + #endif // + apply_op(level,As_id,t_id,a,b); // As = AM^{-1}(s) + double As_dot_As = dot(level,As_id,As_id); // As_dot_As = dot(As,As) + double As_dot_s = dot(level,As_id, s_id); // As_dot_s = dot(As, s) + if(As_dot_As == 0.0){BiCGStabConverged=1;break;} // converged ? + double omega = As_dot_s / As_dot_As; // omega = As_dot_s / As_dot_As + if(omega == 0.0){BiCGStabFailed=3;break;} // stabilization breakdown ??? + if(isinf(omega)){BiCGStabFailed=4;break;} // stabilization breakdown ??? + add_vectors(level,x_id,1.0,x_id, omega, t_id); // x_id[] = x_id[] + omega*t[] + add_vectors(level,r_id,1.0,s_id,-omega,As_id); // r[] = s[] - omega*As[] (recursively computed / updated residual) + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(level->must_subtract_mean == 1){ + double mean_of_r = mean(level,r_id); + shift_vector(level,r_id,r_id,-mean_of_r); + } + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + double norm_of_r = norm(level,r_id); // norm of recursively computed residual (good enough??) + if(norm_of_r == 0.0){BiCGStabConverged=1;break;} // + if(norm_of_r < desired_reduction_in_norm*norm_of_r0){BiCGStabConverged=1;break;} + double r_dot_r0_new = dot(level,r_id,r0_id); // r_dot_r0_new = dot(r,r0) + if(r_dot_r0_new == 0.0){BiCGStabFailed=5;break;} // Lanczos breakdown ??? + double beta = (r_dot_r0_new/r_dot_r0) * (alpha/omega); // beta = (r_dot_r0_new/r_dot_r0) * (alpha/omega) + if(isinf(beta)){BiCGStabFailed=6;break;} // ??? 
+ add_vectors(level,VECTOR_TEMP,1.0,p_id,-omega, Ap_id); // VECTOR_TEMP = (p[]-omega*Ap[]) + add_vectors(level, p_id,1.0,r_id, beta,VECTOR_TEMP); // p[] = r[] + beta*(p[]-omega*Ap[]) + r_dot_r0 = r_dot_r0_new; // r_dot_r0 = r_dot_r0_new (save old r_dot_r0) + } // } +} diff --git a/Util/hpgmg/finite-volume/source/solvers/cabicgstab.c b/Util/hpgmg/finite-volume/source/solvers/cabicgstab.c new file mode 100644 index 00000000..97527afa --- /dev/null +++ b/Util/hpgmg/finite-volume/source/solvers/cabicgstab.c @@ -0,0 +1,518 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +//#define KRYLOV_DIAGONAL_PRECONDITION +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef CA_KRYLOV_TELESCOPING +#define CA_KRYLOV_TELESCOPING +#endif +#ifndef CA_KRYLOV_S +#define CA_KRYLOV_S 4 +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "matmul.c" +//------------------------------------------------------------------------------------------------------------------------------ +// z[r] = alpha*A[r][c]*x[c]+beta*y[r] // [row][col] +// z[r] = alpha*A[r][c]*x[c]+beta*y[r] // [row][col] +#define gemv(z,alpha,A,x,beta,y,rows,cols) {int r,c;double sum;for(r=0;r<(rows);r++){sum=0.0;for(c=0;c<(cols);c++){sum+=(A)[r][c]*(x)[c];}(z)[r]=(alpha)*sum+(beta)*(y)[r];}} +static inline void axpy(double * z, double alpha, double * x, double beta, double * y, int n){ // 
z[n] = alpha*x[n]+beta*y[n] + int nn; + for(nn=0;nnmy_rank==0)fprintf(stderr,"m=%8d, norm =%0.20f\n",m,norm_of_rt); + #endif + if(norm_of_rt == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution + delta = dot(level,r_id,rt_id); // delta = dot(r,rt) + if(delta==0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution (square of L2 norm of r_id) + L2_norm_of_rt = sqrt(delta); + + int ca_krylov_s = 1; + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + while( (mCAKrylov_formations_of_G++; // Record the number of times CABiCGStab formed G[][] + matmul(level,Gg,PRrt,PRrt,4*ca_krylov_s+1,4*ca_krylov_s+2,1); + for(i=0,k=0;i<4*ca_krylov_s+1;i++){ // extract G[][] and g[] from Gg[] + for(j=0 ;j<4*ca_krylov_s+1;j++){G[i][j] = Gg[k++];} // first 4*ca_krylov_s+1 elements in each row go to G[][]. + g[i] = Gg[k++]; // last element in row goes to g[]. + } + + for(i=0;i<4*ca_krylov_s+1;i++)aj[i]=0.0;aj[ 0]=1.0; // initialized based on (3.26) + for(i=0;i<4*ca_krylov_s+1;i++)cj[i]=0.0;cj[2*ca_krylov_s+1]=1.0; // initialized based on (3.26) + for(i=0;i<4*ca_krylov_s+1;i++)ej[i]=0.0; // initialized based on (3.26) + + for(n=0;nKrylov_iterations++; // record number of inner-loop (j) iterations for comparison + gemv( Tpaj, 1.0, Tp, aj, 0.0, Tpaj,4*ca_krylov_s+1,4*ca_krylov_s+1); // T'aj + gemv( Tpcj, 1.0, Tp, cj, 0.0, Tpcj,4*ca_krylov_s+1,4*ca_krylov_s+1); // T'cj + gemv(Tppaj, 1.0,Tpp, aj, 0.0,Tppaj,4*ca_krylov_s+1,4*ca_krylov_s+1); // T''aj + g_dot_Tpaj = vdotv(g,Tpaj,4*ca_krylov_s+1); // (g,T'aj) + if(g_dot_Tpaj == 0.0){ // pivot breakdown ???
+ #ifdef VERBOSE // + if(level->my_rank==0){fprintf(stderr,"g_dot_Tpaj == 0.0\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + alpha = delta / g_dot_Tpaj; // delta / (g,T'aj) + if(isinf(alpha)){ // alpha = big/tiny(overflow) = inf -> breakdown + #ifdef VERBOSE // + if(level->my_rank==0){fprintf(stderr,"alpha == inf\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + #if 0 // seems to have accuracy problems in finite precision... + gemv(temp1,-alpha, G, Tpaj, 0.0,temp1,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp1[] = - alpha*GT'aj + gemv(temp1, 1.0, G, cj, 1.0,temp1,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp1[] = Gcj - alpha*GT'aj + gemv(temp2,-alpha, G,Tppaj, 0.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp2[] = − alpha*GT′′aj + gemv(temp2, 1.0, G, Tpcj, 1.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp2[] = GT′cj − alpha*GT′′aj + axpy(temp3, 1.0, Tpcj,-alpha,Tppaj,4*ca_krylov_s+1); // temp3[] = T′cj − alpha*T′′aj + omega_numerator = vdotv(temp3,temp1,4*ca_krylov_s+1); // (temp3,temp1) = ( T'cj-alpha*T''aj , Gcj-alpha*GT'aj ) + omega_denominator = vdotv(temp3,temp2,4*ca_krylov_s+1); // (temp3,temp2) = ( T′cj−alpha*T′′aj , GT′cj−alpha*GT′′aj ) + #else // better to change the order of operations Gx-Gy -> G(x-y) ...
(note, G is symmetric) + axpy(temp1, 1.0, Tpcj,-alpha,Tppaj,4*ca_krylov_s+1); // temp1[] = (T'cj - alpha*T''aj) + gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp2[] = G(T'cj - alpha*T''aj) + axpy(temp3, 1.0, cj,-alpha, Tpaj,4*ca_krylov_s+1); // temp3[] = cj - alpha*T'aj + omega_numerator = vdotv(temp3,temp2,4*ca_krylov_s+1); // (temp3,temp2) = ( ( cj - alpha*T'aj ) , G(T'cj - alpha*T''aj) ) + omega_denominator = vdotv(temp1,temp2,4*ca_krylov_s+1); // (temp1,temp2) = ( (T'cj - alpha*T''aj) , G(T'cj - alpha*T''aj) ) + #endif // + // NOTE: omega_numerator/omega_denominator can be 0/x or 0/0, but should never be x/0 + // If omega_numerator==0, and ||s||==0, then convergence, x=x+alpha*aj + // If omega_numerator==0, and ||s||!=0, then stabilization breakdown + + // !!! PARTIAL UPDATE OF ej MUST HAPPEN BEFORE THE CHECK ON OMEGA TO ENSURE FORWARD PROGRESS !!! + axpy( ej,1.0,ej, alpha, aj,4*ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + + // calculate the norm of Saad's vector 's' to check intra s-step convergence... + axpy(temp1, 1.0, cj,-alpha, Tpaj,4*ca_krylov_s+1); // temp1[] = cj - alpha*T'aj + gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp2[] = G(cj - alpha*T'aj) + L2_norm_of_s = vdotv(temp1,temp2,4*ca_krylov_s+1); // (temp1,temp2) = ( (cj - alpha*T'aj) , G(cj - alpha*T'aj) ) == square of L2 norm of s in exact arithmetic + if(L2_norm_of_s<0)L2_norm_of_s=0;else L2_norm_of_s=sqrt(L2_norm_of_s); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + #ifdef VERBOSE // + if(level->my_rank==0){fprintf(stderr,"m=%8d, norm(s)=%0.20f\n",m+n,L2_norm_of_s);} // + #endif // + if(L2_norm_of_s < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop + + + if(omega_denominator == 0.0){ // ??? 
breakdown + #ifdef VERBOSE // + if(level->my_rank==0){if(omega_denominator == 0.0)fprintf(stderr,"omega_denominator == 0.0\n");}// + #endif // + BiCGStabFailed=1;break; // + } // + omega = omega_numerator / omega_denominator; // + if(isinf(omega)){ // omega = big/tiny(overflow) = inf + #ifdef VERBOSE // + if(level->my_rank==0){if(isinf(omega))fprintf(stderr,"omega == inf\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + // !!! COMPLETE THE UPDATE OF ej & cj now that omega is known to be ok // + axpy( ej,1.0,ej, omega, cj,4*ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] + axpy( ej,1.0,ej,-omega*alpha, Tpaj,4*ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] - omega*alpha*T'aj[] + axpy( cj,1.0,cj, -omega, Tpcj,4*ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] + axpy( cj,1.0,cj, -alpha, Tpaj,4*ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + axpy( cj,1.0,cj, omega*alpha,Tppaj,4*ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + omega*alpha*T''aj[] + + + // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence...
+ gemv(temp1, 1.0, G, cj, 0.0,temp1,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp1[] = Gcj + cj_dot_Gcj = vdotv(cj,temp1,4*ca_krylov_s+1); // sqrt( (cj,Gcj) ) == L2 norm of the intermediate residual in exact arithmetic + L2_norm_of_residual = 0.0;if(cj_dot_Gcj>0)L2_norm_of_residual=sqrt(cj_dot_Gcj); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + #ifdef VERBOSE + if(level->my_rank==0){fprintf(stderr,"m=%8d, norm(r)=%0.20f (cj_dot_Gcj=%0.20e)\n",m+n,L2_norm_of_residual,cj_dot_Gcj);} + #endif + if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop + + + delta_next = vdotv( g,cj,4*ca_krylov_s+1); // (g,cj) + #ifdef VERBOSE // + if(level->my_rank==0){ // + if(isinf(delta_next) ){fprintf(stderr,"delta == inf\n");} // delta = big/tiny(overflow) = inf + if(delta_next == 0.0){fprintf(stderr,"delta == 0.0\n");} // Lanczos breakdown + if(omega_numerator == 0.0){fprintf(stderr,"omega_numerator == 0.0\n");} // stabilization breakdown + if(omega == 0.0){fprintf(stderr,"omega == 0.0\n");} // stabilization breakdown + } // + #endif // + if(isinf(delta_next)){BiCGStabFailed =1;break;} // delta = inf? + if(delta_next ==0.0){BiCGStabFailed =1;break;} // Lanczos breakdown... + if(omega ==0.0){BiCGStabFailed =1;break;} // stabilization breakdown + beta = (delta_next/delta)*(alpha/omega); // (delta_next/delta)*(alpha/omega) + #ifdef VERBOSE // + if(level->my_rank==0){ // + if(isinf(beta) ){fprintf(stderr,"beta == inf\n");} // beta = inf? + if(beta == 0.0){fprintf(stderr,"beta == 0.0\n");} // beta = 0? can't make further progress(?) + } // + #endif // + if(isinf(beta) ){BiCGStabFailed =1;break;} // beta = inf? + if(beta == 0.0){BiCGStabFailed =1;break;} // beta = 0? can't make further progress(?) 
+ axpy( aj,1.0,cj, beta, aj,4*ca_krylov_s+1); // aj[] = cj[] + beta*aj[] + axpy( aj,1.0,aj, -omega*beta, Tpaj,4*ca_krylov_s+1); // aj[] = cj[] + beta*aj[] - omega*beta*T'aj + delta = delta_next; // delta = delta_next + + } // inner n (j) loop + + // update iterates... + for(i=0;i<4*ca_krylov_s+1;i++){add_vectors(level,e_id,1.0,e_id,ej[i],PRrt[i]);} // e_id[] = [P,R]ej + e_id[] + if(!BiCGStabFailed && !BiCGStabConverged){ // if we're done, then there is no point in updating these + add_vectors(level, p_id,0.0, p_id,aj[0],PRrt[0]); // p[] = [P,R]aj + for(i=1;i<4*ca_krylov_s+1;i++){add_vectors(level, p_id,1.0, p_id,aj[i],PRrt[i]);} // ... + add_vectors(level, r_id,0.0, r_id,cj[0],PRrt[0]); // r[] = [P,R]cj + for(i=1;i<4*ca_krylov_s+1;i++){add_vectors(level, r_id,1.0, r_id,cj[i],PRrt[i]);} // ... + } // + m+=ca_krylov_s; // m+=ca_krylov_s; + ca_krylov_s*=2;if(ca_krylov_s>CA_KRYLOV_S)ca_krylov_s=CA_KRYLOV_S; + } // } // outer m loop + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef KRYLOV_DIAGONAL_PRECONDITION + mul_vectors(level,e_id,1.0, VECTOR_DINV,e_id); // e_id[] = Dinv[]*e_id[] // i.e. e = D^{-1}e' + #endif + +} +//------------------------------------------------------------------------------------------------------------------------------ +#else // CA_KRYLOV_TELESCOPING =0 +void CABiCGStab(level_type * level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){ + // based on Erin Carson/Jim Demmel/Nick Knight's s-Step BiCGStab Algorithm 3.4 + int rt_id = VECTORS_RESERVED+0; + int r_id = VECTORS_RESERVED+1; + int p_id = VECTORS_RESERVED+2; + int PRrt_id = VECTORS_RESERVED+3; + + // note: CA_KRYLOV_S should be tiny (2-8?). As such, 4*CA_KRYLOV_S+1 is also tiny (9-33). Just allocate on the stack... 
+ double temp1[4*CA_KRYLOV_S+1]; // + double temp2[4*CA_KRYLOV_S+1]; // + double temp3[4*CA_KRYLOV_S+1]; // + double Tp[4*CA_KRYLOV_S+1][4*CA_KRYLOV_S+1]; // T' indexed as [row][col] + double Tpp[4*CA_KRYLOV_S+1][4*CA_KRYLOV_S+1]; // T'' indexed as [row][col] + double aj[4*CA_KRYLOV_S+1]; // + double cj[4*CA_KRYLOV_S+1]; // + double ej[4*CA_KRYLOV_S+1]; // + double Tpaj[4*CA_KRYLOV_S+1]; // + double Tpcj[4*CA_KRYLOV_S+1]; // + double Tppaj[4*CA_KRYLOV_S+1]; // + double G[4*CA_KRYLOV_S+1][4*CA_KRYLOV_S+1]; // extracted from first 4*CA_KRYLOV_S+1 columns of Gg[][]. indexed as [row][col] + double g[4*CA_KRYLOV_S+1]; // extracted from last [4*CA_KRYLOV_S+1] column of Gg[][]. + double Gg[(4*CA_KRYLOV_S+1)*(4*CA_KRYLOV_S+2)]; // buffer to hold the Gram-like matrix produced by matmul(). indexed as [row*(4*CA_KRYLOV_S+2) + col] + int PRrt[4*CA_KRYLOV_S+2]; // vector_id's of the concatenation of the 2S+1 matrix powers of P, 2S matrix powers of R, and rt + int *P = PRrt+ 0; // vector_id's of the 2S+1 Matrix Powers of P. P[i] is the vector_id of A^i(p) + int *R = PRrt+2*CA_KRYLOV_S+1; // vector_id's of the 2S Matrix Powers of R. R[i] is the vector_id of A^i(r) + + int mMax=200; + int m=0,n; + int i,j,k; + int BiCGStabFailed = 0; + int BiCGStabConverged = 0; + double g_dot_Tpaj,alpha,omega_numerator,omega_denominator,omega,delta,delta_next,beta; + double L2_norm_of_rt,L2_norm_of_residual,cj_dot_Gcj,L2_norm_of_s; + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + residual(level,rt_id,e_id,R_id,a,b); // rt[] = R_id[] - A(e_id)... note, if DPC, then rt = R-AD^-1De + scale_vector(level,r_id,1.0,rt_id); // r[] = rt[] + scale_vector(level, p_id,1.0,rt_id); // p[] = rt[] + double norm_of_rt = norm(level,rt_id); // the norm of the initial residual... 
+ #ifdef VERBOSE + if(level->my_rank==0)fprintf(stderr,"m=%8d, norm =%0.20f\n",m,norm_of_rt); + #endif + if(norm_of_rt == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution + delta = dot(level,r_id,rt_id); // delta = dot(r,rt) + if(delta==0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution (square of L2 norm of r_id) + L2_norm_of_rt = sqrt(delta); + + int ca_krylov_s = CA_KRYLOV_S; // by making this a variable, I prevent the compiler from optimizing more than the telescoping version, thus preserving a bit-identical result + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + for(i=0;i<4*ca_krylov_s+1;i++)for(j=0;j<4*ca_krylov_s+1;j++) Tp[i][j]=0; // initialize Tp[][] and Tpp[][] ... + for(i=0;i<4*ca_krylov_s+1;i++)for(j=0;j<4*ca_krylov_s+1;j++)Tpp[i][j]=0; // + for(i= 0;i<2*ca_krylov_s ;i++){ Tp[i+1][i]=1;} // monomial basis... Fixed (typo in SIAM paper) + for(i=2*ca_krylov_s+1;i<4*ca_krylov_s ;i++){ Tp[i+1][i]=1;} // + for(i= 0;i<2*ca_krylov_s-1;i++){Tpp[i+2][i]=1;} // + for(i=2*ca_krylov_s+1;i<4*ca_krylov_s-1;i++){Tpp[i+2][i]=1;} // + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + for(i=0;i<4*ca_krylov_s+1;i++){PRrt[ i] = PRrt_id+i;} // columns of PRrt map to the consecutive spare grid indices starting at PRrt_id + PRrt[4*ca_krylov_s+1] = rt_id; // last column of PRrt (r tilde) maps to rt + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + while( (mCAKrylov_formations_of_G++; // Record the number of times CABiCGStab formed G[][] + matmul(level,Gg,PRrt,PRrt,4*ca_krylov_s+1,4*ca_krylov_s+2,1); + for(i=0,k=0;i<4*ca_krylov_s+1;i++){ // extract G[][] and g[] from Gg[] + for(j=0 ;j<4*ca_krylov_s+1;j++){G[i][j] = Gg[k++];} // first 4*ca_krylov_s+1 elements in each row go
to G[][]. + g[i] = Gg[k++]; // last element in row goes to g[]. + } + + for(i=0;i<4*ca_krylov_s+1;i++)aj[i]=0.0;aj[ 0]=1.0; // initialized based on (3.26) + for(i=0;i<4*ca_krylov_s+1;i++)cj[i]=0.0;cj[2*ca_krylov_s+1]=1.0; // initialized based on (3.26) + for(i=0;i<4*ca_krylov_s+1;i++)ej[i]=0.0; // initialized based on (3.26) + + for(n=0;nKrylov_iterations++; // record number of inner-loop (j) iterations for comparison + gemv( Tpaj, 1.0, Tp, aj, 0.0, Tpaj,4*ca_krylov_s+1,4*ca_krylov_s+1); // T'aj + gemv( Tpcj, 1.0, Tp, cj, 0.0, Tpcj,4*ca_krylov_s+1,4*ca_krylov_s+1); // T'cj + gemv(Tppaj, 1.0,Tpp, aj, 0.0,Tppaj,4*ca_krylov_s+1,4*ca_krylov_s+1); // T''aj + g_dot_Tpaj = vdotv(g,Tpaj,4*ca_krylov_s+1); // (g,T'aj) + if(g_dot_Tpaj == 0.0){ // pivot breakdown ??? + #ifdef VERBOSE // + if(level->my_rank==0){fprintf(stderr,"g_dot_Tpaj == 0.0\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + alpha = delta / g_dot_Tpaj; // delta / (g,T'aj) + if(isinf(alpha)){ // alpha = big/tiny(overflow) = inf -> breakdown + #ifdef VERBOSE // + if(level->my_rank==0){fprintf(stderr,"alpha == inf\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + #if 0 // seems to have accuracy problems in finite precision... 
+ gemv(temp1,-alpha, G, Tpaj, 0.0,temp1,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp1[] = - alpha*GT'aj + gemv(temp1, 1.0, G, cj, 1.0,temp1,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp1[] = Gcj - alpha*GT'aj + gemv(temp2,-alpha, G,Tppaj, 0.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp2[] = − alpha*GT′′aj + gemv(temp2, 1.0, G, Tpcj, 1.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp2[] = GT′cj − alpha*GT′′aj + axpy(temp3, 1.0, Tpcj,-alpha,Tppaj,4*ca_krylov_s+1); // temp3[] = T′cj − alpha*T′′aj + omega_numerator = vdotv(temp3,temp1,4*ca_krylov_s+1); // (temp3,temp1) = ( T'cj-alpha*T''aj , Gcj-alpha*GT'aj ) + omega_denominator = vdotv(temp3,temp2,4*ca_krylov_s+1); // (temp3,temp2) = ( T′cj−alpha*T′′aj , GT′cj−alpha*GT′′aj ) + #else // better to change the order of operations Gx-Gy -> G(x-y) ... (note, G is symmetric) + axpy(temp1, 1.0, Tpcj,-alpha,Tppaj,4*ca_krylov_s+1); // temp1[] = (T'cj - alpha*T''aj) + gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp2[] = G(T'cj - alpha*T''aj) + axpy(temp3, 1.0, cj,-alpha, Tpaj,4*ca_krylov_s+1); // temp3[] = cj - alpha*T'aj + omega_numerator = vdotv(temp3,temp2,4*ca_krylov_s+1); // (temp3,temp2) = ( ( cj - alpha*T'aj ) , G(T'cj - alpha*T''aj) ) + omega_denominator = vdotv(temp1,temp2,4*ca_krylov_s+1); // (temp1,temp2) = ( (T'cj - alpha*T''aj) , G(T'cj - alpha*T''aj) ) + #endif // + // NOTE: omega_numerator/omega_denominator can be 0/x or 0/0, but should never be x/0 + // If omega_numerator==0, and ||s||==0, then convergence, x=x+alpha*aj + // If omega_numerator==0, and ||s||!=0, then stabilization breakdown + + // !!! PARTIAL UPDATE OF ej MUST HAPPEN BEFORE THE CHECK ON OMEGA TO ENSURE FORWARD PROGRESS !!! + axpy( ej,1.0,ej, alpha, aj,4*ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + + // calculate the norm of Saad's vector 's' to check intra s-step convergence... 
+ axpy(temp1, 1.0, cj,-alpha, Tpaj,4*ca_krylov_s+1); // temp1[] = cj - alpha*T'aj + gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp2[] = G(cj - alpha*T'aj) + L2_norm_of_s = vdotv(temp1,temp2,4*ca_krylov_s+1); // (temp1,temp2) = ( (cj - alpha*T'aj) , G(cj - alpha*T'aj) ) == square of L2 norm of s in exact arithmetic + if(L2_norm_of_s<0)L2_norm_of_s=0;else L2_norm_of_s=sqrt(L2_norm_of_s); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + #ifdef VERBOSE // + if(level->my_rank==0){fprintf(stderr,"m=%8d, norm(s)=%0.20f\n",m+n,L2_norm_of_s);} // + #endif // + if(L2_norm_of_s < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop + + + if(omega_denominator == 0.0){ // ??? breakdown + #ifdef VERBOSE // + if(level->my_rank==0){if(omega_denominator == 0.0)fprintf(stderr,"omega_denominator == 0.0\n");}// + #endif // + BiCGStabFailed=1;break; // + } // + omega = omega_numerator / omega_denominator; // + if(isinf(omega)){ // omega = big/tiny(overflow) = inf + #ifdef VERBOSE // + if(level->my_rank==0){if(isinf(omega))fprintf(stderr,"omega == inf\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + // !!! COMPLETE THE UPDATE OF ej & cj now that omega is known to be ok // + axpy( ej,1.0,ej, omega, cj,4*ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] + axpy( ej,1.0,ej,-omega*alpha, Tpaj,4*ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] - omega*alpha*T'aj[] + axpy( cj,1.0,cj, -omega, Tpcj,4*ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] + axpy( cj,1.0,cj, -alpha, Tpaj,4*ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + axpy( cj,1.0,cj, omega*alpha,Tppaj,4*ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + omega*alpha*T''aj[] + + + // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence...
+ gemv(temp1, 1.0, G, cj, 0.0,temp1,4*ca_krylov_s+1,4*ca_krylov_s+1); // temp1[] = Gcj + cj_dot_Gcj = vdotv(cj,temp1,4*ca_krylov_s+1); // sqrt( (cj,Gcj) ) == L2 norm of the intermediate residual in exact arithmetic + L2_norm_of_residual = 0.0;if(cj_dot_Gcj>0)L2_norm_of_residual=sqrt(cj_dot_Gcj); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + #ifdef VERBOSE + if(level->my_rank==0){fprintf(stderr,"m=%8d, norm(r)=%0.20f (cj_dot_Gcj=%0.20e)\n",m+n,L2_norm_of_residual,cj_dot_Gcj);} + #endif + if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop + + + delta_next = vdotv( g,cj,4*ca_krylov_s+1); // (g,cj) + #ifdef VERBOSE // + if(level->my_rank==0){ // + if(isinf(delta_next) ){fprintf(stderr,"delta == inf\n");} // delta = big/tiny(overflow) = inf + if(delta_next == 0.0){fprintf(stderr,"delta == 0.0\n");} // Lanczos breakdown + if(omega_numerator == 0.0){fprintf(stderr,"omega_numerator == 0.0\n");} // stabilization breakdown + if(omega == 0.0){fprintf(stderr,"omega == 0.0\n");} // stabilization breakdown + } // + #endif // + if(isinf(delta_next)){BiCGStabFailed =1;break;} // delta = inf? + if(delta_next ==0.0){BiCGStabFailed =1;break;} // Lanczos breakdown... + if(omega ==0.0){BiCGStabFailed =1;break;} // stabilization breakdown + beta = (delta_next/delta)*(alpha/omega); // (delta_next/delta)*(alpha/omega) + #ifdef VERBOSE // + if(level->my_rank==0){ // + if(isinf(beta) ){fprintf(stderr,"beta == inf\n");} // beta = inf? + if(beta == 0.0){fprintf(stderr,"beta == 0.0\n");} // beta = 0? can't make further progress(?) + } // + #endif // + if(isinf(beta) ){BiCGStabFailed =1;break;} // beta = inf? + if(beta == 0.0){BiCGStabFailed =1;break;} // beta = 0? can't make further progress(?) 
+ axpy( aj,1.0,cj, beta, aj,4*ca_krylov_s+1); // aj[] = cj[] + beta*aj[] + axpy( aj,1.0,aj, -omega*beta, Tpaj,4*ca_krylov_s+1); // aj[] = cj[] + beta*aj[] - omega*beta*T'aj + delta = delta_next; // delta = delta_next + + } // inner n (j) loop + + // update iterates... + for(i=0;i<4*ca_krylov_s+1;i++){add_vectors(level,e_id,1.0,e_id,ej[i],PRrt[i]);} // e_id[] = [P,R]ej + e_id[] + if(!BiCGStabFailed && !BiCGStabConverged){ // if we're done, then there is no point in updating these + add_vectors(level, p_id,0.0, p_id,aj[0],PRrt[0]); // p[] = [P,R]aj + for(i=1;i<4*ca_krylov_s+1;i++){add_vectors(level, p_id,1.0, p_id,aj[i],PRrt[i]);} // ... + add_vectors(level, r_id,0.0, r_id,cj[0],PRrt[0]); // r[] = [P,R]cj + for(i=1;i<4*ca_krylov_s+1;i++){add_vectors(level, r_id,1.0, r_id,cj[i],PRrt[i]);} // ... + } // + m+=ca_krylov_s; // m+=ca_krylov_s; + } // } // outer m loop + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef KRYLOV_DIAGONAL_PRECONDITION + mul_vectors(level,e_id,1.0, VECTOR_DINV,e_id); // e_id[] = Dinv[]*e_id[] // i.e. 
e = D^{-1}e' + #endif +} +#endif // CA_KRYLOV_TELESCOPING +//------------------------------------------------------------------------------------------------------------------------------ diff --git a/Util/hpgmg/finite-volume/source/solvers/cacg.c b/Util/hpgmg/finite-volume/source/solvers/cacg.c new file mode 100644 index 00000000..61449226 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/solvers/cacg.c @@ -0,0 +1,170 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef CA_KRYLOV_S +#define CA_KRYLOV_S 4 +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "matmul.c" +//------------------------------------------------------------------------------------------------------------------------------ +// z[r] = alpha*A[r][c]*x[c]+beta*y[r] // [row][col] +// z[r] = alpha*A[r][c]*x[c]+beta*y[r] // [row][col] +#define gemv(z,alpha,A,x,beta,y,rows,cols) {int r,c;double sum;for(r=0;r<(rows);r++){sum=0.0;for(c=0;c<(cols);c++){sum+=(A)[r][c]*(x)[c];}(z)[r]=(alpha)*sum+(beta)*(y)[r];}} +static inline void axpy(double * z, double alpha, double * x, double beta, double * y, int n){ // z[n] = alpha*x[n]+beta*y[n] + int nn; + for(nn=0;nnCAKrylov_formations_of_G++; // Record the number of times CACG formed G[][] + matmul(level,Gbuf,PR,PR,2*CA_KRYLOV_S+1,2*CA_KRYLOV_S+1,1); // Compute Gbuf[][] = [P,R]^T * [P,R] (Matmul with grids but only one MPI_AllReduce) + for(i=0,k=0;i<2*CA_KRYLOV_S+1;i++){ // extract G[][] from 
Gbuf[] + for(j=0 ;j<2*CA_KRYLOV_S+1;j++){G[i][j] = Gbuf[k++];} // first 2*CA_KRYLOV_S+1 elements in each row go to G[][]. + } + + + for(i=0;i<2*CA_KRYLOV_S+1;i++)aj[i]=0.0;aj[ 0]=1.0; // initialized based on (???) + for(i=0;i<2*CA_KRYLOV_S+1;i++)cj[i]=0.0;cj[CA_KRYLOV_S+1]=1.0; // initialized based on (???) + for(i=0;i<2*CA_KRYLOV_S+1;i++)ej[i]=0.0; // initialized based on (???) + + for(n=0;nKrylov_iterations++; // record number of inner-loop (j) iterations for comparison + gemv( Tpaj,1.0,Tp, aj,0.0, Tpaj,2*CA_KRYLOV_S+1,2*CA_KRYLOV_S+1); // T'aj + gemv(temp1,1.0, G,Tpaj,0.0,temp1,2*CA_KRYLOV_S+1,2*CA_KRYLOV_S+1); // temp1[] = GT'aj + gemv(temp2,1.0, G, cj,0.0,temp2,2*CA_KRYLOV_S+1,2*CA_KRYLOV_S+1); // temp2[] = Gcj + aj_dot_GTpaj = vdotv(aj,temp1,2*CA_KRYLOV_S+1); // (aj,GT'aj) + cj_dot_Gcj = vdotv(cj,temp2,2*CA_KRYLOV_S+1); // (cj, Gcj) + // FIX, can cj_dot_Gcj ever be zero ? + if(aj_dot_GTpaj == 0.0){ // pivot breakdown ??? + CGFailed=1;break; // + } // + alpha = cj_dot_Gcj / aj_dot_GTpaj; // alpha = (cj,Gcj) / (aj,GT'aj) + if(isinf(alpha)){ // alpha = big/tiny(overflow) = inf -> breakdown + CGFailed=1;break; // + } // + axpy( ej,1.0,ej, alpha, aj,2*CA_KRYLOV_S+1); // ej[] = ej[] + alpha*aj[] + axpy( cj,1.0,cj, -alpha, Tpaj,2*CA_KRYLOV_S+1); // cj[] = cj[] - alpha*T'*aj[] + gemv(temp2,1.0, G, cj,0.0,temp2,2*CA_KRYLOV_S+1,2*CA_KRYLOV_S+1); // temp2[] = Gcj + cj_dot_Gcj_new = vdotv(cj,temp2,2*CA_KRYLOV_S+1); // (cj, Gcj) + // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence... == cj_dot_Gcj_new?? + L2_norm_of_residual = 0.0;if(cj_dot_Gcj_new>0)L2_norm_of_residual=sqrt(cj_dot_Gcj_new); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_r0){CGConverged=1;break;} // terminate the inner n-loop + if(cj_dot_Gcj_new == 0.0){ // Lanczos breakdown ??? 
+ CGFailed=1;break; // + } // + beta = cj_dot_Gcj_new / cj_dot_Gcj; // + if(isinf(beta)){CGFailed=1;break;} // beta = inf? + if(beta == 0.0){CGFailed=1;break;} // beta = 0? can't make further progress(?) + axpy( aj,1.0,cj, beta, aj,2*CA_KRYLOV_S+1); // aj[] = cj[] + beta*aj[] + + } // inner n (j) loop + + // update iterates... + for(i=0;i<2*CA_KRYLOV_S+1;i++){add_vectors(level,e_id,1.0,e_id,ej[i],PR[i]);} // e_id[] = [P,R]ej + e_id[] + if(!CGFailed && !CGConverged){ // if we're done, then there is no point in updating these + add_vectors(level, p_id,0.0, p_id,aj[0],PR[0]); // p[] = [P,R]aj + for(i=1;i<2*CA_KRYLOV_S+1;i++){add_vectors(level, p_id,1.0, p_id,aj[i],PR[i]);} // ... + add_vectors(level, r_id,0.0, r_id,cj[0],PR[0]); // r[] = [P,R]cj + for(i=1;i<2*CA_KRYLOV_S+1;i++){add_vectors(level, r_id,1.0, r_id,cj[i],PR[i]);} // ... + } + m+=CA_KRYLOV_S; // m+=CA_KRYLOV_S; + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + } // } // outer m loop + +} diff --git a/Util/hpgmg/finite-volume/source/solvers/cg.c b/Util/hpgmg/finite-volume/source/solvers/cg.c new file mode 100644 index 00000000..eaa386a3 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/solvers/cg.c @@ -0,0 +1,73 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#define KRYLOV_DIAGONAL_PRECONDITION +//------------------------------------------------------------------------------------------------------------------------------ +void CG(level_type * level, int x_id,
int R_id, double a, double b, double desired_reduction_in_norm){ + // Algorithm 9.1 in Iterative Methods for Sparse Linear Systems(Yousef Saad) + int r0_id = VECTORS_RESERVED+0; + int r_id = VECTORS_RESERVED+1; + int p_id = VECTORS_RESERVED+2; + int Ap_id = VECTORS_RESERVED+3; + int z_id = VECTORS_RESERVED+4; + + int jMax=200; + int j=0; + int CGFailed = 0; + int CGConverged = 0; + residual(level,r0_id,x_id,R_id,a,b); // r0[] = R_id[] - A(x_id) + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(level->must_subtract_mean == 1){ + double mean_of_r0 = mean(level,r0_id); + shift_vector(level,r0_id,r0_id,-mean_of_r0); + } + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + scale_vector(level,r_id,1.0,r0_id); // r[] = r0[] + #ifdef KRYLOV_DIAGONAL_PRECONDITION // + mul_vectors(level,z_id,1.0,VECTOR_DINV,r0_id); // z[] = Dinv[]*r0[] + #else // + scale_vector(level,z_id,1.0,r0_id); // z[] = I*r0[] + #endif // + scale_vector(level,p_id,1.0,z_id); // p[] = z[] + double norm_of_r0 = norm(level,r_id); // the norm of the initial residual... + if(norm_of_r0 == 0.0){CGConverged=1;} // entered CG with exact solution + double r_dot_z = dot(level,r_id,z_id); // r_dot_z = dot(r,z) + while( (jKrylov_iterations++; // + apply_op(level,Ap_id,p_id,a,b); // Ap[] = A(p) + double Ap_dot_p = dot(level,Ap_id,p_id); // Ap_dot_p = dot(Ap,p) + if(Ap_dot_p == 0.0){CGFailed=1;break;} // pivot breakdown ??? + double alpha = r_dot_z / Ap_dot_p; // alpha = r_dot_z / Ap_dot_p + if(isinf(alpha)){CGFailed=1;break;} // ??? + add_vectors(level,x_id,1.0,x_id, alpha,p_id ); // x_id[] = x_id[] + alpha*p[] + add_vectors(level,r_id,1.0,r_id,-alpha,Ap_id); // r[] = r[] - alpha*Ap[] (intermediate residual?) 
+ //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(level->must_subtract_mean == 1){ + double mean_of_r = mean(level,r_id); + shift_vector(level,r_id,r_id,-mean_of_r); + } + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + double norm_of_r = norm(level,r_id); // norm of intermediate residual + if(norm_of_r == 0.0){CGConverged=1;break;} // + if(norm_of_r < desired_reduction_in_norm*norm_of_r0){CGConverged=1;break;} // + #ifdef KRYLOV_DIAGONAL_PRECONDITION // + mul_vectors(level,z_id,1.0,VECTOR_DINV,r_id); // z[] = Dinv[]*r[] + #else // + scale_vector(level,z_id,1.0,r_id); // z[] = I*r[] + #endif // + double r_dot_z_new = dot(level,r_id,z_id); // r_dot_z_new = dot(r_{j+1},z_{j+1}) + if(r_dot_z_new == 0.0){CGFailed=1;break;} // Lanczos breakdown ??? + double beta = (r_dot_z_new/r_dot_z); // beta = (r_dot_z_new/r_dot_z) + if(isinf(beta)){CGFailed=1;break;} // ??? 
+ add_vectors(level,p_id,1.0,z_id,beta,p_id ); // p[] = z[] + beta*p[] + r_dot_z = r_dot_z_new; // r_dot_r = r_dot_r_new (save old r_dot_r) + } // } +} diff --git a/Util/hpgmg/finite-volume/source/solvers/matmul.c b/Util/hpgmg/finite-volume/source/solvers/matmul.c new file mode 100644 index 00000000..37883d55 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/solvers/matmul.c @@ -0,0 +1,64 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +void matmul(level_type * level, double *C, int * id_A, int * id_B, int rows, int cols, int A_equals_B_transpose){ + // *id_A = m vector_id's (conceptually pointers to the rows of a m x level->num_my_boxes*volume matrix) + // *id_B = n vector_id's (conceptually pointers to the columns of a level->num_my_boxes*volume matrix x n) + // *C is a mxn matrix where C[rows][cols] = dot(id_A[rows],id_B[cols]) + + // FIX, id_A and id_B are likely the same and thus C[][] will be symmetric (modulo missing row?) + // if(A_equals_B_transpose && (cols>=rows)) then use id_B and only run for nn>=mm // common case for s-step Krylov methods + // C_is_symmetric && cols< rows (use id_A) + int mm,nn; + + + double _timeStart = getTime(); + // FIX... 
rather than performing an all_reduce on the essentially symmetric [G,g], do the all_reduce on the upper triangle and then duplicate (saves BW) + #ifdef _OPENMP + #pragma omp parallel for schedule(static,1) collapse(2) + #endif + for(mm=0;mm=mm){ // upper triangular + int box; + double a_dot_b_level = 0.0; + for(box=0;boxnum_my_boxes;box++){ + int i,j,k; + const int jStride = level->my_boxes[box].jStride; + const int kStride = level->my_boxes[box].kStride; + const int ghosts = level->my_boxes[box].ghosts; + const int dim = level->my_boxes[box].dim; + double * __restrict__ grid_a = level->my_boxes[box].vectors[id_A[mm]] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point + double * __restrict__ grid_b = level->my_boxes[box].vectors[id_B[nn]] + ghosts*(1+jStride+kStride); + double a_dot_b_box = 0.0; + for(k=0;ktimers.blas3 += (double)(getTime()-_timeStart); + + #ifdef USE_MPI + double *send_buffer = (double*)malloc(rows*cols*sizeof(double)); + for(mm=0;mmMPI_COMM_ALLREDUCE); + double _timeEndAllReduce = getTime(); + level->timers.collectives += (double)(_timeEndAllReduce-_timeStartAllReduce); + free(send_buffer); + #endif + +} + diff --git a/Util/hpgmg/finite-volume/source/timers.c b/Util/hpgmg/finite-volume/source/timers.c new file mode 100644 index 00000000..cec93e68 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/timers.c @@ -0,0 +1,14 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef _OPENMP +// getTime in OpenMP is now defined as a preprocessor macro +//#include "./timers/omp.c" +#elif USE_MPI +// getTime in MPI is now defined as a preprocessor macro +//#include "./timers/mpi.c" +#else +#include "./timers/x86.c" +#endif diff --git 
a/Util/hpgmg/finite-volume/source/timers.h b/Util/hpgmg/finite-volume/source/timers.h new file mode 100644 index 00000000..27384357 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/timers.h @@ -0,0 +1,25 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef TIMER_H +#define TIMER_H + + #include + + #ifdef _OPENMP + #include + #define getTime() (omp_get_wtime()) + + #elif USE_MPI + #include + #define getTime() (MPI_Wtime()) + + #else + // user must provide a function getTime and include it in timers.c + // if calibration is necesary, then the user must #define CALIBRATE_TIMER + double getTime(); + #endif + +#endif diff --git a/Util/hpgmg/finite-volume/source/timers/mpi.c b/Util/hpgmg/finite-volume/source/timers/mpi.c new file mode 100644 index 00000000..adc0970e --- /dev/null +++ b/Util/hpgmg/finite-volume/source/timers/mpi.c @@ -0,0 +1,10 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +double getTime(){ + return(MPI_Wtime()); // timers are in units of seconds; no conversion is necessary +} diff --git a/Util/hpgmg/finite-volume/source/timers/omp.c b/Util/hpgmg/finite-volume/source/timers/omp.c new file mode 100644 index 00000000..bdf453e6 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/timers/omp.c @@ -0,0 +1,10 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel 
Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +double getTime(){ + return(omp_get_wtime()); // timers are in units of seconds; no conversion is necessary +} diff --git a/Util/hpgmg/finite-volume/source/timers/x86.c b/Util/hpgmg/finite-volume/source/timers/x86.c new file mode 100644 index 00000000..a361b4b9 --- /dev/null +++ b/Util/hpgmg/finite-volume/source/timers/x86.c @@ -0,0 +1,12 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#define CALIBRATE_TIMER // mg.c will calibrate the timer to determine seconds per cycle +double getTime(){ + uint64_t lo, hi; + __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); + return( 1e-9*((double)( (((uint64_t)hi) << 32) | ((uint64_t)lo) )) ); // timers are in units of seconds; assume 1GHz cycle counter and convert later +}