diff --git a/Exec/DrivenTurbulence/GNUmakefile b/Exec/DrivenTurbulence/GNUmakefile
index 9aa34622..c3a2a381 100644
--- a/Exec/DrivenTurbulence/GNUmakefile
+++ b/Exec/DrivenTurbulence/GNUmakefile
@@ -1,5 +1,5 @@
 # AMREX_HOME defines the directory in which we will find all the BoxLib code
-AMREX_HOME ?= /project/projectdirs/nyx/src/amrex
+AMREX_HOME ?= ../../../amrex
 
 # TOP defines the directory in which we will find Source, Exec, etc
 TOP = ../..
diff --git a/Exec/DrivenTurbulence/heat_cool_stubs.f90 b/Exec/DrivenTurbulence/heat_cool_stubs.f90
index f7f956a1..75e1ca53 100644
--- a/Exec/DrivenTurbulence/heat_cool_stubs.f90
+++ b/Exec/DrivenTurbulence/heat_cool_stubs.f90
@@ -53,3 +53,12 @@ subroutine integrate_state(lo, hi, &
                                dx, time, a, half_dt)
 
 end subroutine integrate_state
+
+
+! Stub vode_aux_module for builds without heating/cooling: declares only z_vode
+module vode_aux_module
+  use amrex_fort_module, only : rt => amrex_real
+  implicit none
+
+  real(rt) :: z_vode   ! NOTE(review): presumably the current redshift -- confirm
+end module vode_aux_module
diff --git a/Exec/DrivenTurbulence/inputs b/Exec/DrivenTurbulence/inputs
index ab674a06..6b378880 100644
--- a/Exec/DrivenTurbulence/inputs
+++ b/Exec/DrivenTurbulence/inputs
@@ -84,7 +84,6 @@ amr.derive_plot_vars = magvort divu MachNumber
 amr.probin_file    = probin
 
 # DIAGNOSTICS & VERBOSITY
-nyx.show_timings   = 0       # show timings
 nyx.sum_interval   = 1       # timesteps between computing mass
 nyx.v              = 2       # verbosity in Castro.cpp
 amr.v              = 2       # verbosity in Amr.cpp
diff --git a/Exec/DrivenTurbulence/inputs.regtest b/Exec/DrivenTurbulence/inputs.regtest
index 172e04a7..0f470189 100644
--- a/Exec/DrivenTurbulence/inputs.regtest
+++ b/Exec/DrivenTurbulence/inputs.regtest
@@ -81,7 +81,6 @@ amr.derive_plot_vars = forcex forcey forcez magvort MachNumber
 amr.probin_file    = probin
 
 # DIAGNOSTICS & VERBOSITY
-nyx.show_timings   = 0       # show timings
 nyx.sum_interval   = 1       # timesteps between computing mass
 nyx.v              = 2       # verbosity in Castro.cpp
 amr.v              = 2       # verbosity in Amr.cpp
diff --git a/Exec/GravityTests/MacLaurin/GNUmakefile b/Exec/GravityTests/MacLaurin/GNUmakefile
index 2f332f68..f5430eef 100644
--- a/Exec/GravityTests/MacLaurin/GNUmakefile
+++ b/Exec/GravityTests/MacLaurin/GNUmakefile
@@ -1,5 +1,5 @@
 # AMREX_HOME defines the directory in which we will find all the BoxLib code
-AMREX_HOME ?= /project/projectdirs/nyx/src/amrex
+AMREX_HOME ?= ../../../../amrex
 
 # TOP defines the directory in which we will find Source, Exec, etc
 TOP = ../../..
diff --git a/Exec/HydroTests/DoubleRarefaction/GNUmakefile b/Exec/HydroTests/DoubleRarefaction/GNUmakefile
index d17a3b21..2e694fcd 100644
--- a/Exec/HydroTests/DoubleRarefaction/GNUmakefile
+++ b/Exec/HydroTests/DoubleRarefaction/GNUmakefile
@@ -1,5 +1,5 @@
 # AMREX_HOME defines the directory in which we will find all the BoxLib code
-AMREX_HOME ?= /project/projectdirs/nyx/src/amrex
+AMREX_HOME ?= ../../../../amrex
 
 # TOP defines the directory in which we will find Source, Exec, etc
 TOP = ../../..
@@ -11,7 +11,6 @@ USE_MPI = FALSE
 USE_OMP = FALSE
 
 PRECISION = DOUBLE
-DEBUG     = TRUE
 DEBUG     = FALSE
 
 # physics
diff --git a/Exec/HydroTests/Sedov/GNUmakefile b/Exec/HydroTests/Sedov/GNUmakefile
index a7170983..b645faf6 100644
--- a/Exec/HydroTests/Sedov/GNUmakefile
+++ b/Exec/HydroTests/Sedov/GNUmakefile
@@ -1,17 +1,16 @@
 # AMREX_HOME defines the directory in which we will find all the BoxLib code
-AMREX_HOME ?= /project/projectdirs/nyx/src/amrex
+AMREX_HOME ?= ../../../../amrex
 
 # TOP defines the directory in which we will find Source, Exec, etc
 TOP = ../../..
 
 # compilation options
-COMP    = gcc
+COMP    = intel
 
 USE_MPI = FALSE
 USE_OMP = FALSE
 
 PRECISION = DOUBLE
-DEBUG     = TRUE
 DEBUG     = FALSE
 
 # physics
diff --git a/Exec/HydroTests/Sod/GNUmakefile b/Exec/HydroTests/Sod/GNUmakefile
index d17a3b21..2e694fcd 100644
--- a/Exec/HydroTests/Sod/GNUmakefile
+++ b/Exec/HydroTests/Sod/GNUmakefile
@@ -1,5 +1,5 @@
 # AMREX_HOME defines the directory in which we will find all the BoxLib code
-AMREX_HOME ?= /project/projectdirs/nyx/src/amrex
+AMREX_HOME ?= ../../../../amrex
 
 # TOP defines the directory in which we will find Source, Exec, etc
 TOP = ../../..
@@ -11,7 +11,6 @@ USE_MPI = FALSE
 USE_OMP = FALSE
 
 PRECISION = DOUBLE
-DEBUG     = TRUE
 DEBUG     = FALSE
 
 # physics
diff --git a/Exec/HydroTests/StrongShockTube/GNUmakefile b/Exec/HydroTests/StrongShockTube/GNUmakefile
index d17a3b21..2e694fcd 100644
--- a/Exec/HydroTests/StrongShockTube/GNUmakefile
+++ b/Exec/HydroTests/StrongShockTube/GNUmakefile
@@ -1,5 +1,5 @@
 # AMREX_HOME defines the directory in which we will find all the BoxLib code
-AMREX_HOME ?= /project/projectdirs/nyx/src/amrex
+AMREX_HOME ?= ../../../../amrex
 
 # TOP defines the directory in which we will find Source, Exec, etc
 TOP = ../../..
@@ -11,7 +11,6 @@ USE_MPI = FALSE
 USE_OMP = FALSE
 
 PRECISION = DOUBLE
-DEBUG     = TRUE
 DEBUG     = FALSE
 
 # physics
diff --git a/Exec/HydroTests/TurbForce/GNUmakefile b/Exec/HydroTests/TurbForce/GNUmakefile
index 8c58c1b2..5787b11f 100644
--- a/Exec/HydroTests/TurbForce/GNUmakefile
+++ b/Exec/HydroTests/TurbForce/GNUmakefile
@@ -1,5 +1,5 @@
 # AMREX_HOME defines the directory in which we will find all the BoxLib code
-AMREX_HOME ?= /project/projectdirs/nyx/src/amrex
+AMREX_HOME ?= ../../../../amrex
 
 # TOP defines the directory in which we will find Source, Exec, etc
 TOP = ../../..
diff --git a/Exec/HydroTests/TurbForce/Nyx_setup.cpp b/Exec/HydroTests/TurbForce/Nyx_setup.cpp
index 6ccff42e..58a777b5 100644
--- a/Exec/HydroTests/TurbForce/Nyx_setup.cpp
+++ b/Exec/HydroTests/TurbForce/Nyx_setup.cpp
@@ -233,7 +233,7 @@ Nyx::hydro_setup()
          ppm_flatten_before_integrals,
          use_colglaz, use_flattening, corner_coupling, version_2,
          use_const_species, gamma, normalize_species,
-         heat_cool_type, ParallelDescriptor::Communicator());
+         heat_cool_type);
 
     if (use_const_species == 1)
         fort_set_eos_params(h_species, he_species);
@@ -664,7 +664,7 @@ Nyx::no_hydro_setup()
          ppm_flatten_before_integrals,
          use_colglaz, use_flattening, corner_coupling, version_2,
          use_const_species, gamma, normalize_species,
-         heat_cool_type, ParallelDescriptor::Communicator());
+         heat_cool_type);
 
     int coord_type = Geometry::Coord();
     fort_set_problem_params(dm, phys_bc.lo(), phys_bc.hi(), Outflow, Symmetry, coord_type);
diff --git a/Exec/LyA/32.nyx b/Exec/LyA/32.nyx
deleted file mode 100644
index 827a0059..00000000
Binary files a/Exec/LyA/32.nyx and /dev/null differ
diff --git a/Exec/LyA/64sssss_20mpc.nyx b/Exec/LyA/64sssss_20mpc.nyx
new file mode 100644
index 00000000..019faac7
Binary files /dev/null and b/Exec/LyA/64sssss_20mpc.nyx differ
diff --git a/Exec/LyA/GNUmakefile b/Exec/LyA/GNUmakefile
index e68c175b..99d40ed7 100644
--- a/Exec/LyA/GNUmakefile
+++ b/Exec/LyA/GNUmakefile
@@ -1,50 +1,35 @@
-# AMREX_HOME defines the directory in which we will find all the BoxLib code
-AMREX_HOME ?= /project/projectdirs/nyx/src/amrex
+# AMREX_HOME defines the directory in which we will find all the AMReX code
+AMREX_HOME ?= ../../../amrex
 
-HPGMG_DIR ?= /global/homes/f/friesen/hpgmg/finite-volume
+HPGMG_DIR ?= ../../Util/hpgmg/finite-volume
+CVODE_LIB_DIR ?= ../../../sundials/sundials-intel/lib
 
 # TOP defines the directory in which we will find Source, Exec, etc
 TOP = ../..
 
 # compilation options
-COMP    = gcc
+COMP    = intel
 USE_MPI = TRUE
 USE_OMP = TRUE
 
-PROFILE       = FALSE
+PROFILE       = TRUE
 TRACE_PROFILE = FALSE
 COMM_PROFILE  = FALSE
 
 PRECISION = DOUBLE
+USE_SINGLE_PRECISION_PARTICLES = TRUE
 DEBUG     = FALSE
 
 GIMLET = FALSE
 REEBER = FALSE
 
-GIMLET_DIR ?= /home/vince/Development/gimlet
-# Gimlet needs FFTW MPI.
-FFTW_INC ?= /usr/include
-FFTW_DIR ?= /usr/lib/x86_64-linux-gnu
-
-REEBER_HOME ?= /project/projectdirs/nyx/ghweber/reeber2
-# Reeber needs Boost (both headers and libraries) and diy2.
-BOOST_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/boost-1.61.0-noarch/include
-DIY_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/diy/include
-
-USE_HPGMG = FALSE
-HPGMG_FCYCLES = FALSE
-HPGMG_POST_F_CYCLE_TYPE = V
-HPGMG_HELMHOLTZ = FALSE
-HPGMG_STENCIL_VARIABLE_COEFFICIENT = FALSE
-HPGMG_USE_SUBCOMM = TRUE
-HPGMG_BOTTOM_SOLVER= BICGSTAB
-HPGMG_SMOOTHER = GSRB
+USE_HPGMG = TRUE
 
 # physics
 DIM      = 3
 USE_GRAV = TRUE
 USE_HEATCOOL = TRUE
-
+USE_AGN = FALSE
 USE_CVODE = FALSE
 
 Bpack := ./Make.package
diff --git a/Exec/LyA/Make.package b/Exec/LyA/Make.package
index ce986e64..13af1531 100644
--- a/Exec/LyA/Make.package
+++ b/Exec/LyA/Make.package
@@ -1,5 +1,2 @@
 f90EXE_sources += Prob_${DIM}d.f90
 f90EXE_sources += probdata.f90	
-ifeq ($(USE_CVODE), TRUE)
-  f90EXE_sources += fcvode_extras.f90
-endif
diff --git a/Exec/LyA/Prob_3d.f90 b/Exec/LyA/Prob_3d.f90
index f1727bea..2e82541f 100644
--- a/Exec/LyA/Prob_3d.f90
+++ b/Exec/LyA/Prob_3d.f90
@@ -65,10 +65,11 @@ subroutine fort_initdata(level,time,lo,hi, &
                                bind(C, name="fort_initdata")
 
       use amrex_fort_module, only : rt => amrex_real
+      use amrex_parmparse_module
       use probdata_module
       use atomic_rates_module, only : XHYDROGEN
       use meth_params_module, only : URHO, UMX, UMZ, UEDEN, UEINT, UFS, &
-                                     small_dens, TEMP_COMP, NE_COMP
+                                     small_dens, TEMP_COMP, NE_COMP, ZHI_COMP
  
       implicit none
  
@@ -81,6 +82,13 @@ subroutine fort_initdata(level,time,lo,hi, &
       real(rt) diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,nd)
 
       integer i,j,k
+      real(rt) z_in
+      ! z_in holds nyx.initial_z; it scales the initial TEMP_COMP value below.
+      type(amrex_parmparse) :: pp
+      ! get() aborts if the key is missing; query() would leave z_in undefined.
+      call amrex_parmparse_build(pp, "nyx")
+      call pp%get("initial_z", z_in)
+      call amrex_parmparse_destroy(pp)
 
       ! This is the case where we have compiled with states defined 
       !  but they have only one component each so we fill them this way.
@@ -90,7 +98,7 @@ subroutine fort_initdata(level,time,lo,hi, &
          diag_eos(:,:,:,1)    = 0.0d0
 
       ! This is the regular case with NO_HYDRO = FALSE
-      else if (ns.gt.1 .and. nd.eq.2) then
+      else if (ns.gt.1 .and. nd.ge.2) then
 
          do k = lo(3), hi(3)
          do j = lo(2), hi(2)
@@ -108,8 +116,13 @@ subroutine fort_initdata(level,time,lo,hi, &
                state(i,j,k,UFS+1) = (1.d0 - XHYDROGEN)
             end if
 
-            diag_eos(i,j,k,TEMP_COMP) = 1000.d0
-            diag_eos(i,j,k,  NE_COMP) =    0.d0
+            diag_eos(i,j,k,TEMP_COMP) = 0.021d0*(1.0d0 + z_in)**2
+            diag_eos(i,j,k,  NE_COMP) = 0.d0
+
+            if (ZHI_COMP .gt. -1) then
+               diag_eos(i,j,k, ZHI_COMP) = 7.5d0
+            endif
+
          enddo
          enddo
          enddo
diff --git a/Exec/LyA/fcvode_extras.f90 b/Exec/LyA/fcvode_extras.f90
deleted file mode 100644
index 450a4c0e..00000000
--- a/Exec/LyA/fcvode_extras.f90
+++ /dev/null
@@ -1,90 +0,0 @@
-module fcvode_extras
-
-  implicit none
-
-  contains
-
-    subroutine fcvode_wrapper(dt, rho_in, T_in, ne_in, e_in, neq, cvmem, &
-                              sunvec_y, yvec, T_out, ne_out, e_out)
-
-        use amrex_fort_module, only : rt => amrex_real
-        use vode_aux_module, only: rho_vode, T_vode, ne_vode
-        use cvode_interface
-        use fnvector_serial
-        use, intrinsic :: iso_c_binding
-
-        implicit none
-
-        real(rt), intent(in   ) :: dt
-        real(rt), intent(in   ) :: rho_in, T_in, ne_in, e_in
-        type(c_ptr), value :: cvmem
-        type(c_ptr), value :: sunvec_y
-        real(rt), intent(  out) ::         T_out,ne_out,e_out
-
-        real(c_double) :: atol, rtol
-        real(c_double) :: time, tout
-        integer(c_long), intent(in) :: neq
-        real(c_double), pointer, intent(in) :: yvec(:)
-
-        integer(c_int) :: ierr
-
-        real(c_double) :: t_soln
-
-        T_vode   = T_in
-        ne_vode  = ne_in
-        rho_vode = rho_in
-
-        ! Initialize the integration time
-        time = 0.d0
-
-        ! We will integrate "e" in time. 
-        yvec(1) = e_in
-
-        ! Set the tolerances.  
-        atol = 1.d-4 * e_in
-        rtol = 1.d-4
-
-        ierr = FCVodeReInit(cvmem, time, sunvec_y)
-        ierr = FCVodeSStolerances(CVmem, rtol, atol)
-
-        ierr = FCVode(CVmem, dt, sunvec_y, time, CV_NORMAL)
-
-        e_out  = yvec(1)
-        T_out  = T_vode
-        ne_out = ne_vode
-
-    end subroutine fcvode_wrapper
-
-    integer(c_int) function RhsFn(tn, sunvec_y, sunvec_f, user_data) &
-           result(ierr) bind(C,name='RhsFn')
-
-      use, intrinsic :: iso_c_binding
-      use fnvector_serial
-      use cvode_interface
-      implicit none
-
-      real(c_double), value :: tn
-      type(c_ptr), value    :: sunvec_y
-      type(c_ptr), value    :: sunvec_f
-      type(c_ptr), value    :: user_data
-
-      ! pointers to data in SUNDAILS vectors
-      real(c_double), pointer :: yvec(:)
-      real(c_double), pointer :: fvec(:)
-
-      real(c_double) :: energy
-
-      integer(c_long), parameter :: neq = 1
-
-      ! get data arrays from SUNDIALS vectors
-      call N_VGetData_Serial(sunvec_y, neq, yvec)
-      call N_VGetData_Serial(sunvec_f, neq, fvec)
-
-      call f_rhs(1, tn, yvec(1), energy, 0.0, 0)
-
-      fvec(1) = energy
-
-      ierr = 0
-    end function RhsFn
-
-end module fcvode_extras
diff --git a/Exec/LyA/inputs b/Exec/LyA/inputs
index 51144b63..539fa4a7 100644
--- a/Exec/LyA/inputs
+++ b/Exec/LyA/inputs
@@ -1,28 +1,25 @@
 # ------------------  INPUTS TO MAIN PROGRAM  -------------------
 max_step = 10000000
 
-nyx.ppm_type         = 0
-nyx.use_colglaz      = 1
-nyx.add_ext_src      = 1
-nyx.heat_cool_type   = 1
-nyx.strang_split     = 1
-gravity.show_timings = 1
-nyx.show_timings     = 1
+nyx.ppm_type         = 1
+nyx.ppm_reference    = 1
+nyx.use_colglaz      = 0
+nyx.corner_coupling  = 1
 
-#This is 1e-8 times the lowest density in plt00000
-nyx.small_dens = 5.162470e1
+nyx.strang_split     = 1
+nyx.add_ext_src      = 1
+nyx.heat_cool_type   = 3
+#nyx.simd_width       = 8
 
-#This is 1e-5 times the constant temparature in plt00000
+nyx.small_dens = 1.e-2
 nyx.small_temp = 1.e-2
 
-#This is 1e-8 times the lowest pressure in plt00000
-nyx.small_pres = 3.487507e2
-
 nyx.do_santa_barbara = 1
 nyx.init_sb_vels     = 1
-gravity.sl_tol = 1.e-12
+gravity.ml_tol = 1.e-10
+gravity.sl_tol = 1.e-10
 
-nyx.initial_z = 100.0
+nyx.initial_z = 159.0
 nyx.final_z = 2.0
 
 #File written during the run: nstep | time | dt | redshift | a
@@ -31,13 +28,15 @@ amr.data_log = runlog
 
 #This is how we restart from a checkpoint and write an ascii particle file
 #Leave this commented out in cvs version
-#amr.restart = chk00070
+#amr.restart = chk00100
 #max_step = 4
 #particles.particle_output_file = particle_output
 
 gravity.gravity_type = PoissonGrav
 gravity.no_sync      = 1
 gravity.no_composite = 1
+gravity.solve_with_cpp = 0
+gravity.solve_with_hpgmg = 1
 
 mg.bottom_solver = 4
 
@@ -48,11 +47,12 @@ geometry.coord_sys   =  0
 geometry.prob_lo     =  0     0     0
 
 #Domain size in Mpc
-geometry.prob_hi     =  8.0  8.0  8.0
-
-amr.n_cell           = 32 32 32
-amr.max_grid_size    = 16
+geometry.prob_hi     =  28.49002849  28.49002849  28.49002849
 
+amr.n_cell           =  64  64  64
+amr.max_grid_size    = 32
+#fabarray.mfiter_tile_size = 128 8 8
+fabarray.mfiter_tile_size = 1024000 8 8
 
 # >>>>>>>>>>>>>  BC FLAGS <<<<<<<<<<<<<<<<
 # 0 = Interior           3 = Symmetry
@@ -66,32 +66,44 @@ nyx.hi_bc       =  0   0   0
 nyx.do_hydro = 1
 nyx.do_grav  = 1
 
-# COMOVING
-nyx.comoving_OmM = 0.27
-nyx.comoving_OmB = 0.045
-nyx.comoving_h   = 0.71d0
+# COSMOLOGY
+nyx.comoving_OmM = 0.275
+nyx.comoving_OmB = 0.046
+nyx.comoving_h   = 0.702d0
+
+# UVB and reionization
+nyx.inhomo_reion     = 0
+nyx.inhomo_zhi_file  = "zhi.bin"
+nyx.inhomo_grid      = 512
+nyx.uvb_rates_file   = "TREECOOL_middle"
+nyx.uvb_density_A    = 1.0
+nyx.uvb_density_B    = 0.0
+nyx.reionization_zHI_flash   = -1.0
+nyx.reionization_zHeII_flash = -1.0
+nyx.reionization_T_zHI       = 2.0e4
+nyx.reionization_T_zHeII     = 1.5e4
 
 # PARTICLES
 nyx.do_dm_particles = 1
 
 # >>>>>>>>>>>>>  PARTICLE INIT OPTIONS <<<<<<<<<<<<<<<<
-#  "AsciiFile"        "Random"	    "Cosmological"
+#  "AsciiFile"        "Random"      "Cosmological"
 # >>>>>>>>>>>>>  PARTICLE INIT OPTIONS <<<<<<<<<<<<<<<<
 nyx.particle_init_type = BinaryFile
-nyx.binary_particle_file = 32.nyx
+nyx.binary_particle_file = 64sssss_20mpc.nyx
+particles.nparts_per_read = 2097152
 
 # >>>>>>>>>>>>>  PARTICLE MOVE OPTIONS <<<<<<<<<<<<<<<<
 #  "Gravitational"    "Random"
 # >>>>>>>>>>>>>  PARTICLE MOVE OPTIONS <<<<<<<<<<<<<<<<
 nyx.particle_move_type = Gravitational
 
-
 # TIME STEP CONTROL
 nyx.relative_max_change_a = 0.01    # max change in scale factor
 particles.cfl             = 0.5     # 'cfl' for particles 
-nyx.cfl                   = 0.9     # cfl number for hyperbolic system
+nyx.cfl                   = 0.5     # cfl number for hyperbolic system
 nyx.init_shrink           = 1.0     # scale back initial timestep
-nyx.change_max            = 1.1     # factor by which timestep can change
+nyx.change_max            = 2.0     # factor by which timestep can change
 nyx.dt_cutoff             = 5.e-20  # level 0 timestep below which we halt
 
 # DIAGNOSTICS & VERBOSITY
@@ -109,20 +121,27 @@ amr.max_level          = 0        # maximum level number allowed
 #amr.regrid_int         = 4 4 4 4
 #amr.n_error_buf        = 0 0 0 8
 #amr.refine_grid_layout = 1
-#amr.regrid_on_restart  = 1
+amr.regrid_on_restart  = 1
 #amr.blocking_factor    = 32
+#amr.nosub              = 1
 
 # CHECKPOINT FILES
-amr.check_file      = chk
-amr.check_int       = 1000
+amr.checkpoint_files_output = 1
+amr.check_file        = chk
+amr.check_int         = 100
+amr.checkpoint_nfiles = 64
 
 # PLOTFILES
+fab.format          = NATIVE_32
+amr.plot_files_output = 1
 amr.plot_file       = plt
-amr.plot_int        = 1000
+amr.plot_int        = -1
+amr.plot_nfiles     = 64
+nyx.plot_z_values   = 7.0 6.0 5.0 4.0 3.0 2.0
+particles.write_in_plotfile = 1
 
-amr.plot_vars        = ALL
-amr.derive_plot_vars = particle_count particle_mass_density pressure magvel
+amr.plot_vars        = density xmom ymom zmom rho_e Temp phi_grav
+amr.derive_plot_vars = particle_mass_density
 
 #PROBIN FILENAME
 amr.probin_file = probin
-
diff --git a/Exec/LyA/inputs.rt b/Exec/LyA/inputs.rt
index 4ecdc100..d78fa0de 100644
--- a/Exec/LyA/inputs.rt
+++ b/Exec/LyA/inputs.rt
@@ -7,8 +7,8 @@ nyx.add_ext_src      = 1
 nyx.heat_cool_type   = 3
 nyx.strang_split     = 1
 
-gravity.show_timings = 0
-nyx.show_timings     = 0
+nyx.inhomo_reion     = 0
+nyx.uvb_rates_file   = "TREECOOL_middle"
 
 #This is 1e-8 times the lowest density in plt00000
 nyx.small_dens = 5.162470e1
@@ -16,9 +16,6 @@ nyx.small_dens = 5.162470e1
 #This is 1e-5 times the constant temparature in plt00000
 nyx.small_temp = 1.e-2
 
-#This is 1e-8 times the lowest pressure in plt00000
-nyx.small_pres = 3.487507e2
-
 nyx.do_santa_barbara = 1
 nyx.init_sb_vels     = 1
 gravity.sl_tol = 1.e-12
diff --git a/Exec/LyA/inputs.small.dsc b/Exec/LyA/inputs.small.dsc
index 1080f4e0..8b4009fe 100644
--- a/Exec/LyA/inputs.small.dsc
+++ b/Exec/LyA/inputs.small.dsc
@@ -13,8 +13,6 @@ nyx.use_colglaz      = 1
 nyx.add_ext_src      = 1
 nyx.heat_cool_type   = 1
 nyx.strang_split     = 1
-gravity.show_timings = 1
-nyx.show_timings     = 1
 
 #This is 1e-8 times the lowest density in plt00000
 nyx.small_dens = 5.162470e1
@@ -22,9 +20,6 @@ nyx.small_dens = 5.162470e1
 #This is 1e-5 times the constant temparature in plt00000
 nyx.small_temp = 1.e-2
 
-#This is 1e-8 times the lowest pressure in plt00000
-nyx.small_pres = 3.487507e2
-
 nyx.do_santa_barbara = 1
 nyx.init_sb_vels     = 1
 gravity.sl_tol = 1.e-12
diff --git a/Exec/LyA/inputs_gimlet_in_transit.dsc b/Exec/LyA/inputs_gimlet_in_transit.dsc
index fff008ce..ef3ceca4 100644
--- a/Exec/LyA/inputs_gimlet_in_transit.dsc
+++ b/Exec/LyA/inputs_gimlet_in_transit.dsc
@@ -14,18 +14,12 @@ nyx.add_ext_src      = 1
 nyx.heat_cool_type   = 3
 nyx.strang_split     = 1
 
-gravity.show_timings = 1
-nyx.show_timings     = 1
-
 #This is 1e-8 times the lowest density in plt00000
 nyx.small_dens = 5.162470e1
 
 #This is 1e-5 times the constant temparature in plt00000
 nyx.small_temp = 1.e-2
 
-#This is 1e-8 times the lowest pressure in plt00000
-nyx.small_pres = 3.487507e2
-
 nyx.do_santa_barbara = 1
 nyx.init_sb_vels     = 1
 gravity.ml_tol = 1.e-10
diff --git a/Exec/LyA/integrate_state_vode_3d.f90 b/Exec/LyA/integrate_state_vode_3d.f90
deleted file mode 100644
index a866d3ac..00000000
--- a/Exec/LyA/integrate_state_vode_3d.f90
+++ /dev/null
@@ -1,243 +0,0 @@
-subroutine integrate_state_vode(lo, hi, &
-                                state   , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, &
-                                diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, &
-                                a, half_dt, min_iter, max_iter)
-!
-!   Calculates the sources to be added later on.
-!
-!   Parameters
-!   ----------
-!   lo : double array (3)
-!       The low corner of the current box.
-!   hi : double array (3)
-!       The high corner of the current box.
-!   state_* : double arrays
-!       The state vars
-!   diag_eos_* : double arrays
-!       Temp and Ne
-!   src_* : doubles arrays
-!       The source terms to be added to state (iterative approx.)
-!   double array (3)
-!       The low corner of the entire domain
-!   a : double
-!       The current a
-!   half_dt : double
-!       time step size, in Mpc km^-1 s ~ 10^12 yr.
-!
-!   Returns
-!   -------
-!   state : double array (dims) @todo
-!       The state vars
-!
-    use amrex_fort_module, only : rt => amrex_real
-    use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, &
-                                   TEMP_COMP, NE_COMP, gamma_minus_1
-    use bl_constants_module, only: M_PI
-    use eos_params_module
-    use network
-    use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT
-    use fundamental_constants_module
-    use comoving_module, only: comoving_h, comoving_OmB
-    use atomic_rates_module, only: tabulate_rates, interp_to_this_z, YHELIUM
-    use vode_aux_module    , only: z_vode, i_vode, j_vode, k_vode
-
-    implicit none
-
-    integer         , intent(in) :: lo(3), hi(3)
-    integer         , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3
-    integer         , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3
-    real(rt), intent(inout) ::    state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR)
-    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2)
-    real(rt), intent(in)    :: a, half_dt
-    integer         , intent(inout) :: max_iter, min_iter
-
-    integer :: i, j, k
-    real(rt) :: z, rho
-    real(rt) :: T_orig, ne_orig, e_orig
-    real(rt) :: T_out , ne_out , e_out, mu, mean_rhob
-
-    z = 1.d0/a - 1.d0
-
-    z_vode = z
-    mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst)
-
-    ! Interpolate from the table to this redshift
-    call interp_to_this_z(z)
-
-    ! Note that (lo,hi) define the region of the box containing the grow cells
-    ! Do *not* assume this is just the valid region
-    ! apply heating-cooling to UEDEN and UEINT
-
-    do k = lo(3),hi(3)
-        do j = lo(2),hi(2)
-            do i = lo(1),hi(1)
-
-                ! Original values
-                rho     = state(i,j,k,URHO)
-                e_orig  = state(i,j,k,UEINT) / rho
-                T_orig  = diag_eos(i,j,k,TEMP_COMP)
-                ne_orig = diag_eos(i,j,k,  NE_COMP)
-
-                if (e_orig .lt. 0.d0) then
-                    print *,'negative e entering strang integration ',z, i,j,k, rho/mean_rhob, e_orig
-                    call bl_abort('bad e in strang')
-                end if
-
-                i_vode = i
-                j_vode = j
-                k_vode = k
-
-                call vode_wrapper(half_dt,rho,T_orig,ne_orig,e_orig, &
-                                              T_out ,ne_out ,e_out)
-
-                if (e_out .lt. 0.d0) then
-                    print *,'negative e exiting strang integration ',z, i,j,k, rho/mean_rhob, e_out
-                    T_out  = 10.0
-                    ne_out = 0.0
-                    mu     = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out)
-                    e_out  = T_out / (gamma_minus_1 * mp_over_kB * mu)
-                    call flush(6)
-!                    call bl_abort('bad e out of strang')
-                end if
-
-                ! Update (rho e) and (rho E)
-                state(i,j,k,UEINT) = state(i,j,k,UEINT) + rho * (e_out-e_orig)
-                state(i,j,k,UEDEN) = state(i,j,k,UEDEN) + rho * (e_out-e_orig)
-
-                ! Update T and ne (do not use stuff computed in f_rhs, per vode manual)
-                call nyx_eos_T_given_Re(T_out, ne_out, rho, e_out, a)
-                diag_eos(i,j,k,TEMP_COMP) = T_out
-                diag_eos(i,j,k,  NE_COMP) = ne_out
-
-            end do ! i
-        end do ! j
-    end do ! k
-
-end subroutine integrate_state_vode
-
-subroutine vode_wrapper(dt, rho_in, T_in, ne_in, e_in, T_out, ne_out, e_out)
-
-    use amrex_fort_module, only : rt => amrex_real
-    use vode_aux_module, only: rho_vode, T_vode, ne_vode, &
-                               i_vode, j_vode, k_vode
-
-    implicit none
-
-    real(rt), intent(in   ) :: dt
-    real(rt), intent(in   ) :: rho_in, T_in, ne_in, e_in
-    real(rt), intent(  out) ::         T_out,ne_out,e_out
-
-    ! Set the number of independent variables -- this should be just "e"
-    integer, parameter :: NEQ = 1
-  
-    ! Allocate storage for the input state
-    real(rt) :: y(NEQ)
-
-    ! Our problem is stiff, tell ODEPACK that. 21 means stiff, jacobian 
-    ! function is supplied, 22 means stiff, figure out my jacobian through 
-    ! differencing
-    integer, parameter :: MF_ANALYTIC_JAC = 21, MF_NUMERICAL_JAC = 22
-
-    ! Tolerance parameters:
-    !
-    !  itol specifies whether to use an single absolute tolerance for
-    !  all variables (1), or to pass an array of absolute tolerances, one
-    !  for each variable with a scalar relative tol (2), a scalar absolute
-    !  and array of relative tolerances (3), or arrays for both (4)
-    !  
-    !  The error is determined as e(i) = rtol*abs(y(i)) + atol, and must
-    !  be > 0.  
-    !
-    ! We will use arrays for both the absolute and relative tolerances, 
-    ! since we want to be easier on the temperature than the species
-
-    integer, parameter :: ITOL = 1
-    real(rt) :: atol(NEQ), rtol(NEQ)
-    
-    ! We want to do a normal computation, and get the output values of y(t)
-    ! after stepping though dt
-    integer, PARAMETER :: ITASK = 1
-  
-    ! istate determines the state of the calculation.  A value of 1 meeans
-    ! this is the first call to the problem -- this is what we will want.
-    ! Note, istate is changed over the course of the calculation, so it
-    ! cannot be a parameter
-    integer :: istate
-
-    ! we will override the maximum number of steps, so turn on the 
-    ! optional arguments flag
-    integer, parameter :: IOPT = 1
-    
-    ! declare a real work array of size 22 + 9*NEQ + 2*NEQ**2 and an
-    ! integer work array of since 30 + NEQ
-
-    integer, parameter :: LRW = 22 + 9*NEQ + 2*NEQ**2
-    real(rt)   :: rwork(LRW)
-    real(rt)   :: time
-    ! real(rt)   :: dt4
-    
-    integer, parameter :: LIW = 30 + NEQ
-    integer, dimension(LIW) :: iwork
-    
-    real(rt) :: rpar
-    integer          :: ipar
-
-    EXTERNAL jac, f_rhs
-    
-    logical, save :: firstCall = .true.
-
-    T_vode   = T_in
-    ne_vode  = ne_in
-    rho_vode = rho_in
-
-    ! We want VODE to re-initialize each time we call it
-    istate = 1
-    
-    rwork(:) = 0.d0
-    iwork(:) = 0
-    
-    ! Set the maximum number of steps allowed (the VODE default is 500)
-    iwork(6) = 2000
-    
-    ! Initialize the integration time
-    time = 0.d0
-    
-    ! We will integrate "e" in time. 
-    y(1) = e_in
-
-    ! Set the tolerances.  
-    atol(1) = 1.d-4 * e_in
-    rtol(1) = 1.d-4
-
-    ! call the integration routine
-    call dvode(f_rhs, NEQ, y, time, dt, ITOL, rtol, atol, ITASK, &
-               istate, IOPT, rwork, LRW, iwork, LIW, jac, MF_NUMERICAL_JAC, &
-               rpar, ipar)
-
-    e_out  = y(1)
-    T_out  = T_vode
-    ne_out = ne_vode
-
-    if (istate < 0) then
-       print *, 'istate = ', istate, 'at (i,j,k) ',i_vode,j_vode,k_vode
-       call bl_error("ERROR in vode_wrapper: integration failed")
-    endif
-
-!      print *,'Calling vode with 1/4 the time step'
-!      dt4 = 0.25d0  * dt
-!      y(1) = e_in
-
-!      do n = 1,4
-!         call dvode(f_rhs, NEQ, y, time, dt4, ITOL, rtol, atol, ITASK, &
-!                    istate, IOPT, rwork, LRW, iwork, LIW, jac, MF_NUMERICAL_JAC, &
-!                    rpar, ipar)
-!         if (istate < 0) then
-!            print *, 'doing subiteration ',n
-!            print *, 'istate = ', istate, 'at (i,j,k) ',i,j,k
-!            call bl_error("ERROR in vode_wrapper: sub-integration failed")
-!         end if
-
-!      end do
-!   endif
-
-end subroutine vode_wrapper
diff --git a/Exec/LyA_AGN/64sssss_20mpc.nyx b/Exec/LyA_AGN/64sssss_20mpc.nyx
deleted file mode 100644
index 019faac7..00000000
Binary files a/Exec/LyA_AGN/64sssss_20mpc.nyx and /dev/null differ
diff --git a/Exec/LyA_AGN/64sssss_20mpc.nyx b/Exec/LyA_AGN/64sssss_20mpc.nyx
new file mode 120000
index 00000000..3c7b0271
--- /dev/null
+++ b/Exec/LyA_AGN/64sssss_20mpc.nyx
@@ -0,0 +1 @@
+../LyA/64sssss_20mpc.nyx
\ No newline at end of file
diff --git a/Exec/LyA_AGN/GNUmakefile b/Exec/LyA_AGN/GNUmakefile
index 763081fe..02919213 100644
--- a/Exec/LyA_AGN/GNUmakefile
+++ b/Exec/LyA_AGN/GNUmakefile
@@ -1,55 +1,36 @@
-# AMREX_HOME defines the directory in which we will find all the BoxLib code
+# AMREX_HOME defines the directory in which we will find all the AMReX code
 AMREX_HOME ?= ../../../amrex
-#AMREX_HOME = /home/vince/Development/BLMaster/amrex
 
-HPGMG_DIR ?= ../../../hpgmg/finite-volume
+HPGMG_DIR ?= ../../Util/hpgmg/finite-volume
+CVODE_LIB_DIR ?= ../../../sundials/sundials-intel/lib
 
 # TOP defines the directory in which we will find Source, Exec, etc
 TOP = ../..
 
 # compilation options
-COMP    = gnu
+COMP    = intel
 USE_MPI = TRUE
-USE_OMP = FALSE
+USE_OMP = TRUE
 
-PROFILE       = FALSE
+PROFILE       = TRUE
 TRACE_PROFILE = FALSE
 COMM_PROFILE  = FALSE
 
 PRECISION = DOUBLE
+USE_SINGLE_PRECISION_PARTICLES = TRUE
 DEBUG     = FALSE
 
 GIMLET = FALSE
 REEBER = FALSE
 
-GIMLET_DIR ?= /home/vince/Development/gimlet
-# Gimlet needs FFTW MPI.
-FFTW_INC ?= /usr/include
-FFTW_DIR ?= /usr/lib/x86_64-linux-gnu
-
-REEBER_HOME ?= /project/projectdirs/nyx/ghweber/reeber2
-# Reeber needs Boost (both headers and libraries) and diy2.
-BOOST_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/boost-1.61.0-noarch/include
-DIY_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/diy/include
-
-USE_HPGMG = FALSE
-HPGMG_FCYCLES = FALSE
-HPGMG_POST_F_CYCLE_TYPE = V
-HPGMG_HELMHOLTZ = FALSE
-HPGMG_STENCIL_VARIABLE_COEFFICIENT = FALSE
-HPGMG_USE_SUBCOMM = TRUE
-HPGMG_BOTTOM_SOLVER= BICGSTAB
-HPGMG_SMOOTHER = GSRB
+USE_HPGMG = TRUE
 
 # physics
 DIM      = 3
 USE_GRAV = TRUE
 USE_HEATCOOL = TRUE
-
 USE_AGN = TRUE
-
-# units
-#USE_CGS = TRUE
+USE_CVODE = FALSE
 
 Bpack := ./Make.package
 Blocs := .
diff --git a/Exec/LyA_AGN/Prob_3d.f90 b/Exec/LyA_AGN/Prob_3d.f90
index 801fe970..b9e3f078 100644
--- a/Exec/LyA_AGN/Prob_3d.f90
+++ b/Exec/LyA_AGN/Prob_3d.f90
@@ -68,10 +68,11 @@ subroutine fort_initdata(level,time,lo,hi, &
                                delta,xlo,xhi)  &
                                bind(C, name="fort_initdata")
       use amrex_fort_module, only : rt => amrex_real
+      use amrex_parmparse_module
       use probdata_module
       use atomic_rates_module, only : XHYDROGEN
       use meth_params_module, only : URHO, UMX, UMZ, UEDEN, UEINT, UFS, &
-                                     small_dens, TEMP_COMP, NE_COMP
+                                     small_dens, TEMP_COMP, NE_COMP, ZHI_COMP
  
       implicit none
  
@@ -84,6 +85,13 @@ subroutine fort_initdata(level,time,lo,hi, &
       real(rt) diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,nd)
 
       integer i,j,k
+      real(rt) z_in
+
+      type(amrex_parmparse) :: pp
+      ! nyx.initial_z is required: get() aborts if it is absent, whereas query() would leave z_in undefined.
+      call amrex_parmparse_build(pp, "nyx")
+      call pp%get("initial_z", z_in)
+      call amrex_parmparse_destroy(pp)
 
       ! This is the case where we have compiled with states defined 
       !  but they have only one component each so we fill them this way.
@@ -93,7 +101,7 @@ subroutine fort_initdata(level,time,lo,hi, &
          diag_eos(:,:,:,1)    = 0.0d0
 
       ! This is the regular case with NO_HYDRO = FALSE
-      else if (ns.gt.1 .and. nd.eq.2) then
+      else if (ns.gt.1 .and. nd.ge.2) then
 
          do k = lo(3), hi(3)
          do j = lo(2), hi(2)
@@ -111,8 +119,13 @@ subroutine fort_initdata(level,time,lo,hi, &
                state(i,j,k,UFS+1) = (1.d0 - XHYDROGEN)
             end if
 
-            diag_eos(i,j,k,TEMP_COMP) = 1000.d0
-            diag_eos(i,j,k,  NE_COMP) =    0.d0
+            diag_eos(i,j,k,TEMP_COMP) = 0.021d0*(1.0d0 + z_in)**2
+            diag_eos(i,j,k,  NE_COMP) = 0.d0
+
+            if (ZHI_COMP .gt. -1) then
+               diag_eos(i,j,k, ZHI_COMP) = 7.5d0
+            endif
+
          enddo
          enddo
          enddo
diff --git a/Exec/LyA_AGN/inputs b/Exec/LyA_AGN/inputs
index 9d018d69..4f95a3b0 100644
--- a/Exec/LyA_AGN/inputs
+++ b/Exec/LyA_AGN/inputs
@@ -1,7 +1,5 @@
 # ------------------  INPUTS TO MAIN PROGRAM  -------------------
-max_step = 2
-
-#amr.restart = chk00001
+max_step = 10000000
 
 #Number of time steps between calls to halo finder
 reeber.halo_int = 1
@@ -11,22 +9,14 @@ nyx.ppm_reference    = 1
 nyx.use_colglaz      = 0
 nyx.corner_coupling  = 1
 
+nyx.strang_split     = 1
 nyx.add_ext_src      = 1
 nyx.heat_cool_type   = 3
-nyx.strang_split     = 1
-
-gravity.show_timings = 1
-nyx.show_timings     = 1
+#nyx.simd_width       = 8
 
-#This is 1e-8 times the lowest density in plt00000
 nyx.small_dens = 1.e-2
-
-#This is 1e-5 times the constant temparature in plt00000
 nyx.small_temp = 1.e-2
 
-#This is 1e-8 times the lowest pressure in plt00000
-nyx.small_pres = 1.0e-4
-
 nyx.do_santa_barbara = 1
 nyx.init_sb_vels     = 1
 gravity.ml_tol = 1.e-10
@@ -48,7 +38,8 @@ amr.data_log = runlog
 gravity.gravity_type = PoissonGrav
 gravity.no_sync      = 1
 gravity.no_composite = 1
-gravity.solve_with_cpp = 1
+gravity.solve_with_cpp = 0
+gravity.solve_with_hpgmg = 1
 
 mg.bottom_solver = 4
 
@@ -61,10 +52,11 @@ geometry.prob_lo     =  0     0     0
 #Domain size in Mpc
 geometry.prob_hi     =  28.49002849  28.49002849  28.49002849
 
-amr.n_cell           = 256 256 256
 amr.n_cell           =  64  64  64
 amr.max_grid_size    = 32
 #fabarray.mfiter_tile_size = 128 8 8
+fabarray.mfiter_tile_size = 1024000 8 8
+
 # >>>>>>>>>>>>>  BC FLAGS <<<<<<<<<<<<<<<<
 # 0 = Interior           3 = Symmetry
 # 1 = Inflow             4 = SlipWall
@@ -77,11 +69,23 @@ nyx.hi_bc       =  0   0   0
 nyx.do_hydro = 1
 nyx.do_grav  = 1
 
-# COMOVING
+# COSMOLOGY
 nyx.comoving_OmM = 0.275
 nyx.comoving_OmB = 0.046
 nyx.comoving_h   = 0.702d0
 
+# UVB and reionization
+nyx.inhomo_reion     = 0
+nyx.inhomo_zhi_file  = "zhi.bin"
+nyx.inhomo_grid      = 512
+nyx.uvb_rates_file   = "TREECOOL_middle"
+nyx.uvb_density_A    = 1.0
+nyx.uvb_density_B    = 0.0
+nyx.reionization_zHI_flash   = -1.0
+nyx.reionization_zHeII_flash = -1.0
+nyx.reionization_T_zHI       = 2.0e4
+nyx.reionization_T_zHeII     = 1.5e4
+
 # PARTICLES
 nyx.do_dm_particles = 1
 
@@ -98,7 +102,7 @@ particles.nparts_per_read = 2097152
 nyx.particle_move_type = Gravitational
 
 # TIME STEP CONTROL
-nyx.relative_max_change_a = 0.02    # max change in scale factor
+nyx.relative_max_change_a = 0.01    # max change in scale factor
 particles.cfl             = 0.5     # 'cfl' for particles 
 nyx.cfl                   = 0.5     # cfl number for hyperbolic system
 nyx.init_shrink           = 1.0     # scale back initial timestep
@@ -125,19 +129,18 @@ amr.regrid_on_restart  = 1
 #amr.nosub              = 1
 
 # CHECKPOINT FILES
-amr.checkpoint_files_output = 1 # no output
+amr.checkpoint_files_output = 1
 amr.check_file        = chk
 amr.check_int         = 100
 amr.checkpoint_nfiles = 64
 
 # PLOTFILES
-#fab.format          = IEEE32
 fab.format          = NATIVE_32
-#amr.plot_files_output = 0
+amr.plot_files_output = 1
 amr.plot_file       = plt
 amr.plot_int        = -1
 amr.plot_nfiles     = 64
-nyx.plot_z_values   = 5.4 5.0 4.6 4.2
+nyx.plot_z_values   = 7.0 6.0 5.0 4.0 3.0 2.0
 particles.write_in_plotfile = 1
 
 amr.plot_vars        = density xmom ymom zmom rho_e Temp phi_grav
diff --git a/Exec/LyA_AGN/inputs.rt b/Exec/LyA_AGN/inputs.rt
index 54aefe32..72a8b21e 100644
--- a/Exec/LyA_AGN/inputs.rt
+++ b/Exec/LyA_AGN/inputs.rt
@@ -7,18 +7,12 @@ nyx.add_ext_src      = 1
 nyx.heat_cool_type   = 3
 nyx.strang_split     = 1
 
-gravity.show_timings = 0
-nyx.show_timings     = 0
-
 #This is 1e-8 times the lowest density in plt00000
 nyx.small_dens = 5.162470e1
 
 #This is 1e-5 times the constant temparature in plt00000
 nyx.small_temp = 1.e-2
 
-#This is 1e-8 times the lowest pressure in plt00000
-nyx.small_pres = 3.487507e2
-
 nyx.do_santa_barbara = 1
 nyx.init_sb_vels     = 1
 gravity.sl_tol = 1.e-12
diff --git a/Exec/LyA_AGN/integrate_state_vode_3d.f90 b/Exec/LyA_AGN/integrate_state_vode_3d.f90
deleted file mode 100644
index 2c124b9e..00000000
--- a/Exec/LyA_AGN/integrate_state_vode_3d.f90
+++ /dev/null
@@ -1,243 +0,0 @@
-subroutine integrate_state_vode(lo, hi, &
-                                state   , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, &
-                                diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, &
-                                a, half_dt, min_iter, max_iter)
-!
-!   Calculates the sources to be added later on.
-!
-!   Parameters
-!   ----------
-!   lo : double array (3)
-!       The low corner of the current box.
-!   hi : double array (3)
-!       The high corner of the current box.
-!   state_* : double arrays
-!       The state vars
-!   diag_eos_* : double arrays
-!       Temp and Ne
-!   src_* : doubles arrays
-!       The source terms to be added to state (iterative approx.)
-!   double array (3)
-!       The low corner of the entire domain
-!   a : double
-!       The current a
-!   half_dt : double
-!       time step size, in Mpc km^-1 s ~ 10^12 yr.
-!
-!   Returns
-!   -------
-!   state : double array (dims) @todo
-!       The state vars
-!
-    use amrex_fort_module, only : rt => amrex_real
-    use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, &
-                                   TEMP_COMP, NE_COMP, gamma_minus_1
-    use bl_constants_module, only: M_PI
-    use eos_params_module
-    use network
-    use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT
-    use fundamental_constants_module
-    use comoving_module, only: comoving_h, comoving_OmB
-    use atomic_rates_module, only: tabulate_rates, interp_to_this_z, YHELIUM
-    use vode_aux_module    , only: z_vode, i_vode, j_vode, k_vode
-
-    implicit none
-
-    integer         , intent(in) :: lo(3), hi(3)
-    integer         , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3
-    integer         , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3
-    real(rt), intent(inout) ::    state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR)
-    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2)
-    real(rt), intent(in)    :: a, half_dt
-    integer         , intent(inout) :: max_iter, min_iter
-
-    integer :: i, j, k
-    real(rt) :: z, rho
-    real(rt) :: T_orig, ne_orig, e_orig
-    real(rt) :: T_out , ne_out , e_out, mu, mean_rhob
-
-    z = 1.d0/a - 1.d0
-
-    z_vode = z
-    mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst)
-
-    ! Interpolate from the table to this redshift
-    call interp_to_this_z(z)
-
-    ! Note that (lo,hi) define the region of the box containing the grow cells
-    ! Do *not* assume this is just the valid region
-    ! apply heating-cooling to UEDEN and UEINT
-
-    do k = lo(3),hi(3)
-        do j = lo(2),hi(2)
-            do i = lo(1),hi(1)
-
-                ! Original values
-                rho     = state(i,j,k,URHO)
-                e_orig  = state(i,j,k,UEINT) / rho
-                T_orig  = diag_eos(i,j,k,TEMP_COMP)
-                ne_orig = diag_eos(i,j,k,  NE_COMP)
-
-                if (e_orig .lt. 0.d0) then
-                    print *,'negative e entering strang integration ',z, i,j,k, rho/mean_rhob, e_orig
-                    call bl_abort('bad e in strang')
-                end if
-
-                i_vode = i
-                j_vode = j
-                k_vode = k
-
-                call vode_wrapper(half_dt,rho,T_orig,ne_orig,e_orig, &
-                                              T_out ,ne_out ,e_out)
-
-                if (e_out .lt. 0.d0) then
-                    print *,'negative e exiting strang integration ',z, i,j,k, rho/mean_rhob, e_out
-                    T_out  = 10.0
-                    ne_out = 0.0
-                    mu     = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out)
-                    e_out  = T_out / (gamma_minus_1 * mp_over_kB * mu)
-                    call flush(6)
-!                    call bl_abort('bad e out of strang')
-                end if
-
-                ! Update (rho e) and (rho E)
-                state(i,j,k,UEINT) = state(i,j,k,UEINT) + rho * (e_out-e_orig)
-                state(i,j,k,UEDEN) = state(i,j,k,UEDEN) + rho * (e_out-e_orig)
-
-                ! Update T and ne (do not use stuff computed in f_rhs, per vode manual)
-                call nyx_eos_T_given_Re(T_out, ne_out, rho, e_out, a)
-                diag_eos(i,j,k,TEMP_COMP) = T_out
-                diag_eos(i,j,k,  NE_COMP) = ne_out
-
-            end do ! i
-        end do ! j
-    end do ! k
-
-end subroutine integrate_state_vode
-
-subroutine vode_wrapper(dt, rho_in, T_in, ne_in, e_in, T_out, ne_out, e_out)
-
-    use vode_aux_module, only: rho_vode, T_vode, ne_vode, &
-                               i_vode, j_vode, k_vode
-
-    use amrex_fort_module, only : rt => amrex_real
-    implicit none
-
-    real(rt), intent(in   ) :: dt
-    real(rt), intent(in   ) :: rho_in, T_in, ne_in, e_in
-    real(rt), intent(  out) ::         T_out,ne_out,e_out
-
-    ! Set the number of independent variables -- this should be just "e"
-    integer, parameter :: NEQ = 1
-  
-    ! Allocate storage for the input state
-    real(rt) :: y(NEQ)
-
-    ! Our problem is stiff, tell ODEPACK that. 21 means stiff, jacobian 
-    ! function is supplied, 22 means stiff, figure out my jacobian through 
-    ! differencing
-    integer, parameter :: MF_ANALYTIC_JAC = 21, MF_NUMERICAL_JAC = 22
-
-    ! Tolerance parameters:
-    !
-    !  itol specifies whether to use an single absolute tolerance for
-    !  all variables (1), or to pass an array of absolute tolerances, one
-    !  for each variable with a scalar relative tol (2), a scalar absolute
-    !  and array of relative tolerances (3), or arrays for both (4)
-    !  
-    !  The error is determined as e(i) = rtol*abs(y(i)) + atol, and must
-    !  be > 0.  
-    !
-    ! We will use arrays for both the absolute and relative tolerances, 
-    ! since we want to be easier on the temperature than the species
-
-    integer, parameter :: ITOL = 1
-    real(rt) :: atol(NEQ), rtol(NEQ)
-    
-    ! We want to do a normal computation, and get the output values of y(t)
-    ! after stepping though dt
-    integer, PARAMETER :: ITASK = 1
-  
-    ! istate determines the state of the calculation.  A value of 1 meeans
-    ! this is the first call to the problem -- this is what we will want.
-    ! Note, istate is changed over the course of the calculation, so it
-    ! cannot be a parameter
-    integer :: istate
-
-    ! we will override the maximum number of steps, so turn on the 
-    ! optional arguments flag
-    integer, parameter :: IOPT = 1
-    
-    ! declare a real work array of size 22 + 9*NEQ + 2*NEQ**2 and an
-    ! integer work array of since 30 + NEQ
-
-    integer, parameter :: LRW = 22 + 9*NEQ + 2*NEQ**2
-    real(rt)   :: rwork(LRW)
-    real(rt)   :: time
-    ! real(rt)   :: dt4
-    
-    integer, parameter :: LIW = 30 + NEQ
-    integer, dimension(LIW) :: iwork
-    
-    real(rt) :: rpar
-    integer          :: ipar
-
-    EXTERNAL jac, f_rhs
-    
-    logical, save :: firstCall = .true.
-
-    T_vode   = T_in
-    ne_vode  = ne_in
-    rho_vode = rho_in
-
-    ! We want VODE to re-initialize each time we call it
-    istate = 1
-    
-    rwork(:) = 0.d0
-    iwork(:) = 0
-    
-    ! Set the maximum number of steps allowed (the VODE default is 500)
-    iwork(6) = 2000
-    
-    ! Initialize the integration time
-    time = 0.d0
-    
-    ! We will integrate "e" in time. 
-    y(1) = e_in
-
-    ! Set the tolerances.  
-    atol(1) = 1.d-4 * e_in
-    rtol(1) = 1.d-4
-
-    ! call the integration routine
-    call dvode(f_rhs, NEQ, y, time, dt, ITOL, rtol, atol, ITASK, &
-               istate, IOPT, rwork, LRW, iwork, LIW, jac, MF_NUMERICAL_JAC, &
-               rpar, ipar)
-
-    e_out  = y(1)
-    T_out  = T_vode
-    ne_out = ne_vode
-
-    if (istate < 0) then
-       print *, 'istate = ', istate, 'at (i,j,k) ',i_vode,j_vode,k_vode
-       call bl_error("ERROR in vode_wrapper: integration failed")
-    endif
-
-!      print *,'Calling vode with 1/4 the time step'
-!      dt4 = 0.25d0  * dt
-!      y(1) = e_in
-
-!      do n = 1,4
-!         call dvode(f_rhs, NEQ, y, time, dt4, ITOL, rtol, atol, ITASK, &
-!                    istate, IOPT, rwork, LRW, iwork, LIW, jac, MF_NUMERICAL_JAC, &
-!                    rpar, ipar)
-!         if (istate < 0) then
-!            print *, 'doing subiteration ',n
-!            print *, 'istate = ', istate, 'at (i,j,k) ',i,j,k
-!            call bl_error("ERROR in vode_wrapper: sub-integration failed")
-!         end if
-
-!      end do
-!   endif
-
-end subroutine vode_wrapper
diff --git a/Exec/Make.Nyx b/Exec/Make.Nyx
index 4308625e..1e2b11d3 100644
--- a/Exec/Make.Nyx
+++ b/Exec/Make.Nyx
@@ -39,6 +39,16 @@ endif
 
 DEFINES += -DBL_NOLINEVALUES
 
+GIMLET_DIR ?= /home/vince/Development/gimlet
+# Gimlet needs FFTW MPI.
+FFTW_INC ?= /usr/include
+FFTW_DIR ?= /usr/lib/x86_64-linux-gnu
+
+REEBER_HOME ?= /project/projectdirs/nyx/ghweber/reeber2
+# Reeber needs Boost (both headers and libraries) and diy2.
+BOOST_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/boost-1.61.0-noarch/include
+DIY_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/diy/include
+
 ifeq ($(REEBER), TRUE)
   DEFINES += -DREEBER
   DEFINES += -DREEBER_USE_BOXLIB_READER -DREEBER_IN_SITU
@@ -133,6 +143,16 @@ ifeq ($(USE_MG), TRUE)
   VPATH_LOCATIONS   += $(AMREX_HOME)/Src/LinearSolvers/F_MG
 endif
 
+ifeq ($(USE_HPGMG), TRUE)
+   HPGMG_FCYCLES = TRUE
+   HPGMG_POST_F_CYCLE_TYPE = V
+   HPGMG_HELMHOLTZ = FALSE
+   HPGMG_STENCIL_VARIABLE_COEFFICIENT = FALSE
+   HPGMG_USE_SUBCOMM = TRUE
+   HPGMG_BOTTOM_SOLVER= CG
+   HPGMG_SMOOTHER = GSRB
+endif
+
 include $(AMREX_HOME)/Src/F_BaseLib/FParallelMG.mak
 INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/F_BaseLib
 VPATH_LOCATIONS   += $(AMREX_HOME)/Src/F_BaseLib
diff --git a/Exec/MiniSB/GNUmakefile b/Exec/MiniSB/GNUmakefile
index 41fac4a2..488bdf5a 100644
--- a/Exec/MiniSB/GNUmakefile
+++ b/Exec/MiniSB/GNUmakefile
@@ -1,34 +1,21 @@
 # AMREX_HOME defines the directory in which we will find all the BoxLib code
-AMREX_HOME ?= /project/projectdirs/nyx/src/amrex
+AMREX_HOME ?= ../../../amrex
 
-HPGMG_DIR ?= $(HOME)/hpgmg/finite-volume
+HPGMG_DIR ?= ../../Util/hpgmg/finite-volume
 
 # TOP defines the directory in which we will find Source, Exec, etc
 TOP = ../..
 
-# Reeber (Edison)
-BOOST_DIR ?= /project/projectdirs/nyx/ghweber/boost-1.58.0-edison-gcc-4.9.2
-DIY2_INCLUDE_DIR ?= /project/projectdirs/nyx/ghweber/diy2/include
-REEBER_HOME ?= /project/projectdirs/nyx/ghweber/reeber2
-
 # compilation options
-COMP    = gcc
+COMP    = intel# or gnu
 
 USE_MPI = TRUE
 USE_OMP = FALSE
 
 # Analysis
 REEBER = FALSE
-#DEFINES += -DREEBER_PERSISTENT_INTEGRAL_TRACE_VTCS
 
 USE_HPGMG = FALSE
-HPGMG_FCYCLES = TRUE
-HPGMG_POST_F_CYCLE_TYPE = V
-HPGMG_HELMHOLTZ = FALSE
-HPGMG_STENCIL_VARIABLE_COEFFICIENT = FALSE
-HPGMG_USE_SUBCOMM = TRUE
-HPGMG_BOTTOM_SOLVER= BICGSTAB
-HPGMG_SMOOTHER = GSRB
 
 PRECISION = DOUBLE
 DEBUG     = FALSE
diff --git a/Exec/MiniSB/inputs.32 b/Exec/MiniSB/inputs.32
index 78d94936..88ca697b 100644
--- a/Exec/MiniSB/inputs.32
+++ b/Exec/MiniSB/inputs.32
@@ -13,10 +13,6 @@ amr.plot_files_output = 1
 
 nyx.print_fortran_warnings = 0
 
-// Show timings in different routines
-nyx.show_timings = 0
-gravity.show_timings = 0
-
 nyx.ppm_type = 0
 nyx.use_colglaz = 1
 
diff --git a/Exec/MiniSB/inputs.32.plot_z b/Exec/MiniSB/inputs.32.plot_z
index 03f81cff..6801a552 100644
--- a/Exec/MiniSB/inputs.32.plot_z
+++ b/Exec/MiniSB/inputs.32.plot_z
@@ -17,10 +17,6 @@ amr.plot_files_output = 1
 
 nyx.print_fortran_warnings = 0
 
-// Show timings in different routines
-nyx.show_timings = 0
-gravity.show_timings = 0
-
 nyx.ppm_type = 0
 nyx.use_colglaz = 1
 
diff --git a/Exec/MiniSB/inputs.32.ref b/Exec/MiniSB/inputs.32.ref
index 13a373fa..23cebc8c 100644
--- a/Exec/MiniSB/inputs.32.ref
+++ b/Exec/MiniSB/inputs.32.ref
@@ -15,10 +15,6 @@ amr.plot_files_output = 1
 
 nyx.print_fortran_warnings = 0
 
-// Show timings in different routines
-nyx.show_timings = 0
-gravity.show_timings = 0
-
 nyx.ppm_type = 0
 nyx.use_colglaz = 1
 
diff --git a/Exec/MiniSB/inputs.analysis b/Exec/MiniSB/inputs.analysis
index a8f26a9f..8b0bc9f8 100644
--- a/Exec/MiniSB/inputs.analysis
+++ b/Exec/MiniSB/inputs.analysis
@@ -7,10 +7,6 @@ amr.refine_grid_layout = 0
 amr.checkpoint_files_output = 1
 amr.plot_files_output = 1
 
-// Show timings in different routines
-nyx.show_timings = 1
-gravity.show_timings = 1
-
 nyx.ppm_type = 0
 nyx.use_colglaz = 1
 
diff --git a/Exec/RegressionTest/GNUmakefile b/Exec/RegressionTest/GNUmakefile
index 59f62960..9a35b23f 100644
--- a/Exec/RegressionTest/GNUmakefile
+++ b/Exec/RegressionTest/GNUmakefile
@@ -1,5 +1,5 @@
 # AMREX_HOME defines the directory in which we will find all the BoxLib code
-AMREX_HOME ?= /project/projectdirs/nyx/src/amrex
+AMREX_HOME ?= ../../../amrex
 
 # TOP defines the directory in which we will find Source, Exec, etc
 TOP = ../..
diff --git a/Exec/SantaBarbara/GNUmakefile b/Exec/SantaBarbara/GNUmakefile
index 04466511..19dd050e 100644
--- a/Exec/SantaBarbara/GNUmakefile
+++ b/Exec/SantaBarbara/GNUmakefile
@@ -1,39 +1,21 @@
 # AMREX_HOME defines the directory in which we will find all the BoxLib code
-AMREX_HOME ?= /project/projectdirs/nyx/src/amrex
+AMREX_HOME ?= ../../../amrex
 
-HPGMG_DIR ?= /global/homes/f/friesen/hpgmg/finite-volume
+HPGMG_DIR ?= ../../Util/hpgmg/finite-volume
 
 # TOP defines the directory in which we will find Source, Exec, etc
 TOP = ../..
 
-# Reeber
-BOOST_DIR ?= /opt/local
-REEBER_HOME ?= $(HOME)/devel/Reeber/dev
-
 # compilation options
-COMP    = gcc
+COMP    = intel# or gnu
 USE_OMP = FALSE
 USE_MPI = TRUE
 
 PRECISION = DOUBLE
+USE_SINGLE_PRECISION_PARTICLES = FALSE
 DEBUG     = FALSE
-DEBUG     = TRUE
 
 USE_HPGMG = FALSE
-HPGMG_FCYCLES = TRUE
-HPGMG_POST_F_CYCLE_TYPE = V
-HPGMG_HELMHOLTZ = FALSE
-HPGMG_STENCIL_VARIABLE_COEFFICIENT = FALSE
-HPGMG_USE_SUBCOMM = TRUE
-HPGMG_BOTTOM_SOLVER= BICGSTAB
-HPGMG_SMOOTHER = GSRB
-
-# Use single precision for particles?
-#
-# If you set this to be TRUE be sure to do a "make realclean"
-# to make sure all your code is consistent.
-#
-USE_SINGLE_PRECISION_PARTICLES = FALSE
 
 # physics
 DIM      = 3
diff --git a/Exec/SantaBarbara/inputs b/Exec/SantaBarbara/inputs
index a27330ae..db74e055 100644
--- a/Exec/SantaBarbara/inputs
+++ b/Exec/SantaBarbara/inputs
@@ -4,8 +4,6 @@ max_step = 10000000
 nyx.ppm_type = 0
 nyx.use_colglaz = 1
 nyx.add_ext_src = 0
-nyx.show_timings = 1
-gravity.show_timings = 1
 
 #This is 1e-8 times the lowest density in plt00000
 nyx.small_dens = 5.162470e1
@@ -13,9 +11,6 @@ nyx.small_dens = 5.162470e1
 #This is 1e-5 times the constant temparature in plt00000
 nyx.small_temp = 1.e-2
 
-#This is 1e-8 times the lowest pressure in plt00000
-nyx.small_pres = 3.487507e2
-
 nyx.do_santa_barbara = 1
 nyx.init_sb_vels     = 1
 gravity.sl_tol = 1.e-12
diff --git a/Exec/Scaling/GNUmakefile b/Exec/Scaling/GNUmakefile
index 98595f9c..79e64ebf 100644
--- a/Exec/Scaling/GNUmakefile
+++ b/Exec/Scaling/GNUmakefile
@@ -1,52 +1,39 @@
 # AMREX_HOME defines the directory in which we will find all the AMReX code
-AMREX_HOME ?= /global/homes/a/almgren/GitCode/amrex
+AMREX_HOME ?= ../../../amrex
 
-HPGMG_DIR ?= ../../../hpgmg/finite-volume
+HPGMG_DIR ?= ../../Util/hpgmg/finite-volume
+CVODE_LIB_DIR ?= ../../../sundials/sundials-intel/lib
 
 # TOP defines the directory in which we will find Source, Exec, etc
 TOP = ../..
 
 # compilation options
-COMP    = gcc
+COMP    = intel# or gnu
 USE_MPI = TRUE
 USE_OMP = TRUE
 
-PROFILE       = FALSE
+PROFILE       = TRUE
 TRACE_PROFILE = FALSE
 COMM_PROFILE  = FALSE
 
 PRECISION = DOUBLE
+USE_SINGLE_PRECISION_PARTICLES = FALSE
 DEBUG     = FALSE
 
 GIMLET = FALSE
 REEBER = FALSE
 
-GIMLET_DIR ?= /home/vince/Development/gimlet
-# Gimlet needs FFTW MPI.
-FFTW_INC ?= /usr/include
-FFTW_DIR ?= /usr/lib/x86_64-linux-gnu
-
-REEBER_HOME ?= $(HOME)/devel/Reeber/dev
-# Reeber needs Boost (both headers and libraries) and diy2.
-BOOST_INCLUDE_DIR ?= /usr/local/include
-BOOST_LIB_DIR ?= /usr/local/lib
-DIY2_INCLUDE_DIR ?= /usr/local/include
-
-#USE_HPGMG = TRUE
-HPGMG_FCYCLES = FALSE
-HPGMG_POST_F_CYCLE_TYPE = V
-HPGMG_HELMHOLTZ = FALSE
-HPGMG_STENCIL_VARIABLE_COEFFICIENT = FALSE
-HPGMG_USE_SUBCOMM = TRUE
-HPGMG_BOTTOM_SOLVER= CG
-HPGMG_SMOOTHER = GSRB
+USE_HPGMG = TRUE
 
 # physics
 DIM      = 3
 USE_GRAV = TRUE
 USE_HEATCOOL = TRUE
+USE_AGN = FALSE
+USE_CVODE = FALSE
 
 Bpack := ./Make.package
 Blocs := .
 
 include $(TOP)/Exec/Make.Nyx
+
diff --git a/Exec/Scaling/Prob_3d.f90 b/Exec/Scaling/Prob_3d.f90
index f1727bea..2e82541f 100644
--- a/Exec/Scaling/Prob_3d.f90
+++ b/Exec/Scaling/Prob_3d.f90
@@ -65,10 +65,11 @@ subroutine fort_initdata(level,time,lo,hi, &
                                bind(C, name="fort_initdata")
 
       use amrex_fort_module, only : rt => amrex_real
+      use amrex_parmparse_module
       use probdata_module
       use atomic_rates_module, only : XHYDROGEN
       use meth_params_module, only : URHO, UMX, UMZ, UEDEN, UEINT, UFS, &
-                                     small_dens, TEMP_COMP, NE_COMP
+                                     small_dens, TEMP_COMP, NE_COMP, ZHI_COMP
  
       implicit none
  
@@ -81,6 +82,13 @@ subroutine fort_initdata(level,time,lo,hi, &
       real(rt) diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,nd)
 
       integer i,j,k
+      real(rt) z_in
+
+      type(amrex_parmparse) :: pp
+      ! nyx.initial_z is required: get() aborts if it is absent, whereas query() would leave z_in undefined.
+      call amrex_parmparse_build(pp, "nyx")
+      call pp%get("initial_z", z_in)
+      call amrex_parmparse_destroy(pp)
 
       ! This is the case where we have compiled with states defined 
       !  but they have only one component each so we fill them this way.
@@ -90,7 +98,7 @@ subroutine fort_initdata(level,time,lo,hi, &
          diag_eos(:,:,:,1)    = 0.0d0
 
       ! This is the regular case with NO_HYDRO = FALSE
-      else if (ns.gt.1 .and. nd.eq.2) then
+      else if (ns.gt.1 .and. nd.ge.2) then
 
          do k = lo(3), hi(3)
          do j = lo(2), hi(2)
@@ -108,8 +116,13 @@ subroutine fort_initdata(level,time,lo,hi, &
                state(i,j,k,UFS+1) = (1.d0 - XHYDROGEN)
             end if
 
-            diag_eos(i,j,k,TEMP_COMP) = 1000.d0
-            diag_eos(i,j,k,  NE_COMP) =    0.d0
+            diag_eos(i,j,k,TEMP_COMP) = 0.021d0*(1.0d0 + z_in)**2
+            diag_eos(i,j,k,  NE_COMP) = 0.d0
+
+            if (ZHI_COMP .gt. -1) then
+               diag_eos(i,j,k, ZHI_COMP) = 7.5d0
+            endif
+
          enddo
          enddo
          enddo
diff --git a/Exec/Scaling/inputs b/Exec/Scaling/inputs
index 2c6b1d5c..48808055 100644
--- a/Exec/Scaling/inputs
+++ b/Exec/Scaling/inputs
@@ -6,29 +6,21 @@ nyx.ppm_reference    = 1
 nyx.use_colglaz      = 0
 nyx.corner_coupling  = 1
 
+nyx.strang_split     = 1
 nyx.add_ext_src      = 1
 nyx.heat_cool_type   = 3
-nyx.strang_split     = 1
-
-gravity.show_timings = 1
-nyx.show_timings     = 1
+#nyx.simd_width       = 8
 
-#This is 1e-8 times the lowest density in plt00000
 nyx.small_dens = 1.e-2
-
-#This is 1e-5 times the constant temparature in plt00000
 nyx.small_temp = 1.e-2
 
-#This is 1e-8 times the lowest pressure in plt00000
-nyx.small_pres = 1.0e-4
-
 nyx.do_santa_barbara = 1
 nyx.init_sb_vels     = 1
 gravity.ml_tol = 1.e-10
 gravity.sl_tol = 1.e-10
 
 nyx.initial_z = 159.0
-nyx.final_z = 4.2
+nyx.final_z = 2.0
 
 #File written during the run: nstep | time | dt | redshift | a
 amr.data_log = runlog
@@ -36,19 +28,18 @@ amr.data_log = runlog
 
 #This is how we restart from a checkpoint and write an ascii particle file
 #Leave this commented out in cvs version
-#amr.restart = chk03500
+#amr.restart = chk00100
 #max_step = 4
 #particles.particle_output_file = particle_output
 
 gravity.gravity_type = PoissonGrav
 gravity.no_sync      = 1
 gravity.no_composite = 1
+gravity.solve_with_cpp = 0
+gravity.solve_with_hpgmg = 1
 
 mg.bottom_solver = 4
 
-gravity.solve_with_cpp = 0
-gravity.solve_with_hpgmg = 0
-
 # PROBLEM SIZE & GEOMETRY
 geometry.is_periodic =  1     1     1
 geometry.coord_sys   =  0
@@ -58,11 +49,12 @@ geometry.prob_lo     =  0     0     0
 #Domain size in Mpc
 geometry.prob_hi     =  28.49002849  28.49002849  28.49002849
 
-amr.n_cell           =  64 64 64
+amr.n_cell           =  64  64  64
 amr.max_grid_size    = 32
+#fabarray.mfiter_tile_size = 128 8 8
+fabarray.mfiter_tile_size = 1024000 8 8
 
-nyx.particle_initrandom_mass  = 1.01241529887243E5
-
+nyx.particle_initrandom_mass  = 3.317482451E9
 
 # >>>>>>>>>>>>>  BC FLAGS <<<<<<<<<<<<<<<<
 # 0 = Interior           3 = Symmetry
@@ -76,16 +68,28 @@ nyx.hi_bc       =  0   0   0
 nyx.do_hydro = 1
 nyx.do_grav  = 1
 
-# COMOVING
+# COSMOLOGY
 nyx.comoving_OmM = 0.275
 nyx.comoving_OmB = 0.046
 nyx.comoving_h   = 0.702d0
 
+# UVB and reionization
+nyx.inhomo_reion     = 0
+nyx.inhomo_zhi_file  = "zhi.bin"
+nyx.inhomo_grid      = 512
+nyx.uvb_rates_file   = "TREECOOL_middle"
+nyx.uvb_density_A    = 1.0
+nyx.uvb_density_B    = 0.0
+nyx.reionization_zHI_flash   = -1.0
+nyx.reionization_zHeII_flash = -1.0
+nyx.reionization_T_zHI       = 2.0e4
+nyx.reionization_T_zHeII     = 1.5e4
+
 # PARTICLES
 nyx.do_dm_particles = 1
 
 # >>>>>>>>>>>>>  PARTICLE INIT OPTIONS <<<<<<<<<<<<<<<<
-#  "AsciiFile"        "Random"	    "Cosmological"
+#  "AsciiFile"        "Random"      "Cosmological"
 # >>>>>>>>>>>>>  PARTICLE INIT OPTIONS <<<<<<<<<<<<<<<<
 nyx.particle_init_type = RandomPerCell
 
@@ -96,7 +100,7 @@ nyx.particle_move_type = Gravitational
 
 
 # TIME STEP CONTROL
-nyx.relative_max_change_a = 0.02    # max change in scale factor
+nyx.relative_max_change_a = 0.01    # max change in scale factor
 particles.cfl             = 0.5     # 'cfl' for particles 
 nyx.cfl                   = 0.5     # cfl number for hyperbolic system
 nyx.init_shrink           = 1.0     # scale back initial timestep
@@ -110,7 +114,7 @@ nyx.v                 = 1       # verbosity in Nyx.cpp
 gravity.v             = 1       # verbosity in Gravity.cpp
 amr.v                 = 1       # verbosity in Amr.cpp
 mg.v                  = 1       # verbosity in Amr.cpp
-particles.v           = 1       # verbosity in Particle class
+particles.v           = 2       # verbosity in Particle class
 
 # REFINEMENT / REGRIDDING
 amr.max_level          = 0        # maximum level number allowed
@@ -123,20 +127,18 @@ amr.regrid_on_restart  = 1
 #amr.nosub              = 1
 
 # CHECKPOINT FILES
-amr.checkpoint_files_output = 0 # no output
+amr.checkpoint_files_output = 0  # no output
 amr.check_file        = chk
-amr.check_int         = 200
-amr.checkpoint_nfiles = 128
+amr.check_int         = 100
+amr.checkpoint_nfiles = 64
 
 # PLOTFILES
-amr.plot_files_output = 0
-#fab.format          = IEEE32
+amr.plot_files_output = 0  # no output
 fab.format          = NATIVE_32
 amr.plot_file       = plt
 amr.plot_int        = -1
-amr.plot_nfiles     = 128
-nyx.plot_z_values   = 5.4 5.0 4.6 4.2
-#nyx.plot_z_values   = 6.0 5.4 5.0 4.6 4.2 4.0 3.6 3.2 3.0 2.6 2.4 2.2 2.0 
+amr.plot_nfiles     = 64
+nyx.plot_z_values   = 7.0 6.0 5.0 4.0 3.0 2.0
 particles.write_in_plotfile = 1
 
 amr.plot_vars        = density xmom ymom zmom rho_e Temp phi_grav
diff --git a/Exec/Scaling/integrate_state_vode_3d.f90 b/Exec/Scaling/integrate_state_vode_3d.f90
deleted file mode 100644
index a866d3ac..00000000
--- a/Exec/Scaling/integrate_state_vode_3d.f90
+++ /dev/null
@@ -1,243 +0,0 @@
-subroutine integrate_state_vode(lo, hi, &
-                                state   , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, &
-                                diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, &
-                                a, half_dt, min_iter, max_iter)
-!
-!   Calculates the sources to be added later on.
-!
-!   Parameters
-!   ----------
-!   lo : double array (3)
-!       The low corner of the current box.
-!   hi : double array (3)
-!       The high corner of the current box.
-!   state_* : double arrays
-!       The state vars
-!   diag_eos_* : double arrays
-!       Temp and Ne
-!   src_* : doubles arrays
-!       The source terms to be added to state (iterative approx.)
-!   double array (3)
-!       The low corner of the entire domain
-!   a : double
-!       The current a
-!   half_dt : double
-!       time step size, in Mpc km^-1 s ~ 10^12 yr.
-!
-!   Returns
-!   -------
-!   state : double array (dims) @todo
-!       The state vars
-!
-    use amrex_fort_module, only : rt => amrex_real
-    use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, &
-                                   TEMP_COMP, NE_COMP, gamma_minus_1
-    use bl_constants_module, only: M_PI
-    use eos_params_module
-    use network
-    use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT
-    use fundamental_constants_module
-    use comoving_module, only: comoving_h, comoving_OmB
-    use atomic_rates_module, only: tabulate_rates, interp_to_this_z, YHELIUM
-    use vode_aux_module    , only: z_vode, i_vode, j_vode, k_vode
-
-    implicit none
-
-    integer         , intent(in) :: lo(3), hi(3)
-    integer         , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3
-    integer         , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3
-    real(rt), intent(inout) ::    state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR)
-    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2)
-    real(rt), intent(in)    :: a, half_dt
-    integer         , intent(inout) :: max_iter, min_iter
-
-    integer :: i, j, k
-    real(rt) :: z, rho
-    real(rt) :: T_orig, ne_orig, e_orig
-    real(rt) :: T_out , ne_out , e_out, mu, mean_rhob
-
-    z = 1.d0/a - 1.d0
-
-    z_vode = z
-    mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst)
-
-    ! Interpolate from the table to this redshift
-    call interp_to_this_z(z)
-
-    ! Note that (lo,hi) define the region of the box containing the grow cells
-    ! Do *not* assume this is just the valid region
-    ! apply heating-cooling to UEDEN and UEINT
-
-    do k = lo(3),hi(3)
-        do j = lo(2),hi(2)
-            do i = lo(1),hi(1)
-
-                ! Original values
-                rho     = state(i,j,k,URHO)
-                e_orig  = state(i,j,k,UEINT) / rho
-                T_orig  = diag_eos(i,j,k,TEMP_COMP)
-                ne_orig = diag_eos(i,j,k,  NE_COMP)
-
-                if (e_orig .lt. 0.d0) then
-                    print *,'negative e entering strang integration ',z, i,j,k, rho/mean_rhob, e_orig
-                    call bl_abort('bad e in strang')
-                end if
-
-                i_vode = i
-                j_vode = j
-                k_vode = k
-
-                call vode_wrapper(half_dt,rho,T_orig,ne_orig,e_orig, &
-                                              T_out ,ne_out ,e_out)
-
-                if (e_out .lt. 0.d0) then
-                    print *,'negative e exiting strang integration ',z, i,j,k, rho/mean_rhob, e_out
-                    T_out  = 10.0
-                    ne_out = 0.0
-                    mu     = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out)
-                    e_out  = T_out / (gamma_minus_1 * mp_over_kB * mu)
-                    call flush(6)
-!                    call bl_abort('bad e out of strang')
-                end if
-
-                ! Update (rho e) and (rho E)
-                state(i,j,k,UEINT) = state(i,j,k,UEINT) + rho * (e_out-e_orig)
-                state(i,j,k,UEDEN) = state(i,j,k,UEDEN) + rho * (e_out-e_orig)
-
-                ! Update T and ne (do not use stuff computed in f_rhs, per vode manual)
-                call nyx_eos_T_given_Re(T_out, ne_out, rho, e_out, a)
-                diag_eos(i,j,k,TEMP_COMP) = T_out
-                diag_eos(i,j,k,  NE_COMP) = ne_out
-
-            end do ! i
-        end do ! j
-    end do ! k
-
-end subroutine integrate_state_vode
-
-subroutine vode_wrapper(dt, rho_in, T_in, ne_in, e_in, T_out, ne_out, e_out)
-
-    use amrex_fort_module, only : rt => amrex_real
-    use vode_aux_module, only: rho_vode, T_vode, ne_vode, &
-                               i_vode, j_vode, k_vode
-
-    implicit none
-
-    real(rt), intent(in   ) :: dt
-    real(rt), intent(in   ) :: rho_in, T_in, ne_in, e_in
-    real(rt), intent(  out) ::         T_out,ne_out,e_out
-
-    ! Set the number of independent variables -- this should be just "e"
-    integer, parameter :: NEQ = 1
-  
-    ! Allocate storage for the input state
-    real(rt) :: y(NEQ)
-
-    ! Our problem is stiff, tell ODEPACK that. 21 means stiff, jacobian 
-    ! function is supplied, 22 means stiff, figure out my jacobian through 
-    ! differencing
-    integer, parameter :: MF_ANALYTIC_JAC = 21, MF_NUMERICAL_JAC = 22
-
-    ! Tolerance parameters:
-    !
-    !  itol specifies whether to use an single absolute tolerance for
-    !  all variables (1), or to pass an array of absolute tolerances, one
-    !  for each variable with a scalar relative tol (2), a scalar absolute
-    !  and array of relative tolerances (3), or arrays for both (4)
-    !  
-    !  The error is determined as e(i) = rtol*abs(y(i)) + atol, and must
-    !  be > 0.  
-    !
-    ! We will use arrays for both the absolute and relative tolerances, 
-    ! since we want to be easier on the temperature than the species
-
-    integer, parameter :: ITOL = 1
-    real(rt) :: atol(NEQ), rtol(NEQ)
-    
-    ! We want to do a normal computation, and get the output values of y(t)
-    ! after stepping though dt
-    integer, PARAMETER :: ITASK = 1
-  
-    ! istate determines the state of the calculation.  A value of 1 meeans
-    ! this is the first call to the problem -- this is what we will want.
-    ! Note, istate is changed over the course of the calculation, so it
-    ! cannot be a parameter
-    integer :: istate
-
-    ! we will override the maximum number of steps, so turn on the 
-    ! optional arguments flag
-    integer, parameter :: IOPT = 1
-    
-    ! declare a real work array of size 22 + 9*NEQ + 2*NEQ**2 and an
-    ! integer work array of since 30 + NEQ
-
-    integer, parameter :: LRW = 22 + 9*NEQ + 2*NEQ**2
-    real(rt)   :: rwork(LRW)
-    real(rt)   :: time
-    ! real(rt)   :: dt4
-    
-    integer, parameter :: LIW = 30 + NEQ
-    integer, dimension(LIW) :: iwork
-    
-    real(rt) :: rpar
-    integer          :: ipar
-
-    EXTERNAL jac, f_rhs
-    
-    logical, save :: firstCall = .true.
-
-    T_vode   = T_in
-    ne_vode  = ne_in
-    rho_vode = rho_in
-
-    ! We want VODE to re-initialize each time we call it
-    istate = 1
-    
-    rwork(:) = 0.d0
-    iwork(:) = 0
-    
-    ! Set the maximum number of steps allowed (the VODE default is 500)
-    iwork(6) = 2000
-    
-    ! Initialize the integration time
-    time = 0.d0
-    
-    ! We will integrate "e" in time. 
-    y(1) = e_in
-
-    ! Set the tolerances.  
-    atol(1) = 1.d-4 * e_in
-    rtol(1) = 1.d-4
-
-    ! call the integration routine
-    call dvode(f_rhs, NEQ, y, time, dt, ITOL, rtol, atol, ITASK, &
-               istate, IOPT, rwork, LRW, iwork, LIW, jac, MF_NUMERICAL_JAC, &
-               rpar, ipar)
-
-    e_out  = y(1)
-    T_out  = T_vode
-    ne_out = ne_vode
-
-    if (istate < 0) then
-       print *, 'istate = ', istate, 'at (i,j,k) ',i_vode,j_vode,k_vode
-       call bl_error("ERROR in vode_wrapper: integration failed")
-    endif
-
-!      print *,'Calling vode with 1/4 the time step'
-!      dt4 = 0.25d0  * dt
-!      y(1) = e_in
-
-!      do n = 1,4
-!         call dvode(f_rhs, NEQ, y, time, dt4, ITOL, rtol, atol, ITASK, &
-!                    istate, IOPT, rwork, LRW, iwork, LIW, jac, MF_NUMERICAL_JAC, &
-!                    rpar, ipar)
-!         if (istate < 0) then
-!            print *, 'doing subiteration ',n
-!            print *, 'istate = ', istate, 'at (i,j,k) ',i,j,k
-!            call bl_error("ERROR in vode_wrapper: sub-integration failed")
-!         end if
-
-!      end do
-!   endif
-
-end subroutine vode_wrapper
diff --git a/Source/AGN/AGN_sources.cpp b/Source/AGN/AGN_sources.cpp
index ffb4e76b..f7099018 100644
--- a/Source/AGN/AGN_sources.cpp
+++ b/Source/AGN/AGN_sources.cpp
@@ -17,7 +17,6 @@ Nyx::get_old_source (Real      old_time,
 
     MultiFab& S_old = get_old_data(State_Type);
     MultiFab& D_old = get_old_data(DiagEOS_Type);
-    const int num_comps = S_old.nComp();
 
     ext_src.setVal(0.);
 
@@ -30,8 +29,8 @@ Nyx::get_old_source (Real      old_time,
     Nyx::theAPC()->GetParticleData(part_data);
 
     for (FillPatchIterator 
-         Old_fpi (*this, S_old, 4, old_time, State_Type, Density, num_comps),
-         Old_dfpi(*this, D_old, 4, old_time, DiagEOS_Type, 0, 2);
+         Old_fpi (*this, S_old, 4, old_time, State_Type  , Density, S_old.nComp()),
+         Old_dfpi(*this, D_old, 4, old_time, DiagEOS_Type, 0      , D_old.nComp());
          Old_fpi.isValid();
          ++Old_fpi)
     {
@@ -71,7 +70,6 @@ Nyx::get_new_source (Real      old_time,
 
     MultiFab& S_old = get_old_data(State_Type);
     MultiFab& D_old = get_old_data(DiagEOS_Type);
-    const int num_comps = S_old.nComp();
 
     ext_src.setVal(0.);
 
@@ -87,10 +85,10 @@ Nyx::get_new_source (Real      old_time,
     std::cout << "AGN DATA(V) " << part_data[0] << " " << part_data[1] << " " << part_data[2] << std::endl;
     std::cout << "AGN DATA(A) " << part_data[3] << " " << part_data[4] << " " << part_data[5] << std::endl;
 
-    for (FillPatchIterator Old_fpi(*this, S_old, 4, old_time, State_Type, Density, num_comps),
-                           New_fpi(*this, S_old, 4, new_time, State_Type, Density, num_comps),
-                           Old_dfpi(*this, D_old, 4, old_time, DiagEOS_Type, 0, 2),
-                           New_dfpi(*this, D_old, 4, new_time, DiagEOS_Type, 0, 2);
+    for (FillPatchIterator Old_fpi( *this, S_old, 4, old_time, State_Type  , Density, S_old.nComp()),
+                           New_fpi( *this, S_old, 4, new_time, State_Type  , Density, S_old.nComp()),
+                           Old_dfpi(*this, D_old, 4, old_time, DiagEOS_Type, 0      , D_old.nComp()),
+                           New_dfpi(*this, D_old, 4, new_time, DiagEOS_Type, 0      , D_old.nComp());
          Old_fpi.isValid() && New_fpi.isValid() && Old_dfpi.isValid() && New_dfpi.isValid();
          ++Old_fpi, ++New_fpi, ++Old_dfpi, ++New_dfpi)
     {
diff --git a/Source/AGN/agn_3d.f90 b/Source/AGN/agn_3d.f90
index 23d111cd..787251eb 100644
--- a/Source/AGN/agn_3d.f90
+++ b/Source/AGN/agn_3d.f90
@@ -15,7 +15,7 @@ subroutine nyx_compute_overlap(np, particles, ng, ghosts, delta_x) &
 
     cutoff = delta_x(1)
     
-    do i = 1, np
+    do i = 1, np-1
        do j = i+1, np
 
           r2 = sum((particles(i)%pos - particles(j)%pos)**2)
@@ -205,8 +205,7 @@ subroutine agn_particle_velocity(np, particles, &
        j = particles(n)%pos(2) / dx(2)
        k = particles(n)%pos(3) / dx(3)
 
-       ! momx, momy, momz, E: momentum and total energy.
-
+       ! momx, momy, momz: momentum = volume x change in momentum density.
        momx = sum((state_new(i-1:i+1, j-1:j+1, k-1:k+1, UMX) - &
                    state_old(i-1:i+1, j-1:j+1, k-1:k+1, UMX)) * weight) * vol
        momy = sum((state_new(i-1:i+1, j-1:j+1, k-1:k+1, UMY) - &
@@ -224,6 +223,7 @@ subroutine agn_particle_velocity(np, particles, &
 
        ! Update particle energy if particle isn't brand new
        if (add_energy .gt. 0) then
+          ! E: total energy = volume x change in total energy density.
           E = sum((state_new(i-1:i+1, j-1:j+1, k-1:k+1, UEDEN) - &
                    state_old(i-1:i+1, j-1:j+1, k-1:k+1, UEDEN)) * weight) * vol
           deltaEnergy = - E / mass
@@ -406,7 +406,7 @@ subroutine agn_release_energy(np, particles, &
     use amrex_fort_module, only : amrex_real
     use fundamental_constants_module, only: k_B, m_proton
     use eos_module
-    use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, NE_COMP
+    use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, NDIAG, NE_COMP
     use particle_mod      , only: agn_particle_t
     use eos_module, only : nyx_eos_given_RT
     use agn_params_module, only : T_min
@@ -417,7 +417,7 @@ subroutine agn_release_energy(np, particles, &
     real(amrex_real),     intent(inout)        :: state &
          (slo(1):shi(1),slo(2):shi(2),slo(3):shi(3),NVAR)
     real(amrex_real),     intent(inout)        :: diag_eos &
-         (dlo(1):dhi(1),dlo(2):dhi(2),dlo(3):dhi(3),2)
+         (dlo(1):dhi(1),dlo(2):dhi(2),dlo(3):dhi(3),NDIAG)
     real(amrex_real),     intent(in   )        :: a
     real(amrex_real),     intent(in   )        :: dx(3)
 
@@ -443,14 +443,18 @@ subroutine agn_release_energy(np, particles, &
 
        call nyx_eos_given_RT(e, pressure, avg_rho, T_min, avg_Ne, a)
 
-          print *, 'neighborhood mass: ', m_g
-          print *, 'e = ', e
-          print *, 'particle energy: ', particles(n)%energy
-          print *, 'm_g * e = ', (m_g * e)
+!       print *, 'AGN particle at ', particles(n)%pos, ':', i, j, k
+       print 50, particles(n)%pos, i, j, k, particles(n)%mass, &
+            particles(n)%energy
+50     format (1x, 'AGN particle at ', 3F8.3, 3I4, ' m=', E12.5, ' e=', E12.5)
 
        if (particles(n)%energy > m_g * e) then
 
           print *, 'RELEASING ENERGY of particle at ', particles(n)%pos
+          print *, 'neighborhood mass: ', m_g
+          print *, 'e = ', e
+          print *, 'particle energy: ', particles(n)%energy
+          print *, 'm_g * e = ', (m_g * e)
 
           state(i-1:i+1, j-1:j+1, k-1:k+1, UEDEN) = &
           state(i-1:i+1, j-1:j+1, k-1:k+1, UEDEN) + &
diff --git a/Source/DarkMatterParticleContainer.H b/Source/DarkMatterParticleContainer.H
index df1014bc..eb649a2e 100644
--- a/Source/DarkMatterParticleContainer.H
+++ b/Source/DarkMatterParticleContainer.H
@@ -21,6 +21,7 @@ public:
     }
 
     using MyParIter = amrex::ParIter<1+BL_SPACEDIM>;
+    using MyConstParIter = amrex::ParConstIter<1+BL_SPACEDIM>;
 
     virtual ~DarkMatterParticleContainer () {}
 
@@ -39,6 +40,8 @@ public:
     virtual void moveKick      (amrex::MultiFab& acceleration, int level, amrex::Real timestep,
                                 amrex::Real a_new = 1.0, amrex::Real a_half = 1.0);
 
+    void InitFromBinaryMortonFile(const std::string& particle_directory, int nextra, int skip_factor);
+
 };
 
 #endif /* _DarkMatterParticleContainer_H_ */
diff --git a/Source/DarkMatterParticleContainer.cpp b/Source/DarkMatterParticleContainer.cpp
index 3ea2cb65..219a02db 100644
--- a/Source/DarkMatterParticleContainer.cpp
+++ b/Source/DarkMatterParticleContainer.cpp
@@ -1,8 +1,78 @@
+#include <stdint.h>
+
 #include "DarkMatterParticleContainer.H"
 #include "dm_F.H"
 
 using namespace amrex;
 
+/// Helpers used when initializing from a Morton-ordered (Z-order) binary
+/// particle file.  Everything in here is private to this translation unit.
+namespace {
+
+  /// Spread the low 21 bits of `a` so that input bit i lands on output bit 3*i.
+  inline uint64_t split(unsigned int a) {
+    uint64_t x = a & 0x1fffff;  // 3 * 21 = 63 bits: three inputs fit in one key
+    x = (x | x << 32) & 0x1f00000000ffff;
+    x = (x | x << 16) & 0x1f0000ff0000ff;
+    x = (x | x << 8)  & 0x100f00f00f00f00f;
+    x = (x | x << 4)  & 0x10c30c30c30c30c3;
+    x = (x | x << 2)  & 0x1249249249249249;
+    return x;
+  }
+
+  /// Interleave the bits of x, y and z into one key; x supplies the lowest bit.
+  inline uint64_t get_morton_index(unsigned int x, unsigned int y, unsigned int z) {
+    return split(x) | (split(y) << 1) | (split(z) << 2);
+  }
+
+  /// A box's index in the BoxArray paired with the Morton key used to rank it.
+  struct BoxMortonKey {
+    uint64_t morton_id;
+    int box_id;
+  };
+
+  /// Strict weak ordering of BoxMortonKeys by ascending Morton key.
+  struct by_morton_id {
+    bool operator()(const BoxMortonKey &a, const BoxMortonKey &b) const {
+      return a.morton_id < b.morton_id;
+    }
+  };
+
+  /// Returns `base` with the decimal value of `file_num` appended.
+  std::string get_file_name(const std::string& base, int file_num) {
+    std::stringstream ss;
+    ss << base << file_num;
+    return ss.str();
+  }
+
+  /// The five integers of the text header, in file order.  Zero-initialized so
+  /// that a failed parse never leaves indeterminate values behind.
+  struct ParticleMortonFileHeader {
+    long NP = 0;  ///< total number of particles
+    int  DM = 0;  ///< position floats stored per particle record
+    int  NX = 0;  ///< extra floats stored per particle record
+    int  SZ = 0;  ///< size in bytes of one stored float
+    int  NF = 0;  ///< number of particle files
+  };
+
+  /// Parses `dir`/`file` into `hdr`; aborts if any of the five fields is missing.
+  void ReadHeader(const std::string& dir,
+                  const std::string& file,
+                  ParticleMortonFileHeader& hdr) {
+    const std::string header_filename = dir + "/" + file;
+
+    Array<char> fileCharPtr;
+    ParallelDescriptor::ReadAndBcastFile(header_filename, fileCharPtr);
+    std::string fileCharPtrString(fileCharPtr.dataPtr());
+    std::istringstream HdrFile(fileCharPtrString, std::istringstream::in);
+
+    HdrFile >> hdr.NP >> hdr.DM >> hdr.NX >> hdr.SZ >> hdr.NF;
+    if (HdrFile.fail())
+      amrex::Abort(("Malformed particle header file " + header_filename).c_str());
+  }
+
+}
+
 void
 DarkMatterParticleContainer::moveKickDrift (amrex::MultiFab&       acceleration,
 		                            int                    lev,
@@ -610,3 +680,108 @@ DarkMatterParticleContainer::AssignDensityAndVels (Array<std::unique_ptr<MultiFa
 {
      AssignDensity(mf, lev_min, BL_SPACEDIM+1);
 }
+
+/// Initialize level-0 particles from a set of Morton-ordered binary files.
+/// Reads `particle_directory`/Header, then for each local grid reads that grid's
+/// run of records from the `particle_directory`/particles.<n> files.  Every
+/// `skip_factor`-th record is kept and its first extra component is multiplied
+/// by `skip_factor`.  `nextra` is currently unused.  Aborts on invalid input.
+void
+DarkMatterParticleContainer::InitFromBinaryMortonFile(const std::string& particle_directory,
+                                                      int nextra, int skip_factor) {
+  BL_PROFILE("DarkMatterParticleContainer::InitFromBinaryMortonFile");
+
+  ParticleMortonFileHeader hdr;
+  ReadHeader(particle_directory, "Header", hdr);
+
+  const uint64_t num_parts = hdr.NP;
+  const int DM             = hdr.DM;
+  const int NX             = hdr.NX;
+  const int num_files      = hdr.NF;
+  const size_t psize       = (DM + NX) * hdr.SZ;
+
+  const int lev = 0;
+  const BoxArray& ba = ParticleBoxArray(lev);
+  const int num_boxes = ba.size();
+
+  // Reject inputs that would divide by zero or overrun a buffer further down.
+  if (skip_factor < 1)
+    amrex::Abort("InitFromBinaryMortonFile: skip_factor must be >= 1");
+  if (num_boxes < 1 || num_files < 1 || num_parts < static_cast<uint64_t>(num_files))
+    amrex::Abort("InitFromBinaryMortonFile: need a box and at least one particle per file");
+  if (DM < BL_SPACEDIM || NX < 0 || hdr.SZ != static_cast<int>(sizeof(float)))
+    amrex::Abort("InitFromBinaryMortonFile: unsupported record layout in Header");
+
+  const uint64_t num_parts_per_box  = num_parts / num_boxes;
+  const uint64_t num_parts_per_file = num_parts / num_files;
+  const uint64_t num_bytes_per_file = num_parts_per_file * psize;
+
+  const std::string particle_file_base = particle_directory + "/particles.";
+  std::vector<std::string> file_names;
+  for (int i = 0; i < num_files; ++i)
+    file_names.push_back(get_file_name(particle_file_base, i));
+
+  // Rank boxes by the Morton key of their low corner; the rank selects the run read.
+  std::vector<BoxMortonKey> box_morton_keys(num_boxes);
+  for (int i = 0; i < num_boxes; ++i) {
+    const Box& box = ba[i];
+    box_morton_keys[i].morton_id = get_morton_index(box.smallEnd(0), box.smallEnd(1), box.smallEnd(2));
+    box_morton_keys[i].box_id = i;
+  }
+  std::sort(box_morton_keys.begin(), box_morton_keys.end(), by_morton_id());
+
+  std::vector<int> file_indices(num_boxes);
+  for (int i = 0; i < num_boxes; ++i)
+    file_indices[box_morton_keys[i].box_id] = i;
+
+  // (Re)open `in` on `name`; the message goes through amrex::Abort, not amrex::Print.
+  auto open_or_abort = [] (std::ifstream& in, const std::string& name) {
+    if (in.is_open()) in.close();
+    in.open(name.c_str(), std::ios::in|std::ios::binary);
+    if (!in)
+      amrex::Abort(("Failed to open file " + name + " for reading.").c_str());
+  };
+
+  std::vector<float> fpos(DM), fextra(NX);  // heap buffers: VLAs are not standard C++
+  auto& particles = GetParticles(lev);
+  ParticleType p;
+  for (MFIter mfi = MakeMFIter(lev); mfi.isValid(); ++mfi) {
+    const int grid = mfi.index();
+    const int tile = mfi.LocalTileIndex();
+    const uint64_t start = file_indices[grid]*num_parts_per_box;
+    const uint64_t stop  = start + num_parts_per_box;
+    std::ifstream ifs;
+    int file_num = -1;  // no file open yet
+    for (uint64_t i = start; i < stop; ++i) {
+      const int next_file = i / num_parts_per_file;
+      if (next_file != file_num) {
+        if (next_file >= num_files)
+          amrex::Abort("InitFromBinaryMortonFile: particle count is not a multiple of the file count");
+        file_num = next_file;
+        open_or_abort(ifs, file_names[file_num]);
+        if (i == start)  // only the first file of a run is entered part-way through
+          ifs.seekg((start * psize) % num_bytes_per_file, std::ios::beg);
+      }
+
+      ifs.read(reinterpret_cast<char*>(fpos.data()),   DM*sizeof(float));
+      ifs.read(reinterpret_cast<char*>(fextra.data()), NX*sizeof(float));
+      if (!ifs)
+        amrex::Abort(("Short read from file " + file_names[file_num]).c_str());
+
+      if ( (i - start) % skip_factor == 0 ) {
+        AMREX_D_TERM(p.m_rdata.pos[0] = fpos[0];,
+                     p.m_rdata.pos[1] = fpos[1];,
+                     p.m_rdata.pos[2] = fpos[2];);
+        for (int comp = 0; comp < NX; comp++)
+          p.m_rdata.arr[BL_SPACEDIM+comp] = fextra[comp];
+        p.m_rdata.arr[BL_SPACEDIM] *= skip_factor;
+        p.m_idata.id  = ParticleType::NextID();
+        p.m_idata.cpu = ParallelDescriptor::MyProc();
+        particles[std::make_pair(grid, tile)].push_back(p);
+      }
+    }
+  }
+
+  Redistribute();
+}
+
diff --git a/Source/EOS/Make.package b/Source/EOS/Make.package
index a9b60772..b03797a6 100644
--- a/Source/EOS/Make.package
+++ b/Source/EOS/Make.package
@@ -5,3 +5,4 @@ f90EXE_sources += eos_stuff.f90
 endif
 
 f90EXE_sources += atomic_rates.f90
+f90EXE_sources += reion_aux_module.f90
diff --git a/Source/EOS/atomic_rates.f90 b/Source/EOS/atomic_rates.f90
index f25409bc..425b877c 100644
--- a/Source/EOS/atomic_rates.f90
+++ b/Source/EOS/atomic_rates.f90
@@ -19,14 +19,11 @@ module atomic_rates_module
 
   implicit none
 
-  ! Routine which acts like a class constructor
-  public  :: tabulate_rates, interp_to_this_z
-
   ! Photo- rates (from file)
-  integer   , parameter          , private :: NCOOLFILE=301
-  real(rt), dimension(NCOOLFILE), public :: lzr
-  real(rt), dimension(NCOOLFILE), public :: rggh0, rgghe0, rgghep
-  real(rt), dimension(NCOOLFILE), public :: reh0, rehe0, rehep
+  integer, private :: NCOOLFILE
+  real(rt), dimension(:), allocatable, private :: lzr
+  real(rt), dimension(:), allocatable, private :: rggh0, rgghe0, rgghep
+  real(rt), dimension(:), allocatable, private :: reh0, rehe0, rehep
 
   ! Other rates (from equations)
   integer, parameter, public :: NCOOLTAB=2000
@@ -38,10 +35,13 @@ module atomic_rates_module
   real(rt), public, save :: this_z, ggh0, gghe0, gghep, eh0, ehe0, ehep
  
   real(rt), parameter, public :: TCOOLMIN = 0.0d0, TCOOLMAX = 9.0d0  ! in log10
+  real(rt), parameter, public :: TCOOLMIN_R = 10.0d0**TCOOLMIN, TCOOLMAX_R = 10.0d0**TCOOLMAX
   real(rt), parameter, public :: deltaT = (TCOOLMAX - TCOOLMIN)/NCOOLTAB
 
   real(rt), parameter, public :: MPROTON = 1.6726231d-24, BOLTZMANN = 1.3806e-16
 
+  real(rt), public, save :: uvb_density_A = 1.0d0, uvb_density_B = 0.0d0, mean_rhob
+
   ! Note that XHYDROGEN can be set by a call to set_xhydrogen which now
   ! lives in set_method_params.
   real(rt), public :: XHYDROGEN = 0.76d0
@@ -49,20 +49,114 @@ module atomic_rates_module
 
   contains
 
-      subroutine tabulate_rates()
-      integer :: i
+      subroutine fort_tabulate_rates() bind(C, name='fort_tabulate_rates')
+      use parallel, only: parallel_ioprocessor
+      use amrex_parmparse_module
+      use bl_constants_module, only: M_PI
+      use fundamental_constants_module, only: Gconst
+      use comoving_module, only: comoving_h,comoving_OmB
+      use reion_aux_module, only: zhi_flash, zheii_flash, T_zhi, T_zheii, &
+                                  flash_h, flash_he, inhomogeneous_on
+
+      integer :: i, inhomo_reion
       logical, parameter :: Katz96=.false.
       real(rt), parameter :: t3=1.0d3, t5=1.0d5, t6=1.0d6
-      real(rt) :: t, U, E, y, sqrt_t, corr_term
+      real(rt) :: t, U, E, y, sqrt_t, corr_term, tmp
       logical, save :: first=.true.
 
-      !$OMP CRITICAL(TREECOOL_READ)
+      character(len=:), allocatable :: file_in
+      type(amrex_parmparse) :: pp
+
       if (first) then
 
          first = .false.
 
-         ! Read in photoionization rates and heating from a file
-         open(unit=11,file='TREECOOL_middle',status='old')
+         inhomo_reion = 0  ! default: pp%query leaves a variable untouched if its key is absent
+         call amrex_parmparse_build(pp, "nyx")  ! get info from inputs
+         call pp%query("inhomo_reion"             , inhomo_reion)
+         call pp%query("uvb_rates_file"           , file_in)
+         call pp%query("uvb_density_A"            , uvb_density_A)
+         call pp%query("uvb_density_B"            , uvb_density_B)
+         call pp%query("reionization_zHI_flash"   , zhi_flash)
+         call pp%query("reionization_zHeII_flash" , zheii_flash)
+         call pp%query("reionization_T_zHI"       , T_zhi)
+         call pp%query("reionization_T_zHeII"     , T_zheii)
+         call amrex_parmparse_destroy(pp)
+
+         if (parallel_ioprocessor()) then
+            print*, 'TABULATE_RATES: reionization parameters are:'
+            print*, '    reionization_zHI_flash     = ', zhi_flash
+            print*, '    reionization_zHeII_flash   = ', zheii_flash
+            print*, '    reionization_T_zHI         = ', T_zhi
+            print*, '    reionization_T_zHeII       = ', T_zheii
+
+            print*, 'TABULATE_RATES: rho-dependent heating parameters are:'
+            print*, '    A       = ', uvb_density_A
+            print*, '    B       = ', uvb_density_B
+            print*, '    UVB heating rates will be multiplied by A*(rho/rho_mean)**B'
+        endif
+
+        ! Save mean density (in code units) for density-dependent heating
+        mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst)
+
+         ! Set options in reion_aux_module
+         !   Hydrogen reionization
+         if (zhi_flash .gt. 0.0) then
+            if (inhomo_reion .gt. 0) then
+               if (parallel_ioprocessor()) print*, 'TABULATE_RATES: ignoring reionization_zHI, as nyx.inhomo_reion > 0'
+               flash_h = .false.
+               inhomogeneous_on = .true.
+            else
+               flash_h = .true.
+               inhomogeneous_on = .false.
+            endif
+         else
+            flash_h = .false.
+            if (inhomo_reion .gt. 0) then
+               inhomogeneous_on = .true.
+            else
+               inhomogeneous_on = .false.
+            endif
+         endif
+
+         !   Helium reionization
+         if (zheii_flash .gt. 0.0) then
+            flash_he = .true.
+         else
+            flash_he = .false.
+         endif
+
+         if (parallel_ioprocessor()) then
+            print*, 'TABULATE_RATES: reionization flags are set to:'
+            print*, '    Hydrogen flash            = ', flash_h
+            print*, '    Helium   flash            = ', flash_he
+            print*, '    inhomogeneous_on (H only) = ', inhomogeneous_on
+         endif
+
+
+         ! Read in UVB rates from a file
+         if (len(file_in) .gt. 0) then
+            open(unit=11, file=file_in, status='old')
+            if (parallel_ioprocessor()) then
+               print*, 'TABULATE_RATES: UVB file is set in inputs ('//file_in//').'
+            endif
+         else
+            open(unit=11, file='TREECOOL', status='old')
+            if (parallel_ioprocessor()) then
+               print*, 'TABULATE_RATES: UVB file is defaulted to "TREECOOL".'
+            endif
+         endif
+
+         NCOOLFILE = 0
+         do
+            read(11,*,end=10) tmp, tmp, tmp, tmp, tmp,  tmp, tmp
+            NCOOLFILE = NCOOLFILE + 1
+         end do
+         10 rewind(11)
+
+         allocate( lzr(NCOOLFILE), rggh0(NCOOLFILE), rgghe0(NCOOlFILE), rgghep(NCOOLFILE) )
+         allocate( reh0(NCOOLFILE), rehe0(NCOOLFILE), rehep(NCOOLFILE) )
+
          do i = 1, NCOOLFILE
             read(11,*) lzr(i), rggh0(i), rgghe0(i), rgghep(i), &
                                 reh0(i),  rehe0(i),  rehep(i)
@@ -177,19 +271,21 @@ subroutine tabulate_rates()
          endif  ! Katz rates
 
       end if  ! first_call
-      !$OMP END CRITICAL(TREECOOL_READ)
 
-      end subroutine tabulate_rates
+      end subroutine fort_tabulate_rates
 
       ! ****************************************************************************
 
-      subroutine interp_to_this_z(z)
+      subroutine fort_interp_to_this_z(z) bind(C, name='fort_interp_to_this_z')
+
+      use vode_aux_module, only: z_vode
 
       real(rt), intent(in) :: z
       real(rt) :: lopz, fact
       integer :: i, j
 
       this_z = z
+      z_vode = z
       lopz   = dlog10(1.0d0 + z)
 
       if (lopz .ge. lzr(NCOOLFILE)) then
@@ -222,26 +318,6 @@ subroutine interp_to_this_z(z)
       ehe0  = rehe0(j)  + (rehe0(j+1)-rehe0(j))*fact
       ehep  = rehep(j)  + (rehep(j+1)-rehep(j))*fact
 
-      end subroutine interp_to_this_z
+      end subroutine fort_interp_to_this_z
 
 end module atomic_rates_module
-
-! *************************************************************************************
-! This must live outside of atomic_rates module so it can be called by the C++
-! *************************************************************************************
-
-subroutine fort_init_this_z(comoving_a) &
-    bind(C, name="fort_init_this_z")
-
-    use amrex_fort_module, only : rt => amrex_real
-    use atomic_rates_module
-
-    implicit none
-
-    real(rt), intent(in   ) :: comoving_a
-    real(rt)                :: z
-
-    z = 1.d0/comoving_a - 1.d0
-    call interp_to_this_z(z)
-
-end subroutine fort_init_this_z
diff --git a/Source/EOS/eos_hc.f90 b/Source/EOS/eos_hc.f90
index 1f2ffaba..d3274293 100644
--- a/Source/EOS/eos_hc.f90
+++ b/Source/EOS/eos_hc.f90
@@ -10,16 +10,34 @@
 module eos_module
 
   use amrex_fort_module, only : rt => amrex_real
+  use iso_c_binding, only: c_double
 
   implicit none
 
   ! Routines:
-  public  :: nyx_eos_given_RT, nyx_eos_T_given_Re, eos_init_small_pres
-  public  :: nyx_eos_nh0_and_nhep, iterate_ne
+  public  :: nyx_eos_given_RT, nyx_eos_given_RT_vec, nyx_eos_T_given_Re, nyx_eos_T_given_Re_vec, eos_init_small_pres
+  public  :: nyx_eos_nh0_and_nhep, iterate_ne, iterate_ne_vec
   private :: ion_n
 
+  real(rt), public :: xacc ! EOS Newton-Raphson convergence tolerance
+  real(c_double), public :: vode_rtol, vode_atol_scaled ! VODE integration tolerances
+
   contains
 
+      subroutine fort_setup_eos_params (xacc_in, vode_rtol_in, vode_atol_scaled_in) &
+                                       bind(C, name='fort_setup_eos_params')
+        ! C-interoperable setter: copies the three tolerances into this module's
+        ! public variables.  rt and implicit none are inherited from the module.
+        real(rt), intent(in) :: xacc_in, vode_rtol_in, vode_atol_scaled_in
+
+        vode_atol_scaled = vode_atol_scaled_in
+        vode_rtol        = vode_rtol_in
+        xacc             = xacc_in
+
+      end subroutine fort_setup_eos_params
+
+     ! ****************************************************************************
+
       subroutine eos_init_small_pres(R, T, Ne, P, a)
 
         use amrex_fort_module, only : rt => amrex_real
@@ -92,11 +110,11 @@ subroutine nyx_eos_given_RT(e, P, R, T, Ne, a)
         use meth_params_module, only: gamma_minus_1
         implicit none
 
-        real(rt),          intent(  out) :: e, P
-        real(rt),          intent(in   ) :: R, T, Ne
-        real(rt),          intent(in   ) :: a
+        double precision,          intent(  out) :: e, P
+        double precision,          intent(in   ) :: R, T, Ne
+        double precision,          intent(in   ) :: a
 
-        real(rt) :: mu
+        double precision :: mu
 
         mu = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+Ne)
         e  = T / (gamma_minus_1 * mp_over_kB * mu)
@@ -105,20 +123,48 @@ subroutine nyx_eos_given_RT(e, P, R, T, Ne, a)
 
       end subroutine nyx_eos_given_RT
 
-      ! ****************************************************************************
+     ! ****************************************************************************
+
+      subroutine nyx_eos_given_RT_vec(e, P, R, T, Ne, a, veclen)
+        ! Array form of the mu / e / P relations: fills e(1:veclen) and P(1:veclen) from R, T and Ne.
+        use atomic_rates_module, ONLY: YHELIUM
+        use fundamental_constants_module, only: mp_over_kb
+        use meth_params_module, only: gamma_minus_1
+        implicit none
+
+        integer, intent(in) :: veclen                           ! number of entries in each array argument
+        real(rt), dimension(veclen), intent(  out) :: e, P
+        real(rt), dimension(veclen), intent(in   ) :: R, T, Ne
+        real(rt),          intent(in   ) :: a                   ! not referenced in this routine
+
+        real(rt), dimension(veclen) :: mu                       ! per-entry work array, see loop below
+        integer :: i
 
-      subroutine nyx_eos_T_given_Re(T, Ne, R_in, e_in, a)
+        do i = 1, veclen
+          mu(i) = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+Ne(i))
+          e(i)  = T(i) / (gamma_minus_1 * mp_over_kB * mu(i))
+  
+          P(i)  = gamma_minus_1 * R(i) * e(i)
+        end do
+
+      end subroutine nyx_eos_given_RT_vec
+
+     ! ****************************************************************************
+
+      subroutine nyx_eos_T_given_Re(JH, JHe, T, Ne, R_in, e_in, a, species)
 
       use atomic_rates_module, ONLY: XHYDROGEN, MPROTON
       use fundamental_constants_module, only: density_to_cgs, e_to_cgs
 
       ! In/out variables
-      real(rt),           intent(inout) :: T, Ne
-      real(rt),           intent(in   ) :: R_in, e_in
-      real(rt),           intent(in   ) :: a
+      integer,    intent(in)    :: JH, JHe
+      real(rt),   intent(inout) :: T, Ne
+      real(rt),   intent(in   ) :: R_in, e_in
+      real(rt),   intent(in   ) :: a
+      real(rt), optional, intent(out) :: species(5)
 
-      real(rt) :: nh, nh0, nhep, nhp, nhe0, nhepp
-      real(rt) :: z, rho, U
+      double precision :: nh, nh0, nhep, nhp, nhe0, nhepp
+      double precision :: z, rho, U
 
       ! This converts from code units to CGS
       rho = R_in * density_to_cgs / a**3
@@ -127,18 +173,55 @@ subroutine nyx_eos_T_given_Re(T, Ne, R_in, e_in, a)
 
       z   = 1.d0/a - 1.d0
 
-      call iterate_ne(z, U, T, nh, ne, nh0, nhp, nhe0, nhep, nhepp)
+      call iterate_ne(JH, Jhe, z, U, T, nh, ne, nh0, nhp, nhe0, nhep, nhepp)
+
+      if (present(species)) then
+         species(1) = nh0
+         species(2) = nhp
+         species(3) = nhe0
+         species(4) = nhep
+         species(5) = nhepp
+      endif
 
       end subroutine nyx_eos_T_given_Re
 
-      ! ****************************************************************************
+     ! ****************************************************************************
 
-      subroutine nyx_eos_nh0_and_nhep(z, rho, e, nh0, nhep)
-      ! This is for skewers analysis code, input is in CGS
+      subroutine nyx_eos_T_given_Re_vec(T, Ne, R_in, e_in, a, veclen)
 
+      use amrex_fort_module, only : rt => amrex_real
       use atomic_rates_module, ONLY: XHYDROGEN, MPROTON
+      use fundamental_constants_module, only: density_to_cgs, e_to_cgs
+      ! Array driver: converts R_in/e_in to CGS, then lets iterate_ne_vec fill T and Ne for veclen entries.
+      ! In/out variables
+      integer, intent(in) :: veclen
+      real(rt), dimension(veclen), intent(inout) :: T, Ne      ! both are overwritten by iterate_ne_vec
+      real(rt), dimension(veclen), intent(in   ) :: R_in, e_in
+      real(rt),                    intent(in   ) :: a
+
+      real(rt), dimension(veclen) :: nh, nh0, nhep, nhp, nhe0, nhepp, rho, U  ! species results are not returned to the caller
+      real(rt) :: z
+
+      ! This converts from code units to CGS
+      rho = R_in * density_to_cgs / a**3
+        U = e_in * e_to_cgs
+      nh  = rho*XHYDROGEN/MPROTON
+
+      z   = 1.d0/a - 1.d0  ! same scalar z is used for every entry
+
+      call iterate_ne_vec(z, U, T, nh, ne, nh0, nhp, nhe0, nhep, nhepp, veclen)
+
+      end subroutine nyx_eos_T_given_Re_vec
+
+     ! ****************************************************************************
+
+      subroutine nyx_eos_nh0_and_nhep(JH, JHe, z, rho, e, nh0, nhep)
+      ! This is for skewers analysis code, input is in CGS
+
+      use atomic_rates_module, only: XHYDROGEN, MPROTON
 
       ! In/out variables
+      integer, intent(in) :: JH, Jhe
       real(rt),           intent(in   ) :: z, rho, e
       real(rt),           intent(  out) :: nh0, nhep
 
@@ -147,34 +230,308 @@ subroutine nyx_eos_nh0_and_nhep(z, rho, e, nh0, nhep)
       nh  = rho*XHYDROGEN/MPROTON
       ne  = 1.0d0 ! Guess
 
-      call iterate_ne(z, e, T, nh, ne, nh0, nhp, nhe0, nhep, nhepp)
+      call iterate_ne(JH, JHe, z, e, T, nh, ne, nh0, nhp, nhe0, nhep, nhepp)
 
       nh0  = nh*nh0
       nhep = nh*nhep
 
       end subroutine nyx_eos_nh0_and_nhep
 
-      ! ****************************************************************************
+     ! ****************************************************************************
+
+      subroutine iterate_ne_vec(z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp, veclen)
+      ! Newton-Raphson solve for ne in veclen cells at once; also fills t and the ion/neutral fractions.
+      use atomic_rates_module, ONLY: this_z, YHELIUM, BOLTZMANN, MPROTON, TCOOLMAX_R
+      use meth_params_module, only: gamma_minus_1
+      use amrex_error_module, only: amrex_abort
+
+      integer :: i
+
+      integer, intent(in) :: veclen
+      real(rt), intent (in   ) :: z
+      real(rt), dimension(veclen), intent(in) :: U, nh
+      real(rt), dimension(veclen), intent (inout) :: ne  ! incoming values are replaced by the 1.0 starting guess
+      real(rt), dimension(veclen), intent (  out) :: t, nh0, nhp, nhe0, nhep, nhepp
+
+      ! The tolerance xacc is the module variable set by fort_setup_eos_params, as in iterate_ne.
+
+      integer, dimension(veclen)  :: JH, JHe
+      real(rt), dimension(veclen) :: f, df, eps, mu
+      real(rt), dimension(veclen) :: nhp_plus, nhep_plus, nhepp_plus
+      real(rt), dimension(veclen) :: dnhp_dne, dnhep_dne, dnhepp_dne, dne
+      real(rt), dimension(veclen):: U_in, t_in, nh_in, ne_in
+      real(rt), dimension(veclen) :: nhp_out, nhep_out, nhepp_out
+      integer :: vec_count, orig_idx(veclen)
+      integer :: ii
+      character(len=128) :: errmsg
+
+      ! Check if we have interpolated to this z
+      if (abs(z-this_z) .gt. xacc*z) then
+          write(errmsg, *) "iterate_ne_vec(): Wrong redshift! z = ", z, " but this_z = ", this_z
+          call amrex_abort(errmsg)
+      end if
+      JH(:) = 1; JHe(:) = 1 ! flags are not arguments here: apply the UV background to every cell
+      ii = 0
+      ne(1:veclen) = 1.0d0 ! 0 is a bad guess
+
+      do  ! Newton-Raphson solver
+         ii = ii + 1
+
+         ! Ion number densities
+         do i = 1, veclen
+           mu(i) = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne(i))
+           t(i)  = gamma_minus_1*MPROTON/BOLTZMANN * U(i) * mu(i)
+         end do
+         vec_count = 0  ! gather the cells below TCOOLMAX_R into contiguous work arrays
+         do i = 1, veclen
+           if (t(i) .ge. TCOOLMAX_R) then ! Fully ionized plasma
+             nhp(i)   = 1.0d0
+             nhep(i)  = 0.0d0
+             nhepp(i) = YHELIUM
+           else
+             vec_count = vec_count + 1
+             U_in(vec_count) = U(i)
+             t_in(vec_count) = t(i)
+             nh_in(vec_count) = nh(i)
+             ne_in(vec_count) = ne(i)
+             orig_idx(vec_count) = i
+           endif
+         end do
+
+         call ion_n_vec(JH(1:vec_count), &
+                    JHe(1:vec_count), &
+                    U_in(1:vec_count), &
+                    nh_in(1:vec_count), &
+                    ne_in(1:vec_count), &
+                    nhp_out(1:vec_count), &
+                    nhep_out(1:vec_count), &
+                    nhepp_out(1:vec_count), &
+                    t_in(1:vec_count), &
+                    vec_count)
+         nhp(orig_idx(1:vec_count)) = nhp_out(1:vec_count)  ! scatter results back to their original cells
+         nhep(orig_idx(1:vec_count)) = nhep_out(1:vec_count)
+         nhepp(orig_idx(1:vec_count)) = nhepp_out(1:vec_count)
+
+         ! Forward difference derivatives
+         do i = 1, veclen
+           if (ne(i) .gt. 0.0d0) then
+              eps(i) = xacc*ne(i)
+           else
+              eps(i) = 1.0d-24
+           endif
+         end do
+         do i = 1, veclen
+           mu(i) = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne(i)+eps(i))
+           t(i)  = gamma_minus_1*MPROTON/BOLTZMANN * U(i) * mu(i)
+         end do
+         vec_count = 0  ! same gather as above, now at ne + eps
+         do i = 1, veclen
+           if (t(i) .ge. TCOOLMAX_R) then ! Fully ionized plasma
+             nhp_plus(i)   = 1.0d0
+             nhep_plus(i)  = 0.0d0
+             nhepp_plus(i) = YHELIUM
+           else
+             vec_count = vec_count + 1
+             U_in(vec_count) = U(i)
+             t_in(vec_count) = t(i)
+             nh_in(vec_count) = nh(i)
+             ne_in(vec_count) = ne(i)+eps(i)
+             orig_idx(vec_count) = i
+           endif
+         end do
+
+         call ion_n_vec(JH(1:vec_count), &
+                    JHe(1:vec_count), &
+                    U_in(1:vec_count), &
+                    nh_in(1:vec_count), &
+                    ne_in(1:vec_count), &
+                    nhp_out(1:vec_count), &
+                    nhep_out(1:vec_count), &
+                    nhepp_out(1:vec_count), &
+                    t_in(1:vec_count), &
+                    vec_count)
+         nhp_plus(orig_idx(1:vec_count)) = nhp_out(1:vec_count)
+         nhep_plus(orig_idx(1:vec_count)) = nhep_out(1:vec_count)
+         nhepp_plus(orig_idx(1:vec_count)) = nhepp_out(1:vec_count)
+
+         do i = 1, veclen
+           dnhp_dne(i)   = (nhp_plus(i)   - nhp(i))   / eps(i)
+           dnhep_dne(i)  = (nhep_plus(i)  - nhep(i))  / eps(i)
+           dnhepp_dne(i) = (nhepp_plus(i) - nhepp(i)) / eps(i)
+         end do
+
+         do i = 1, veclen
+           f(i)   = ne(i) - nhp(i) - nhep(i) - 2.0d0*nhepp(i)
+           df(i)  = 1.0d0 - dnhp_dne(i) - dnhep_dne(i) - 2.0d0*dnhepp_dne(i)
+           dne(i) = f(i)/df(i)
+         end do
+
+         do i = 1, veclen
+           ne(i) = max((ne(i)-dne(i)), 0.0d0)
+         end do
+         ! All cells iterate together: leave only when the largest update is below the tolerance.
+         if (maxval(abs(dne(1:veclen))) < xacc) exit
+         ! Abort through AMReX (as the redshift check above does) instead of a bare STOP.
+         if (ii .gt. 15) &
+            call amrex_abort('iterate_ne_vec(): No convergence in Newton-Raphson!')
+
+      enddo
+
+      ! Get rates for the final ne
+      do i = 1, veclen
+        mu(i) = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne(i))
+        t(i)  = gamma_minus_1*MPROTON/BOLTZMANN * U(i) * mu(i)
+      end do
+      vec_count = 0
+      do i = 1, veclen
+        if (t(i) .ge. TCOOLMAX_R) then ! Fully ionized plasma
+          nhp(i)   = 1.0d0
+          nhep(i)  = 0.0d0
+          nhepp(i) = YHELIUM
+        else
+          vec_count = vec_count + 1
+          U_in(vec_count) = U(i)
+          t_in(vec_count) = t(i)
+          nh_in(vec_count) = nh(i)
+          ne_in(vec_count) = ne(i)
+          orig_idx(vec_count) = i
+        endif
+      end do
+      call ion_n_vec(JH(1:vec_count), &
+                 JHe(1:vec_count), &
+                 U_in(1:vec_count), &
+                 nh_in(1:vec_count), &
+                 ne_in(1:vec_count), &
+                 nhp_out(1:vec_count), &
+                 nhep_out(1:vec_count), &
+                 nhepp_out(1:vec_count), &
+                 t_in(1:vec_count), &
+                 vec_count)
+      nhp(orig_idx(1:vec_count)) = nhp_out(1:vec_count)
+      nhep(orig_idx(1:vec_count)) = nhep_out(1:vec_count)
+      nhepp(orig_idx(1:vec_count)) = nhepp_out(1:vec_count)
+
+      ! Neutral fractions:
+      do i = 1, veclen
+        nh0(i)   = 1.0d0 - nhp(i)
+        nhe0(i)  = YHELIUM - (nhep(i) + nhepp(i))
+      end do
+      end subroutine iterate_ne_vec
+
+     ! ****************************************************************************
+
+      subroutine ion_n_vec(JH, JHe, U, nh, ne, nhp, nhep, nhepp, t, vec_count)
+      ! For vec_count entries: computes t from U and ne, interpolates the rate tables, and fills nhp, nhep, nhepp.
+      use amrex_fort_module, only : rt => amrex_real
+      use meth_params_module, only: gamma_minus_1
+      use atomic_rates_module, ONLY: YHELIUM, MPROTON, BOLTZMANN, &
+                                     TCOOLMIN, TCOOLMAX, NCOOLTAB, deltaT, &
+                                     AlphaHp, AlphaHep, AlphaHepp, Alphad, &
+                                     GammaeH0, GammaeHe0, GammaeHep, &
+                                     ggh0, gghe0, gghep
+
+      integer, intent(in) :: vec_count
+      integer, dimension(vec_count), intent(in) :: JH, JHe  ! integer multipliers applied to ggh0/gghe0 (JH) and gghep (JHe)
+      real(rt), intent(in   ) :: U(vec_count), nh(vec_count), ne(vec_count)
+      real(rt), intent(  out) :: nhp(vec_count), nhep(vec_count), nhepp(vec_count), t(vec_count)
+      real(rt) :: ahp(vec_count), ahep(vec_count), ahepp(vec_count), ad(vec_count), geh0(vec_count), gehe0(vec_count), gehep(vec_count)
+      real(rt) :: ggh0ne(vec_count), gghe0ne(vec_count), gghepne(vec_count)
+      real(rt) :: mu(vec_count), tmp(vec_count), logT(vec_count), flo(vec_count), fhi(vec_count)
+      real(rt), parameter :: smallest_val=tiny(1.0d0)
+      integer :: j(vec_count), i
+      ! t is intent(out): it is always recomputed here from U and ne.
+      mu(:) = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne(:))
+      t(:)  = gamma_minus_1*MPROTON/BOLTZMANN * U(:) * mu(:)
+
+      logT(1:vec_count) = dlog10(t(1:vec_count))
+
+      ! Temperature floor
+      do i = 1, vec_count
+        if (logT(i) .le. TCOOLMIN) logT(i) = TCOOLMIN + 0.5d0*deltaT
+      end do
+      ! NOTE(review): logT has no upper clamp here; j(i)+1 below assumes the tables extend that far - confirm.
+      ! Interpolate rates
+      do i = 1, vec_count
+        tmp(i) = (logT(i)-TCOOLMIN)/deltaT
+        j(i) = int(tmp(i))
+        fhi(i) = tmp(i) - j(i)
+        flo(i) = 1.0d0 - fhi(i)
+        j(i) = j(i) + 1 ! F90 arrays start with 1
+      end do
+      ! Linear interpolation between table entries j and j+1 with weights flo and fhi.
+      do i = 1, vec_count
+        ahp(i)   = flo(i)*AlphaHp  (j(i)) + fhi(i)*AlphaHp  (j(i)+1)
+        ahep(i)  = flo(i)*AlphaHep (j(i)) + fhi(i)*AlphaHep (j(i)+1)
+        ahepp(i) = flo(i)*AlphaHepp(j(i)) + fhi(i)*AlphaHepp(j(i)+1)
+        ad(i)    = flo(i)*Alphad   (j(i)) + fhi(i)*Alphad   (j(i)+1)
+        geh0(i)  = flo(i)*GammaeH0 (j(i)) + fhi(i)*GammaeH0 (j(i)+1)
+        gehe0(i) = flo(i)*GammaeHe0(j(i)) + fhi(i)*GammaeHe0(j(i)+1)
+        gehep(i) = flo(i)*GammaeHep(j(i)) + fhi(i)*GammaeHep(j(i)+1)
+      end do
+      ! ggh0/gghe0/gghep scaled by the J flags and divided by ne*nh; set to zero when ne <= 0 to avoid the division.
+      do i = 1, vec_count
+        if (ne(i) .gt. 0.0d0) then
+           ggh0ne(i)   = JH(i)  * ggh0  / (ne(i)*nh(i))
+           gghe0ne(i)  = JH(i)  * gghe0 / (ne(i)*nh(i))
+           gghepne(i)  = JHe(i) * gghep / (ne(i)*nh(i))
+        else
+           ggh0ne(i)   = 0.0d0
+           gghe0ne(i)  = 0.0d0
+           gghepne(i)  = 0.0d0
+        endif
+      end do
+
+      ! H+
+      do i = 1, vec_count
+        nhp(i) = 1.0d0 - ahp(i)/(ahp(i) + geh0(i) + ggh0ne(i))
+      end do
+
+      ! He+
+      do i = 1, vec_count
+        if ((gehe0(i) + gghe0ne(i)) .gt. smallest_val) then  ! guards the division by (gehe0 + gghe0ne)
+  
+           nhep(i)  = YHELIUM/(1.0d0 + (ahep(i)  + ad(i)     )/(gehe0(i) + gghe0ne(i)) &
+                                  + (gehep(i) + gghepne(i))/ahepp(i))
+        else
+           nhep(i)  = 0.0d0
+        endif
+      end do
+
+      ! He++
+      do i = 1, vec_count
+        if (nhep(i) .gt. 0.0d0) then
+           nhepp(i) = nhep(i)*(gehep(i) + gghepne(i))/ahepp(i)
+        else
+           nhepp(i) = 0.0d0
+        endif
+      end do
 
-      subroutine iterate_ne(z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp)
+      end subroutine ion_n_vec
 
-      use atomic_rates_module, ONLY: this_z, YHELIUM
+     ! ****************************************************************************
+
+      subroutine iterate_ne(JH, JHe, z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp)
+
+      use amrex_error_module, only: amrex_abort
+      use atomic_rates_module, only: this_z, YHELIUM
 
       integer :: i
 
+      integer, intent(in) :: JH, JHe
       real(rt), intent (in   ) :: z, U, nh
       real(rt), intent (inout) :: ne
       real(rt), intent (  out) :: t, nh0, nhp, nhe0, nhep, nhepp
 
-      real(rt), parameter :: xacc = 1.0d-6
-
       real(rt) :: f, df, eps
       real(rt) :: nhp_plus, nhep_plus, nhepp_plus
       real(rt) :: dnhp_dne, dnhep_dne, dnhepp_dne, dne
+      character(len=128) :: errmsg
 
       ! Check if we have interpolated to this z
-      if (abs(z-this_z) .gt. xacc*z) &
-          STOP 'iterate_ne(): Wrong redshift!'
+      if (abs(z-this_z) .gt. xacc*z) then
+          write(errmsg, *) "iterate_ne(): Wrong redshift! z = ", z, " but this_z = ", this_z
+          call amrex_abort(errmsg)
+      end if
 
       i = 0
       ne = 1.0d0 ! 0 is a bad guess
@@ -182,7 +539,7 @@ subroutine iterate_ne(z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp)
          i = i + 1
 
          ! Ion number densities
-         call ion_n(U, nh, ne, nhp, nhep, nhepp, t)
+         call ion_n(JH, JHe, U, nh, ne, nhp, nhep, nhepp, t)
 
          ! Forward difference derivatives
          if (ne .gt. 0.0d0) then
@@ -190,7 +547,7 @@ subroutine iterate_ne(z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp)
          else
             eps = 1.0d-24
          endif
-         call ion_n(U, nh, (ne+eps), nhp_plus, nhep_plus, nhepp_plus, t)
+         call ion_n(JH, JHe, U, nh, (ne+eps), nhp_plus, nhep_plus, nhepp_plus, t)
 
          dnhp_dne   = (nhp_plus   - nhp)   / eps
          dnhep_dne  = (nhep_plus  - nhep)  / eps
@@ -212,32 +569,34 @@ subroutine iterate_ne(z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp)
       enddo
 
       ! Get rates for the final ne
-      call ion_n(U, nh, ne, nhp, nhep, nhepp, t)
+      call ion_n(JH, JHe, U, nh, ne, nhp, nhep, nhepp, t)
 
       ! Neutral fractions:
       nh0   = 1.0d0 - nhp
       nhe0  = YHELIUM - (nhep + nhepp)
       end subroutine iterate_ne
 
-      ! ****************************************************************************
+     ! ****************************************************************************
 
-      subroutine ion_n(U, nh, ne, nhp, nhep, nhepp, t)
+      subroutine ion_n(JH, JHe, U, nh, ne, nhp, nhep, nhepp, t)
 
-      use meth_params_module, only: gamma_minus_1
-      use atomic_rates_module, ONLY: YHELIUM, MPROTON, BOLTZMANN, &
+      use meth_params_module,  only: gamma_minus_1
+      use atomic_rates_module, only: YHELIUM, MPROTON, BOLTZMANN, &
                                      TCOOLMIN, TCOOLMAX, NCOOLTAB, deltaT, &
                                      AlphaHp, AlphaHep, AlphaHepp, Alphad, &
                                      GammaeH0, GammaeHe0, GammaeHep, &
                                      ggh0, gghe0, gghep
 
+      integer, intent(in) :: JH, JHe
       real(rt), intent(in   ) :: U, nh, ne
       real(rt), intent(  out) :: nhp, nhep, nhepp, t
       real(rt) :: ahp, ahep, ahepp, ad, geh0, gehe0, gehep
       real(rt) :: ggh0ne, gghe0ne, gghepne
       real(rt) :: mu, tmp, logT, flo, fhi
-      real(rt) :: smallest_val
+      real(rt), parameter :: smallest_val=tiny(1.0d0)
       integer :: j
 
+
       mu = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne)
       t  = gamma_minus_1*MPROTON/BOLTZMANN * U * mu
 
@@ -268,9 +627,9 @@ subroutine ion_n(U, nh, ne, nhp, nhep, nhepp, t)
       gehep = flo*GammaeHep(j) + fhi*GammaeHep(j+1)
 
       if (ne .gt. 0.0d0) then
-         ggh0ne   = ggh0 /(ne*nh)
-         gghe0ne  = gghe0/(ne*nh)
-         gghepne  = gghep/(ne*nh)
+         ggh0ne   = JH  * ggh0  / (ne*nh)
+         gghe0ne  = JH  * gghe0 / (ne*nh)
+         gghepne  = JHe * gghep / (ne*nh)
       else
          ggh0ne   = 0.0d0
          gghe0ne  = 0.0d0
@@ -281,7 +640,6 @@ subroutine ion_n(U, nh, ne, nhp, nhep, nhepp, t)
       nhp = 1.0d0 - ahp/(ahp + geh0 + ggh0ne)
 
       ! He+
-      smallest_val = Tiny(1.0d0)
       if ((gehe0 + gghe0ne) .gt. smallest_val) then
 
          nhep  = YHELIUM/(1.0d0 + (ahep  + ad     )/(gehe0 + gghe0ne) &
@@ -299,4 +657,5 @@ subroutine ion_n(U, nh, ne, nhp, nhep, nhepp, t)
 
       end subroutine ion_n
 
+
 end module eos_module
diff --git a/Source/EOS/eos_stuff.f90 b/Source/EOS/eos_stuff.f90
index ebcf86a4..5eab4309 100644
--- a/Source/EOS/eos_stuff.f90
+++ b/Source/EOS/eos_stuff.f90
@@ -43,8 +43,8 @@ module eos_module
 
   private nspec, aion, zion
 
-  public eos_init_small_pres, nyx_eos_T_given_Re, nyx_eos_S_given_Re, &
-         nyx_eos_soundspeed, nyx_eos_given_RT, eos
+  public eos_init_small_pres, nyx_eos_T_given_Re, nyx_eos_T_given_Re_vec, nyx_eos_S_given_Re, &
+         nyx_eos_soundspeed, nyx_eos_given_RT, nyx_eos_given_RT_vec, eos
 
 contains
 
@@ -119,11 +119,12 @@ subroutine nyx_eos_soundspeed(c, R, e)
 
   end subroutine nyx_eos_soundspeed
 
-  subroutine nyx_eos_T_given_Re(T, Ne, R, e, comoving_a)
+  subroutine nyx_eos_T_given_Re(JH, JHe, T, Ne, R, e, comoving_a)
 
      use amrex_fort_module, only : rt => amrex_real
 
      ! In/out variables
+     integer, intent(in) :: JH, JHe ! stubs here
      real(rt),           intent(inout) :: T, Ne
      real(rt),           intent(in   ) :: R, e
      real(rt),           intent(in   ) :: comoving_a
@@ -390,4 +391,36 @@ subroutine eos(input, dens, temp, &
 
   end subroutine eos
 
+
+  subroutine nyx_eos_T_given_Re_vec(T, Ne, R_in, e_in, a, veclen)
+    ! Stub: the only executable statement is the amrex_abort call below; no argument is read or written.
+    use amrex_fort_module, only : rt => amrex_real
+    use amrex_error_module, only: amrex_abort
+  
+    ! In/out variables
+    integer, intent(in) :: veclen
+    real(rt), dimension(veclen), intent(inout) :: T, Ne
+    real(rt), dimension(veclen), intent(in   ) :: R_in, e_in
+    real(rt),                    intent(in   ) :: a
+  
+    call amrex_abort("nyx_eos_T_given_Re_vec supported only with USE_HEATCOOL=TRUE and USE_CVODE=TRUE")
+
+  end subroutine nyx_eos_T_given_Re_vec
+
+
+  subroutine nyx_eos_given_RT_vec(e, P, R, T, Ne, a, veclen)
+    ! Stub: the only executable statement is the amrex_abort call below; e and P are never assigned.
+    use amrex_fort_module, only : rt => amrex_real
+    use amrex_error_module, only: amrex_abort
+    implicit none
+  
+    integer, intent(in) :: veclen
+    real(rt), dimension(veclen), intent(  out) :: e, P
+    real(rt), dimension(veclen), intent(in   ) :: R, T, Ne
+    real(rt),          intent(in   ) :: a
+  
+    call amrex_abort("nyx_eos_given_RT_vec supported only with USE_HEATCOOL=TRUE and USE_CVODE=TRUE")
+
+  end subroutine nyx_eos_given_RT_vec
+
 end module eos_module
diff --git a/Source/EOS/reion_aux_module.f90 b/Source/EOS/reion_aux_module.f90
new file mode 100644
index 00000000..5b84345b
--- /dev/null
+++ b/Source/EOS/reion_aux_module.f90
@@ -0,0 +1,10 @@
+module reion_aux_module
+  ! Data-only module: declares saved module-level variables with defaults and defines no procedures.
+  use amrex_fort_module, only : rt => amrex_real
+  implicit none
+
+  ! Global variables (re)set on inputs
+  real(rt), save :: zhi_flash=-1.0, zheii_flash=-1.0, T_zhi=0.0, T_zheii=0.0  ! presumably reionization redshifts/temperatures - TODO confirm
+  logical, save  :: flash_h=.false., flash_he=.false., inhomogeneous_on=.false.  ! all switches default to .false.
+
+end module reion_aux_module
diff --git a/Source/Forcing/ext_src_force_3d.f90 b/Source/Forcing/ext_src_force_3d.f90
index 8db2dff0..26a7abe7 100644
--- a/Source/Forcing/ext_src_force_3d.f90
+++ b/Source/Forcing/ext_src_force_3d.f90
@@ -38,7 +38,6 @@ subroutine ext_src_force(lo, hi, old_state, os_l1, os_l2, os_l3, os_h1, os_h2, o
     use amrex_fort_module, only : rt => amrex_real
     use meth_params_module, only : NVAR, UMX, UMY, UMZ, UEDEN, UEINT
     use fundamental_constants_module
-    use atomic_rates_module, only: interp_to_this_z
 
     implicit none
 
diff --git a/Source/Forcing/integrate_state_force_3d.f90 b/Source/Forcing/integrate_state_force_3d.f90
index 70819f40..f9b82620 100644
--- a/Source/Forcing/integrate_state_force_3d.f90
+++ b/Source/Forcing/integrate_state_force_3d.f90
@@ -37,7 +37,7 @@ subroutine integrate_state_force(lo, hi, &
     use atomic_rates_module, only: XHYDROGEN
     use probdata_module, only: prob_lo, prob_hi, alpha, rho0, temp0
     use meth_params_module, only : NVAR, URHO, UMX, UMY, UMZ, UEDEN, UEINT, &
-                                   TEMP_COMP, NE_COMP, small_pres, small_temp, gamma_minus_1
+                                   NDIAG, TEMP_COMP, NE_COMP, small_pres, small_temp, gamma_minus_1
     use bl_constants_module, only : TWO, ONE, HALF, ZERO, M_PI, M_SQRT_2
     use fundamental_constants_module
  
@@ -50,7 +50,7 @@ subroutine integrate_state_force(lo, hi, &
     integer         , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3
     integer         , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3
     real(rt), intent(inout) ::    state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR)
-    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2)
+    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG)
     real(rt), intent(in)    :: dx(3), time, a, half_dt
 
     integer :: i, j, k
diff --git a/Source/Gravity/Gravity.H b/Source/Gravity/Gravity.H
index 0a19aab3..b6c0086f 100644
--- a/Source/Gravity/Gravity.H
+++ b/Source/Gravity/Gravity.H
@@ -147,7 +147,6 @@ protected:
     amrex::BCRec* phys_bc;
 
     static int verbose;
-    static int show_timings;
     static int no_sync;
     static int no_composite;
     static int dirichlet_bcs;
diff --git a/Source/HeatCool/Make.package b/Source/HeatCool/Make.package
index 4d690d8f..7ea7acf0 100644
--- a/Source/HeatCool/Make.package
+++ b/Source/HeatCool/Make.package
@@ -1,13 +1,14 @@
 ifeq ($(USE_HEATCOOL), TRUE)
-f90EXE_sources += cooling.f90
 f90EXE_sources += ext_src_hc_3d.f90
 f90EXE_sources += integrate_state_3d.f90
-f90EXE_sources += integrate_state_hc_3d.f90
 f90EXE_sources += integrate_state_vode_3d.f90
 ifeq ($(USE_CVODE), TRUE)
+  f90EXE_sources += fcvode_extras.f90
   f90EXE_sources += integrate_state_fcvode_3d.f90
+  f90EXE_sources += integrate_state_fcvode_vec_3d.f90
 else
   f90EXE_sources += integrate_state_fcvode_3d_stubs.f90
+  f90EXE_sources += integrate_state_fcvode_vec_3d_stubs.f90
 endif
 f90EXE_sources += vode_aux.f90
 f90EXE_sources += f_rhs.f90
diff --git a/Source/HeatCool/cooling.f90 b/Source/HeatCool/cooling.f90
deleted file mode 100644
index 2ccd5bda..00000000
--- a/Source/HeatCool/cooling.f90
+++ /dev/null
@@ -1,115 +0,0 @@
-! Calculates cooling (H & He) + UV heating rates. 
-!
-!     Working units are CGS here, temperature is in K 
-!
-
-module heating_cooling_module
-
-  use amrex_fort_module, only : rt => amrex_real
-
-  implicit none
-
-  public :: hc_rates
-
-  contains
-
-      subroutine hc_rates(z, R_in, e_in, t, ne, energy, prnt_d)
-
-      use fundamental_constants_module, only: e_to_cgs, density_to_cgs, & 
-                                              heat_from_cgs
-      use eos_module, only: iterate_ne
-      use atomic_rates_module, ONLY: TCOOLMIN, TCOOLMAX, NCOOLTAB, deltaT, &
-                                     MPROTON, XHYDROGEN, &
-                                     AlphaHp, AlphaHep, AlphaHepp, Alphad, &
-                                     GammaeH0, GammaeHe0, GammaeHep, &
-                                     BetaH0, BetaHe0, BetaHep, Betaff1, Betaff4, &
-                                     RecHp, RecHep, RecHepp, &
-                                     eh0, ehe0, ehep
-
-      real(rt), intent(in   ) :: z, R_in, e_in
-      real(rt), intent(inout) :: t, ne
-      real(rt), intent(  out) :: energy
-      logical, intent(in)             :: prnt_d ! for diagnostics print
-
-      real(rt), parameter :: compt_c = 1.01765467d-37, T_cmb = 2.725d0
-
-      real(rt) :: logT, tmp, fhi, flo
-      real(rt) :: ahp, ahep, ahepp, ad, geh0, gehe0, gehep
-      real(rt) :: bh0, bhe0, bhep, bff1, bff4, rhp, rhep, rhepp
-      real(rt) :: lambda_c, lambda_ff, lambda, heat
-      real(rt) :: rho, U
-      real(rt) :: nh, nh0, nhp, nhe0, nhep, nhepp
-      integer :: j
-
-
-     ! Converts from code units to CGS
-      rho = R_in * density_to_cgs * (1.0d0+z)**3
-        U = e_in * e_to_cgs
-      nh  = rho*XHYDROGEN/MPROTON
-
-      ! Get gas temperature and individual ionization species
-      call iterate_ne(z, U, t, nh, ne, nh0, nhp, nhe0, nhep, nhepp)
-
-      ! Convert species to CGS units: 
-      ne    = nh * ne
-      nh0   = nh * nh0
-      nhp   = nh * nhp
-      nhe0  = nh * nhe0
-      nhep  = nh * nhep
-      nhepp = nh * nhepp
-
-      logT = dlog10(t)
-      if (logT .ge. TCOOLMAX) then ! Only free-free and Compton cooling are relevant
-         lambda_ff = 1.42d-27 * dsqrt(t) * (1.1d0 + 0.34d0*dexp(-(5.5d0 - logT)**2 / 3.0d0)) &
-                              * (nhp + 4.0d0*nhepp)*ne
-         lambda_c  = compt_c*T_cmb**4*ne*(t - T_cmb*(1.0d0+z))*(1.0d0 + z)**4
-
-         energy = (-lambda_ff -lambda_c) * heat_from_cgs/(1.0d0+z)**4
-         ne     = ne / nh
-         return
-      endif
-
-      ! Temperature floor
-      if (logT .le. TCOOLMIN) logT = TCOOLMIN + 0.5d0*deltaT
-
-      ! Interpolate rates
-      tmp = (logT-TCOOLMIN)/deltaT
-      j = int(tmp)
-      fhi = tmp - j
-      flo = 1.0d0 - fhi
-      j = j + 1 ! F90 arrays start with 1
-
-      ahp   = flo*AlphaHp  (j) + fhi*AlphaHp  (j+1)
-      ahep  = flo*AlphaHep (j) + fhi*AlphaHep (j+1)
-      ahepp = flo*AlphaHepp(j) + fhi*AlphaHepp(j+1)
-      ad    = flo*Alphad   (j) + fhi*Alphad   (j+1)
-      geh0  = flo*GammaeH0 (j) + fhi*GammaeH0 (j+1)
-      gehe0 = flo*GammaeHe0(j) + fhi*GammaeHe0(j+1)
-      gehep = flo*GammaeHep(j) + fhi*GammaeHep(j+1)
-      bh0   = flo*BetaH0   (j) + fhi*BetaH0   (j+1)
-      bhe0  = flo*BetaHe0  (j) + fhi*BetaHe0  (j+1)
-      bhep  = flo*BetaHep  (j) + fhi*BetaHep  (j+1)
-      bff1  = flo*Betaff1  (j) + fhi*Betaff1  (j+1)
-      bff4  = flo*Betaff4  (j) + fhi*Betaff4  (j+1)
-      rhp   = flo*RecHp    (j) + fhi*RecHp    (j+1)
-      rhep  = flo*RecHep   (j) + fhi*RecHep   (j+1)
-      rhepp = flo*RecHepp  (j) + fhi*RecHepp  (j+1)
-
-      ! Cooling: 
-      lambda = ( bh0*nh0 + bhe0*nhe0 + bhep*nhep + &
-                 rhp*nhp + rhep*nhep + rhepp*nhepp + &
-                 bff1*(nhp+nhep) + bff4*nhepp ) * ne
-
-      lambda_c = compt_c*T_cmb**4*ne*(t - T_cmb*(1.0d0+z))*(1.0d0 + z)**4   ! Compton cooling
-      lambda = lambda + lambda_c
-
-      ! Heating terms
-      heat = nh0*eh0 + nhe0*ehe0 + nhep*ehep
-
-      ! Convert back to code units
-      ne     = ne / nh
-      energy = (heat - lambda)*heat_from_cgs/(1.0d0+z)**4
-
-      end subroutine hc_rates
-
-end module heating_cooling_module
diff --git a/Source/HeatCool/ext_src_hc_3d.f90 b/Source/HeatCool/ext_src_hc_3d.f90
index dcfae8fb..3089d87d 100644
--- a/Source/HeatCool/ext_src_hc_3d.f90
+++ b/Source/HeatCool/ext_src_hc_3d.f90
@@ -36,9 +36,9 @@ subroutine ext_src_hc(lo, hi, old_state, os_l1, os_l2, os_l3, os_h1, os_h2, os_h
 !       @todo
 !
     use amrex_fort_module, only : rt => amrex_real
+    use amrex_error_module, only: amrex_abort
     use meth_params_module, only : NVAR, UEDEN, UEINT, heat_cool_type
     use fundamental_constants_module
-    use atomic_rates_module, only: interp_to_this_z
 
     implicit none
 
@@ -87,13 +87,9 @@ subroutine ext_src_hc(lo, hi, old_state, os_l1, os_l2, os_l3, os_h1, os_h2, os_h
     !      both "old_state" is in fact the "old" state and
     !           "new_state" is in fact the "new" state
 
-    call interp_to_this_z(z)
-
     half_dt = 0.5d0 * dt
     if (heat_cool_type .eq. 1) then
-        call integrate_state_hc(lo,hi,tmp_state,ns_l1,ns_l2,ns_l3,ns_h1,ns_h2,ns_h3, &
-                                      new_diag ,nd_l1,nd_l2,nd_l3,nd_h1,nd_h2,nd_h3, &
-                                a,half_dt,min_iter,max_iter)
+        call amrex_abort("ERROR: heat_cool_type = 1 is not in function anymore.")
     else if (heat_cool_type .eq. 3) then
         call integrate_state_vode(lo,hi,tmp_state,ns_l1,ns_l2,ns_l3,ns_h1,ns_h2,ns_h3, &
                                         new_diag ,nd_l1,nd_l2,nd_l3,nd_h1,nd_h2,nd_h3, &
@@ -102,6 +98,10 @@ subroutine ext_src_hc(lo, hi, old_state, os_l1, os_l2, os_l3, os_h1, os_h2, os_h
         call integrate_state_fcvode(lo,hi,tmp_state,ns_l1,ns_l2,ns_l3, ns_h1,ns_h2,ns_h3, &
                                                 new_diag ,nd_l1,nd_l2,nd_l3, nd_h1,nd_h2,nd_h3, &
                                   a,half_dt,min_iter,max_iter)
+    else if (heat_cool_type .eq. 7) then
+        call integrate_state_fcvode_vec(lo,hi,tmp_state,ns_l1,ns_l2,ns_l3, ns_h1,ns_h2,ns_h3, &
+                                                new_diag ,nd_l1,nd_l2,nd_l3, nd_h1,nd_h2,nd_h3, &
+                                  a,half_dt,min_iter,max_iter)
     endif
  
     ! Recall that this routine is called from a tiled MFIter 
diff --git a/Source/HeatCool/f_rhs.f90 b/Source/HeatCool/f_rhs.f90
index f025a4e4..d1d41c8d 100644
--- a/Source/HeatCool/f_rhs.f90
+++ b/Source/HeatCool/f_rhs.f90
@@ -7,13 +7,13 @@ subroutine f_rhs(num_eq, time, e_in, energy, rpar, ipar)
       use eos_module, only: iterate_ne
       use atomic_rates_module, ONLY: TCOOLMIN, TCOOLMAX, NCOOLTAB, deltaT, &
                                      MPROTON, XHYDROGEN, &
-                                     AlphaHp, AlphaHep, AlphaHepp, Alphad, &
-                                     GammaeH0, GammaeHe0, GammaeHep, &
+                                     uvb_density_A, uvb_density_B, mean_rhob, &
                                      BetaH0, BetaHe0, BetaHep, Betaff1, Betaff4, &
                                      RecHp, RecHep, RecHepp, &
                                      eh0, ehe0, ehep
 
-      use vode_aux_module       , only: z_vode, rho_vode, T_vode, ne_vode, i_vode, j_vode, k_vode
+      use vode_aux_module       , only: z_vode, rho_vode, T_vode, ne_vode, &
+                                        JH_vode, JHe_vode, i_vode, j_vode, k_vode
 
       integer, intent(in)             :: num_eq, ipar
       real(rt), intent(inout) :: e_in(num_eq)
@@ -27,7 +27,7 @@ subroutine f_rhs(num_eq, time, e_in, energy, rpar, ipar)
       real(rt) :: ahp, ahep, ahepp, ad, geh0, gehe0, gehep
       real(rt) :: bh0, bhe0, bhep, bff1, bff4, rhp, rhep, rhepp
       real(rt) :: lambda_c, lambda_ff, lambda, heat
-      real(rt) :: rho, U, a
+      real(rt) :: rho, U, a, rho_heat
       real(rt) :: nh, nh0, nhp, nhe0, nhep, nhepp
       integer :: j
 
@@ -46,7 +46,7 @@ subroutine f_rhs(num_eq, time, e_in, energy, rpar, ipar)
       end if
 
       ! Get gas temperature and individual ionization species
-      call iterate_ne(z_vode, U, T_vode, nh, ne_vode, nh0, nhp, nhe0, nhep, nhepp)
+      call iterate_ne(JH_vode, JHe_vode, z_vode, U, T_vode, nh, ne_vode, nh0, nhp, nhe0, nhep, nhepp)
 
       ! Convert species to CGS units: 
       ne_vode = nh * ne_vode
@@ -80,13 +80,6 @@ subroutine f_rhs(num_eq, time, e_in, energy, rpar, ipar)
       flo = 1.0d0 - fhi
       j = j + 1 ! F90 arrays start with 1
 
-      ahp   = flo*AlphaHp  (j) + fhi*AlphaHp  (j+1)
-      ahep  = flo*AlphaHep (j) + fhi*AlphaHep (j+1)
-      ahepp = flo*AlphaHepp(j) + fhi*AlphaHepp(j+1)
-      ad    = flo*Alphad   (j) + fhi*Alphad   (j+1)
-      geh0  = flo*GammaeH0 (j) + fhi*GammaeH0 (j+1)
-      gehe0 = flo*GammaeHe0(j) + fhi*GammaeHe0(j+1)
-      gehep = flo*GammaeHep(j) + fhi*GammaeHep(j+1)
       bh0   = flo*BetaH0   (j) + fhi*BetaH0   (j+1)
       bhe0  = flo*BetaHe0  (j) + fhi*BetaHe0  (j+1)
       bhep  = flo*BetaHep  (j) + fhi*BetaHep  (j+1)
@@ -105,7 +98,9 @@ subroutine f_rhs(num_eq, time, e_in, energy, rpar, ipar)
       lambda = lambda + lambda_c
 
       ! Heating terms
-      heat = nh0*eh0 + nhe0*ehe0 + nhep*ehep
+      heat = JH_vode*nh0*eh0 + JH_vode*nhe0*ehe0 + JHe_vode*nhep*ehep
+      rho_heat = uvb_density_A * (rho_vode/mean_rhob)**uvb_density_B
+      heat = rho_heat*heat
 
       ! Convert back to code units
       ne_vode     = ne_vode / nh
@@ -117,6 +112,131 @@ subroutine f_rhs(num_eq, time, e_in, energy, rpar, ipar)
 
 end subroutine f_rhs
 
+
+subroutine f_rhs_vec(time, e_in, energy)
+      ! Vectorized f_rhs: fills energy(m) with the net (heating - cooling) de/dt term of cell m.
+      use amrex_fort_module, only : rt => amrex_real
+      use fundamental_constants_module, only: e_to_cgs, density_to_cgs, & 
+                                              heat_from_cgs
+      use eos_module, only: iterate_ne_vec
+      use atomic_rates_module, ONLY: TCOOLMIN, TCOOLMAX, NCOOLTAB, deltaT, &
+                                     MPROTON, XHYDROGEN, &
+                                     BetaH0, BetaHe0, BetaHep, Betaff1, Betaff4, &
+                                     RecHp, RecHep, RecHepp, &
+                                     eh0, ehe0, ehep
+
+      use vode_aux_module       , only: T_vode_vec, ne_vode_vec, rho_vode_vec, z_vode
+      use misc_params, only: simd_width
+
+      implicit none
+      ! e_in is intent(inout) because negative entries are clamped to tiny() below.
+      real(rt),                        intent(in   ) :: time
+      real(rt), dimension(simd_width), intent(inout) :: e_in
+      real(rt), dimension(simd_width), intent(  out) :: energy
+      ! Constants entering the Compton cooling term lambda_c below.
+      real(rt), parameter :: compt_c = 1.01765467d-37, T_cmb = 2.725d0
+      ! NOTE(review): ahp, ahep, ahepp, ad, geh0, gehe0 and gehep are declared but never referenced.
+      real(rt), dimension(simd_width) :: logT, tmp, fhi, flo
+      real(rt), dimension(simd_width) :: ahp, ahep, ahepp, ad, geh0, gehe0, gehep
+      real(rt), dimension(simd_width) :: bh0, bhe0, bhep, bff1, bff4, rhp, rhep, rhepp
+      real(rt), dimension(simd_width) :: lambda_c, lambda_ff, lambda, heat
+      real(rt), dimension(simd_width) :: rho, U
+      real(rt) :: a
+      real(rt), dimension(simd_width) :: nh, nh0, nhp, nhe0, nhep, nhepp
+      integer, dimension(simd_width) :: j
+      integer :: m
+      logical, dimension(simd_width) :: hot
+      ! Replace negative energies by the smallest positive real before the unit conversion.
+      do m = 1, simd_width
+        if (e_in(m) .lt. 0.d0) then
+           e_in(m) = tiny(e_in(m))
+        endif
+      end do
+
+      ! Convert density and energy from code units to CGS; nh is then derived from rho
+      rho = rho_vode_vec(1:simd_width) * density_to_cgs * (1.0d0+z_vode)**3
+        U = e_in * e_to_cgs
+      nh  = rho*XHYDROGEN/MPROTON
+
+      if (time .gt. 1) then
+         print *,'TIME INTO F_RHS ',time
+         call bl_pd_abort("TOO BIG TIME IN F_RHS")
+      end if
+
+      ! Get gas temperature and individual ionization species
+      call iterate_ne_vec(z_vode, U, T_vode_vec, nh, ne_vode_vec, nh0, nhp, nhe0, nhep, nhepp, simd_width)
+
+      ! Convert species to CGS units (scale by nh; ne is divided by nh again before returning):
+      ne_vode_vec(1:simd_width) = nh * ne_vode_vec(1:simd_width)
+      nh0   = nh * nh0
+      nhp   = nh * nhp
+      nhe0  = nh * nhe0
+      nhep  = nh * nhep
+      nhepp = nh * nhepp
+      ! First pass: cells with log10(T) >= TCOOLMAX are completed here and flagged in hot(:).
+      logT = dlog10(T_vode_vec(1:simd_width))
+      do m = 1, simd_width
+         if (logT(m) .ge. TCOOLMAX) then ! Only free-free and Compton cooling are relevant
+            lambda_ff(m) = 1.42d-27 * dsqrt(T_vode_vec(m)) * (1.1d0 + 0.34d0*dexp(-(5.5d0 - logT(m))**2 / 3.0d0)) &
+                                 * (nhp(m) + 4.0d0*nhepp(m))*ne_vode_vec(m)
+            lambda_c(m)  = compt_c*T_cmb**4 * ne_vode_vec(m) * (T_vode_vec(m) - T_cmb*(1.0d0+z_vode))*(1.0d0 + z_vode)**4
+
+            energy(m)  = (-lambda_ff(m) -lambda_c(m)) * heat_from_cgs/(1.0d0+z_vode)**4
+
+            ! Convert to the actual term to be used in e_out = e_in + dt*energy
+            energy(m)  = energy(m) / rho_vode_vec(m) * (1.0d0+z_vode)
+            ne_vode_vec(m) = ne_vode_vec(m) / nh(m)
+            hot(m) = .true.
+         else
+            hot(m) = .false.
+         endif
+      end do
+      ! Second pass: the remaining cells use the tabulated rates, interpolated linearly in log10(T).
+      do m = 1, simd_width
+         if (.not. hot(m)) then
+            ! Temperature floor
+            if (logT(m) .le. TCOOLMIN) logT(m) = TCOOLMIN + 0.5d0*deltaT
+      
+            ! Interpolate rates
+            tmp(m) = (logT(m)-TCOOLMIN)/deltaT
+            j(m) = int(tmp(m))
+            fhi(m) = tmp(m) - j(m)
+            flo(m) = 1.0d0 - fhi(m)
+            j(m) = j(m) + 1 ! F90 arrays start with 1
+      
+            bh0(m)   = flo(m)*BetaH0   (j(m)) + fhi(m)*BetaH0   (j(m)+1)
+            bhe0(m)  = flo(m)*BetaHe0  (j(m)) + fhi(m)*BetaHe0  (j(m)+1)
+            bhep(m)  = flo(m)*BetaHep  (j(m)) + fhi(m)*BetaHep  (j(m)+1)
+            bff1(m)  = flo(m)*Betaff1  (j(m)) + fhi(m)*Betaff1  (j(m)+1)
+            bff4(m)  = flo(m)*Betaff4  (j(m)) + fhi(m)*Betaff4  (j(m)+1)
+            rhp(m)   = flo(m)*RecHp    (j(m)) + fhi(m)*RecHp    (j(m)+1)
+            rhep(m)  = flo(m)*RecHep   (j(m)) + fhi(m)*RecHep   (j(m)+1)
+            rhepp(m) = flo(m)*RecHepp  (j(m)) + fhi(m)*RecHepp  (j(m)+1)
+      
+            ! Cooling: tabulated terms times ne, plus Compton cooling added below
+            lambda(m) = ( bh0(m)*nh0(m) + bhe0(m)*nhe0(m) + bhep(m)*nhep(m) + &
+                       rhp(m)*nhp(m) + rhep(m)*nhep(m) + rhepp(m)*nhepp(m) + &
+                       bff1(m)*(nhp(m)+nhep(m)) + bff4(m)*nhepp(m) ) * ne_vode_vec(m)
+
+            lambda_c(m) = compt_c*T_cmb**4*ne_vode_vec(m)*(T_vode_vec(m) - T_cmb*(1.0d0+z_vode))*(1.0d0 + z_vode)**4   ! Compton cooling
+            lambda(m) = lambda(m) + lambda_c(m)
+      
+            ! Heating terms
+            heat(m) = nh0(m)*eh0 + nhe0(m)*ehe0 + nhep(m)*ehep
+      
+            ! Convert back to code units
+            ne_vode_vec(m)     = ne_vode_vec(m) / nh(m)
+            energy(m) = (heat(m) - lambda(m))*heat_from_cgs/(1.0d0+z_vode)**4
+      
+            ! Convert to the actual term to be used in e_out = e_in + dt*energy
+            a = 1.d0 / (1.d0 + z_vode)
+            energy(m) = energy(m) / rho_vode_vec(m) / a
+         end if
+      end do
+
+end subroutine f_rhs_vec
+
+
 subroutine jac(neq, t, y, ml, mu, pd, nrpd, rpar, ipar)
 
   use amrex_fort_module, only : rt => amrex_real
diff --git a/Source/HeatCool/fcvode_extras.f90 b/Source/HeatCool/fcvode_extras.f90
new file mode 100644
index 00000000..8f79183b
--- /dev/null
+++ b/Source/HeatCool/fcvode_extras.f90
@@ -0,0 +1,186 @@
+module fcvode_extras
+
+  implicit none
+
+  contains
+
+    subroutine fcvode_wrapper(dt, rho_in, T_in, ne_in, e_in, neq, cvmem, &
+                              sunvec_y, yvec, T_out, ne_out, e_out)
+        ! Integrates the energy of one cell from time 0 to dt with CVODE and returns e, T and ne.
+        use amrex_fort_module, only : rt => amrex_real
+        use vode_aux_module, only: rho_vode, T_vode, ne_vode, z_vode
+        use atomic_rates_module, only: this_z
+        use cvode_interface
+        use fnvector_serial
+        use eos_module, only: vode_rtol, vode_atol_scaled
+        use, intrinsic :: iso_c_binding
+
+        implicit none
+
+        real(rt), intent(in   ) :: dt
+        real(rt), intent(in   ) :: rho_in, T_in, ne_in, e_in
+        type(c_ptr), value :: cvmem
+        type(c_ptr), value :: sunvec_y
+        real(rt), intent(  out) ::         T_out,ne_out,e_out
+
+        real(c_double) :: atol, rtol
+        real(c_double) :: time, tout
+        integer(c_long), intent(in) :: neq
+        real(c_double), pointer, intent(in) :: yvec(:)
+
+        integer(c_int) :: ierr
+        ! NOTE(review): tout, t_soln and the imported this_z / z_vode are not referenced below.
+        real(c_double) :: t_soln
+        ! The cell state is handed over through vode_aux_module variables, not through arguments.
+        T_vode   = T_in
+        ne_vode  = ne_in
+        rho_vode = rho_in
+
+        ! Initialize the integration time
+        time = 0.d0
+
+        ! We will integrate "e" in time. 
+        yvec(1) = e_in
+
+        ! Set the tolerances (the absolute tolerance is scaled by the incoming energy).
+        atol = vode_atol_scaled * e_in
+        rtol = vode_rtol
+        ! NOTE(review): the ierr values returned by the three calls below are never checked.
+        ierr = FCVodeReInit(cvmem, time, sunvec_y)
+        ierr = FCVodeSStolerances(CVmem, rtol, atol)
+
+        ierr = FCVode(CVmem, dt, sunvec_y, time, CV_NORMAL)
+        ! T and ne are read back from the module variables after the solve.
+        e_out  = yvec(1)
+        T_out  = T_vode
+        ne_out = ne_vode
+
+    end subroutine fcvode_wrapper
+
+    subroutine fcvode_wrapper_vec(dt, rho_in, T_in, ne_in, e_in, neq, cvmem, &
+                              sunvec_y, yvec, T_out, ne_out, e_out)
+        ! Integrates the energies of simd_width cells from time 0 to dt in a single CVODE solve.
+        use amrex_fort_module, only : rt => amrex_real
+        use vode_aux_module, only: rho_vode_vec, T_vode_vec, ne_vode_vec
+        use cvode_interface
+        use fnvector_serial
+        use misc_params, only: simd_width
+        use eos_module, only: vode_rtol, vode_atol_scaled
+        use, intrinsic :: iso_c_binding
+
+        implicit none
+
+        real(rt), intent(in   ) :: dt
+        real(rt), dimension(simd_width), intent(in   ) :: rho_in, T_in, ne_in, e_in
+        type(c_ptr), value :: cvmem
+        type(c_ptr), value :: sunvec_y
+        real(rt), dimension(simd_width), intent(  out) ::         T_out,ne_out,e_out
+
+        real(c_double) :: rtol
+        real(c_double), pointer, dimension(:) :: atol
+        real(c_double) :: time, tout
+        integer(c_long), intent(in) :: neq
+        real(c_double), pointer, intent(in) :: yvec(:)
+        type(c_ptr) :: sunvec_atol
+
+        integer(c_int) :: ierr
+        ! NOTE(review): tout and t_soln are not referenced below.
+        real(c_double) :: t_soln
+        ! atol is allocated and wrapped in an N_Vector on every call, and released before returning.
+        allocate(atol(simd_width))
+
+        sunvec_atol = N_VMake_Serial(neq, atol)
+        ! The slice state is handed over through vode_aux_module variables, not through arguments.
+        T_vode_vec(1:simd_width)   = T_in(1:simd_width)
+        ne_vode_vec(1:simd_width)  = ne_in(1:simd_width)
+        rho_vode_vec(1:simd_width) = rho_in(1:simd_width)
+
+        ! Initialize the integration time
+        time = 0.d0
+
+        ! We will integrate "e" in time. 
+        yvec(1:simd_width) = e_in(1:simd_width)
+
+        ! Set the tolerances (one absolute tolerance per cell, scaled by its incoming energy).
+        atol(1:simd_width) = vode_atol_scaled * e_in(1:simd_width)
+        rtol = vode_rtol
+        ! NOTE(review): the ierr values returned by the three calls below are never checked.
+        ierr = FCVodeReInit(cvmem, time, sunvec_y)
+        ierr = FCVodeSVtolerances(CVmem, rtol, sunvec_atol)
+
+        ierr = FCVode(CVmem, dt, sunvec_y, time, CV_NORMAL)
+        ! T and ne are read back from the module variables after the solve.
+        e_out(1:simd_width)  = yvec(1:simd_width)
+        T_out(1:simd_width)  = T_vode_vec(1:simd_width)
+        ne_out(1:simd_width) = ne_vode_vec(1:simd_width)
+
+        call N_VDestroy_Serial(sunvec_atol)
+        deallocate(atol)
+
+    end subroutine fcvode_wrapper_vec
+
+    integer(c_int) function RhsFn(tn, sunvec_y, sunvec_f, user_data) &
+           result(ierr) bind(C,name='RhsFn')
+      ! C-bound right-hand side for the one-equation solve: passes y(1) to f_rhs, stores de/dt in f(1).
+      use, intrinsic :: iso_c_binding
+      use fnvector_serial
+      use cvode_interface
+      implicit none
+
+      real(c_double), value :: tn
+      type(c_ptr), value    :: sunvec_y
+      type(c_ptr), value    :: sunvec_f
+      type(c_ptr), value    :: user_data
+
+      ! pointers to data in SUNDIALS vectors
+      real(c_double), pointer :: yvec(:)
+      real(c_double), pointer :: fvec(:)
+
+      real(c_double) :: energy
+
+      integer(c_long), parameter :: neq = 1
+
+      ! get data arrays from SUNDIALS vectors
+      call N_VGetData_Serial(sunvec_y, neq, yvec)
+      call N_VGetData_Serial(sunvec_f, neq, fvec)
+      ! NOTE(review): rpar is passed as the default-real literal 0.0 and ipar as 0 - confirm f_rhs ignores both.
+      call f_rhs(1, tn, yvec(1), energy, 0.0, 0)
+
+      fvec(1) = energy
+      ! Always reports success; user_data is not used.
+      ierr = 0
+    end function RhsFn
+
+
+    integer(c_int) function RhsFn_vec(tn, sunvec_y, sunvec_f, user_data) &
+           result(ierr) bind(C,name='RhsFn_vec')
+      ! C-bound right-hand side for the simd_width-equation solve: forwards y to f_rhs_vec.
+      use, intrinsic :: iso_c_binding
+      use fnvector_serial
+      use cvode_interface
+      use misc_params, only: simd_width
+      implicit none
+
+      real(c_double), value :: tn
+      type(c_ptr), value    :: sunvec_y, sunvec_f, user_data
+
+      ! pointers to data in SUNDIALS vectors
+      real(c_double), dimension(:), pointer :: yvec, fvec
+
+      integer(c_long) :: neq
+      real(c_double) :: energy(simd_width)
+      ! The system size equals simd_width.
+      neq = int(simd_width, c_long)
+
+      ! get data arrays from SUNDIALS vectors
+      call N_VGetData_Serial(sunvec_y, neq, yvec)
+      call N_VGetData_Serial(sunvec_f, neq, fvec)
+
+      call f_rhs_vec(tn, yvec, energy)
+
+      fvec = energy
+      ! Always reports success; user_data is not used.
+      ierr = 0
+    end function RhsFn_vec
+
+end module fcvode_extras
diff --git a/Source/HeatCool/heat_cool_stubs.f90 b/Source/HeatCool/heat_cool_stubs.f90
index 4ac41459..5307be1f 100644
--- a/Source/HeatCool/heat_cool_stubs.f90
+++ b/Source/HeatCool/heat_cool_stubs.f90
@@ -81,3 +81,10 @@ end subroutine adjust_heat_cool
 
 end module adjust_heat_cool_module
 
+! Stub vode_aux_module for builds that are not doing heating/cooling: it declares only z_vode.
+module vode_aux_module
+  use amrex_fort_module, only : rt => amrex_real
+  implicit none
+  ! NOTE(review): presumably kept so that code importing z_vode still compiles - confirm against callers.
+  real(rt) :: z_vode
+end module vode_aux_module
diff --git a/Source/HeatCool/integrate_state_3d.f90 b/Source/HeatCool/integrate_state_3d.f90
index 3c0183a2..d463080c 100644
--- a/Source/HeatCool/integrate_state_3d.f90
+++ b/Source/HeatCool/integrate_state_3d.f90
@@ -33,7 +33,8 @@ subroutine integrate_state(lo, hi, &
 !
    
     use amrex_fort_module, only : rt => amrex_real
-    use meth_params_module, only : NVAR, heat_cool_type
+    use amrex_error_module, only: amrex_abort
+    use meth_params_module, only : NVAR, NDIAG, heat_cool_type
 
     implicit none
 
@@ -41,14 +42,12 @@ subroutine integrate_state(lo, hi, &
     integer         , intent(in   ) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3
     integer         , intent(in   ) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3
     real(rt), intent(inout) ::    state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR)
-    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2)
+    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG)
     real(rt), intent(in   ) :: dx(3), time, a, half_dt
     integer         , intent(inout) :: min_iter, max_iter
 
     if (heat_cool_type .eq. 1) then
-        call integrate_state_hc(lo, hi, state   , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, &
-                                        diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, &
-                                a, half_dt, min_iter, max_iter)
+        call amrex_abort("ERROR: heat_cool_type = 1 is not in function anymore.")
     else if (heat_cool_type .eq. 3) then
         call integrate_state_vode(lo, hi, state   , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, &
                                           diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, &
@@ -57,6 +56,10 @@ subroutine integrate_state(lo, hi, &
         call integrate_state_fcvode(lo, hi, state   , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, &
                                           diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, &
                                   a, half_dt, min_iter, max_iter)
+    else if (heat_cool_type .eq. 7) then
+        call integrate_state_fcvode_vec(lo, hi, state   , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, &
+                                          diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, &
+                                  a, half_dt, min_iter, max_iter)
 
     end if
 
diff --git a/Exec/LyA/integrate_state_fcvode_3d.f90 b/Source/HeatCool/integrate_state_fcvode_3d.f90
similarity index 70%
rename from Exec/LyA/integrate_state_fcvode_3d.f90
rename to Source/HeatCool/integrate_state_fcvode_3d.f90
index fb83df6d..33f1df51 100644
--- a/Exec/LyA/integrate_state_fcvode_3d.f90
+++ b/Source/HeatCool/integrate_state_fcvode_3d.f90
@@ -32,15 +32,19 @@ subroutine integrate_state_fcvode(lo, hi, &
     use amrex_fort_module, only : rt => amrex_real
     use amrex_error_module, only : amrex_abort
     use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, &
-                                   TEMP_COMP, NE_COMP, gamma_minus_1
+                                   NDIAG, TEMP_COMP, NE_COMP, ZHI_COMP, &
+                                   gamma_minus_1
     use bl_constants_module, only: M_PI
     use eos_params_module
     use network
     use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT
     use fundamental_constants_module
     use comoving_module, only: comoving_h, comoving_OmB
-    use atomic_rates_module, only: tabulate_rates, interp_to_this_z, YHELIUM
-    use vode_aux_module    , only: z_vode, i_vode, j_vode, k_vode
+    use comoving_nd_module, only: fort_integrate_comoving_a
+    use atomic_rates_module, only: YHELIUM
+    use vode_aux_module    , only: JH_vode, JHe_vode, z_vode, i_vode, j_vode, k_vode
+    use reion_aux_module   , only: zhi_flash, zheii_flash, flash_h, flash_he, &
+                                   T_zhi, T_zheii, inhomogeneous_on
     use cvode_interface
     use fnvector_serial
     use fcvode_extras
@@ -52,14 +56,16 @@ subroutine integrate_state_fcvode(lo, hi, &
     integer         , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3
     integer         , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3
     real(rt), intent(inout) ::    state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR)
-    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2)
+    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG)
     real(rt), intent(in)    :: a, half_dt
     integer         , intent(inout) :: max_iter, min_iter
 
     integer :: i, j, k
-    real(rt) :: z, rho
+    real(rt) :: z, z_end, a_end, rho, H_reion_z, He_reion_z
     real(rt) :: T_orig, ne_orig, e_orig
-    real(rt) :: T_out , ne_out , e_out, mu, mean_rhob
+    real(rt) :: T_out , ne_out , e_out, mu, mean_rhob, T_H, T_He
+    real(rt) :: species(5)
+
     integer(c_int) :: ierr       ! error flag from C functions
     real(c_double) :: tstart     ! initial time
     real(c_double) :: atol, rtol
@@ -71,12 +77,25 @@ subroutine integrate_state_fcvode(lo, hi, &
     allocate(yvec(neq))
 
     z = 1.d0/a - 1.d0
+    call fort_integrate_comoving_a(a, a_end, half_dt)
+    z_end = 1.0d0/a_end - 1.0d0
 
-    z_vode = z
     mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst)
 
-    ! Interpolate from the table to this redshift
-    call interp_to_this_z(z)
+    ! Flash reionization?
+    if ((flash_h .eqv. .true.) .and. (z .gt. zhi_flash)) then
+       JH_vode = 0
+    else
+       JH_vode = 1
+    endif
+    if ((flash_he .eqv. .true.) .and. (z .gt. zheii_flash)) then
+       JHe_vode = 0
+    else
+       JHe_vode = 1
+    endif
+
+    if (flash_h ) H_reion_z  = zhi_flash
+    if (flash_he) He_reion_z = zheii_flash
 
     ! Note that (lo,hi) define the region of the box containing the grow cells
     ! Do *not* assume this is just the valid region
@@ -124,6 +143,15 @@ subroutine integrate_state_fcvode(lo, hi, &
                 T_orig  = diag_eos(i,j,k,TEMP_COMP)
                 ne_orig = diag_eos(i,j,k,  NE_COMP)
 
+                if (inhomogeneous_on) then
+                   H_reion_z = diag_eos(i,j,k,ZHI_COMP)
+                   if (z .gt. H_reion_z) then
+                      JH_vode = 0
+                   else
+                      JH_vode = 1
+                   endif
+                endif
+
                 if (e_orig .lt. 0.d0) then
                     print *,'negative e entering strang integration ',z, i,j,k, rho/mean_rhob, e_orig
                     call bl_abort('bad e in strang')
@@ -146,12 +174,34 @@ subroutine integrate_state_fcvode(lo, hi, &
 !                    call bl_abort('bad e out of strang')
                 end if
 
+                ! Update T and ne (do not use stuff computed in f_rhs, per vode manual)
+                call nyx_eos_T_given_Re(JH_vode, JHe_vode, T_out, ne_out, rho, e_out, a, species)
+
+                !  Flash heating in reionization:
+                T_H = 0.0d0
+                if (inhomogeneous_on .or. flash_h) then
+                   if ((H_reion_z  .lt. z) .and. (H_reion_z  .ge. z_end)) T_H  = (1.0d0 - species(2))*T_zhi
+                endif
+
+                T_He = 0.0d0
+                if (flash_he) then
+                   if ((He_reion_z .lt. z) .and. (He_reion_z .ge. z_end)) T_He = (1.0d0 - species(5))*T_zheii
+                endif
+
+                if ((T_H .gt. 0.0d0) .or. (T_He .gt. 0.0d0)) then
+                   T_out = T_orig + T_H + T_He
+                   ne_out = 1.0d0 + YHELIUM
+                   if (T_He .gt. 0.0d0) ne_out = ne_out + YHELIUM
+                   mu = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out)
+                   e_out  = T_out / (gamma_minus_1 * mp_over_kB * mu)
+                   call nyx_eos_T_given_Re(JH_vode, JHe_vode, T_out, ne_out, rho, e_out, a, species)
+                endif
+
                 ! Update (rho e) and (rho E)
                 state(i,j,k,UEINT) = state(i,j,k,UEINT) + rho * (e_out-e_orig)
                 state(i,j,k,UEDEN) = state(i,j,k,UEDEN) + rho * (e_out-e_orig)
 
-                ! Update T and ne (do not use stuff computed in f_rhs, per vode manual)
-                call nyx_eos_T_given_Re(T_out, ne_out, rho, e_out, a)
+                ! Update T and ne
                 diag_eos(i,j,k,TEMP_COMP) = T_out
                 diag_eos(i,j,k,  NE_COMP) = ne_out
 
diff --git a/Source/HeatCool/integrate_state_fcvode_3d_stubs.f90 b/Source/HeatCool/integrate_state_fcvode_3d_stubs.f90
index c705c3f3..97b6ab76 100644
--- a/Source/HeatCool/integrate_state_fcvode_3d_stubs.f90
+++ b/Source/HeatCool/integrate_state_fcvode_3d_stubs.f90
@@ -6,14 +6,14 @@ subroutine integrate_state_fcvode(lo, hi, &
     use amrex_error_module, only : amrex_abort
     use amrex_fort_module, only : rt => amrex_real
     use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, &
-                                   TEMP_COMP, NE_COMP, gamma_minus_1
+                                   NDIAG, TEMP_COMP, NE_COMP, gamma_minus_1
     use bl_constants_module, only: M_PI
     use eos_params_module
     use network
     use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT
     use fundamental_constants_module
     use comoving_module, only: comoving_h, comoving_OmB
-    use atomic_rates_module, only: tabulate_rates, interp_to_this_z, YHELIUM
+    use atomic_rates_module, only: YHELIUM
     use vode_aux_module    , only: z_vode, i_vode, j_vode, k_vode, firstcall
 
     implicit none
@@ -22,7 +22,7 @@ subroutine integrate_state_fcvode(lo, hi, &
     integer         , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3
     integer         , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3
     real(rt), intent(inout) ::    state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR)
-    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2)
+    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG)
     real(rt), intent(in)    :: a, half_dt
     integer         , intent(inout) :: max_iter, min_iter
 
diff --git a/Source/HeatCool/integrate_state_fcvode_vec_3d.f90 b/Source/HeatCool/integrate_state_fcvode_vec_3d.f90
new file mode 100644
index 00000000..a263c452
--- /dev/null
+++ b/Source/HeatCool/integrate_state_fcvode_vec_3d.f90
@@ -0,0 +1,197 @@
+subroutine integrate_state_fcvode_vec(lo, hi, &
+                                  state   , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, &
+                                  diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, &
+                                  a, half_dt, min_iter, max_iter)
+!
+!   Applies heating/cooling to UEINT and UEDEN by integrating the specific
+!   internal energy with CVODE, simd_width cells of an x-row at a time.
+!
+!   Parameters
+!   ----------
+!   lo : integer array (3)
+!       The low corner of the current box.
+!   hi : integer array (3)
+!       The high corner of the current box.
+!   state : real array, bounds s_l1:s_h1, s_l2:s_h2, s_l3:s_h3, NVAR
+!       The state vars; UEINT and UEDEN are updated in place.
+!   diag_eos : real array, bounds d_l1:d_h1, d_l2:d_h2, d_l3:d_h3, NDIAG
+!       TEMP_COMP and NE_COMP are read and overwritten.
+!   a : real
+!       The current a; the redshift used here is z = 1/a - 1.
+!   half_dt : real
+!       time step size, in Mpc km^-1 s ~ 10^12 yr.
+!   min_iter, max_iter : integer
+!       Not referenced by this routine.
+!
+!   Notes
+!   -----
+!   hi(1)-lo(1)+1 must be a multiple of simd_width: the i-loop works on whole
+!   slices i:i+simd_width-1, so any other extent would run past hi(1).
+!   The routine aborts on every rank and thread when this does not hold.
+!
+    use amrex_fort_module, only : rt => amrex_real
+    use amrex_error_module, only : amrex_abort
+    use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, &
+                                   NDIAG, TEMP_COMP, NE_COMP, gamma_minus_1
+    use bl_constants_module, only: M_PI
+    use eos_params_module
+    use network
+    use eos_module, only: nyx_eos_T_given_Re, nyx_eos_T_given_Re_vec, nyx_eos_given_RT
+    use fundamental_constants_module
+    use comoving_module, only: comoving_h, comoving_OmB
+    use atomic_rates_module, only: YHELIUM
+    use vode_aux_module    , only: z_vode, i_vode, j_vode, k_vode
+    use cvode_interface
+    use fnvector_serial
+    use fcvode_extras
+    use misc_params, only: simd_width
+    use parallel, only : parallel_ioprocessor
+    use, intrinsic :: iso_c_binding
+
+    implicit none
+
+    integer         , intent(in) :: lo(3), hi(3)
+    integer         , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3
+    integer         , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3
+    real(rt), intent(inout) ::    state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR)
+    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG)
+    real(rt), intent(in)    :: a, half_dt
+    integer         , intent(inout) :: max_iter, min_iter
+
+    integer :: i, j, k, ii
+    real(rt) :: z
+    real(rt), dimension(simd_width) :: rho
+    real(rt), dimension(simd_width) :: T_orig, ne_orig, e_orig
+    real(rt), dimension(simd_width) :: T_out , ne_out , e_out, mu
+    real(rt) :: mean_rhob
+    integer(c_int) :: ierr       ! error flag from C functions
+    real(c_double) :: tstart     ! initial time
+    real(c_double) :: rtol
+    real(c_double), pointer, dimension(:) :: atol
+    type(c_ptr) :: sunvec_y      ! sundials vector
+    type(c_ptr) :: CVmem         ! CVODE memory
+    type(c_ptr) :: sunvec_atol
+    integer(c_long) :: neq
+    real(c_double), pointer :: yvec(:)
+    character(len=128) :: errmsg
+
+    ! The i-loop below handles whole slices of simd_width cells, so the tile
+    ! x-extent must be a multiple of simd_width.  This is checked on every
+    ! rank and thread: restricting the check to the I/O processor would let
+    ! the other ranks continue and index past hi(1).
+    if (mod(hi(1)-lo(1)+1, simd_width) /= 0) then
+      write(errmsg, *) "simd_width does not divide evenly to tile x-length! lo(1) = ", &
+                       lo(1), " hi(1) = ", hi(1), " simd_width = ", simd_width
+      call amrex_abort(errmsg)
+    end if
+
+    neq = int(simd_width, c_long)
+
+    allocate(yvec(neq))
+    allocate(atol(neq))
+
+    z = 1.d0/a - 1.d0
+
+    z_vode = z
+    mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst)
+
+    ! Note that (lo,hi) define the region of the box containing the grow cells
+    ! Do *not* assume this is just the valid region
+    ! apply heating-cooling to UEDEN and UEINT
+
+    sunvec_y = N_VMake_Serial(NEQ, yvec)
+    if (.not. c_associated(sunvec_y)) then
+        call amrex_abort('integrate_state_fcvode_vec: sunvec_y = NULL')
+    end if
+
+    sunvec_atol = N_VMake_Serial(NEQ, atol)
+    if (.not. c_associated(sunvec_atol)) then
+        call amrex_abort('integrate_state_fcvode_vec: sunvec_atol = NULL')
+    end if
+
+    CVmem = FCVodeCreate(CV_BDF, CV_NEWTON)
+    if (.not. c_associated(CVmem)) then
+        call amrex_abort('integrate_state_fcvode_vec: CVmem = NULL')
+    end if
+
+    tstart = 0.0
+    ! CVodeMalloc allocates variables and initialize the solver. We can
+    ! initialize the solver with junk because once we enter the (i,j,k) loop we will
+    ! immediately call fcvreinit which reuses the same memory allocated from
+    ! CVodeMalloc but sets up new initial conditions.
+    ierr = FCVodeInit(CVmem, c_funloc(RhsFn_vec), tstart, sunvec_y)
+    if (ierr /= 0) then
+       call amrex_abort('integrate_state_fcvode_vec: FCVodeInit() failed')
+    end if
+
+    ! Set dummy tolerances. These will be overwritten as soon as we enter the
+    ! loop and reinitialize the solver.
+    rtol = 1.0d-5
+    atol(:) = 1.0d-10
+    ierr = FCVodeSVtolerances(CVmem, rtol, sunvec_atol)
+    if (ierr /= 0) then
+      call amrex_abort('integrate_state_fcvode_vec: FCVodeSVtolerances() failed')
+    end if
+
+    ierr = FCVDiag(CVmem)
+    if (ierr /= 0) then
+       call amrex_abort('integrate_state_fcvode_vec: FCVDiag() failed')
+    end if
+
+    do k = lo(3),hi(3)
+        do j = lo(2),hi(2)
+          do i = lo(1),hi(1),simd_width
+
+                ! Original values
+                rho     = state(i:i+simd_width-1,j,k,URHO)
+                e_orig  = state(i:i+simd_width-1,j,k,UEINT) / rho
+                T_orig  = diag_eos(i:i+simd_width-1,j,k,TEMP_COMP)
+                ne_orig = diag_eos(i:i+simd_width-1,j,k,  NE_COMP)
+
+                do ii = 1, simd_width
+                  if (e_orig(ii) .lt. 0.d0) then
+                      print *,'negative e entering strang integration ',z, i+ii-1,j,k, rho(ii)/mean_rhob, e_orig(ii)
+                      call bl_abort('bad e in strang')
+                  end if
+                end do
+
+                i_vode = i
+                j_vode = j
+                k_vode = k
+
+                call fcvode_wrapper_vec(half_dt,rho,T_orig,ne_orig,e_orig,neq,CVmem,sunvec_y,yvec, &
+                                              T_out ,ne_out ,e_out)
+
+                do ii = 1, simd_width
+                  if (e_out(ii) .lt. 0.d0) then
+                      print *,'negative e exiting strang integration ',z, i+ii-1,j,k, rho(ii)/mean_rhob, e_out(ii)
+                      T_out(ii)  = 10.0
+                      ne_out(ii) = 0.0
+                      mu(ii)     = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out(ii))
+                      e_out(ii)  = T_out(ii) / (gamma_minus_1 * mp_over_kB * mu(ii))
+                      call flush(6)
+  !                    call bl_abort('bad e out of strang')
+                  end if
+                end do
+
+                ! Update (rho e) and (rho E)
+                state(i:i+simd_width-1,j,k,UEINT) = state(i:i+simd_width-1,j,k,UEINT) + rho(1:simd_width) * (e_out(1:simd_width)-e_orig(1:simd_width))
+                state(i:i+simd_width-1,j,k,UEDEN) = state(i:i+simd_width-1,j,k,UEDEN) + rho(1:simd_width) * (e_out(1:simd_width)-e_orig(1:simd_width))
+
+                ! Update T and ne (do not use stuff computed in f_rhs, per vode manual)
+                call nyx_eos_T_given_Re_vec(T_out(1:simd_width), ne_out(1:simd_width), rho(1:simd_width), e_out(1:simd_width), a, simd_width)
+                diag_eos(i:i+simd_width-1,j,k,TEMP_COMP) = T_out(1:simd_width)
+                diag_eos(i:i+simd_width-1,j,k,  NE_COMP) = ne_out(1:simd_width)
+
+            end do ! i
+        end do ! j
+    end do ! k
+
+    call N_VDestroy_Serial(sunvec_atol)
+    call N_VDestroy_Serial(sunvec_y)
+    call FCVodeFree(cvmem)
+
+    deallocate(yvec)
+    deallocate(atol)
+
+end subroutine integrate_state_fcvode_vec
diff --git a/Source/HeatCool/integrate_state_fcvode_vec_3d_stubs.f90 b/Source/HeatCool/integrate_state_fcvode_vec_3d_stubs.f90
new file mode 100644
index 00000000..6ce893bf
--- /dev/null
+++ b/Source/HeatCool/integrate_state_fcvode_vec_3d_stubs.f90
@@ -0,0 +1,36 @@
+subroutine integrate_state_fcvode_vec(lo, hi, &
+                                  state   , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, &
+                                  diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, &
+                                  a, half_dt, min_iter, max_iter)
+!
+!   Stub for builds without the CVODE-based heating/cooling integrator
+!   (USE_CVODE=TRUE not set at compile time; see the abort message below).
+!
+!   It carries the same name as the full implementation so that callers
+!   still link, but it integrates nothing: it aborts as soon as it is
+!   entered, and state and diag_eos are never read or modified.
+!
+!   state is dimensioned by the s_* bounds with NVAR components; diag_eos by
+!   the d_* bounds with NDIAG components.
+!
+!   Only the modules needed to declare the dummy arguments and to abort are
+!   used, so the stub does not pull in the EOS, rates or VODE helper modules.
+!
+    use amrex_error_module, only : amrex_abort
+    use amrex_fort_module, only : rt => amrex_real
+    use meth_params_module, only : NVAR, NDIAG
+
+    implicit none
+
+    integer         , intent(in) :: lo(3), hi(3)
+    integer         , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3
+    integer         , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3
+    real(rt), intent(inout) ::    state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR)
+    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG)
+    real(rt), intent(in)    :: a, half_dt
+    integer         , intent(inout) :: max_iter, min_iter
+
+    ! No CVODE support in this build: stop with an explanatory message.
+    call amrex_abort("Cannot call fcvode without compiling with USE_CVODE=TRUE")
+
+end subroutine integrate_state_fcvode_vec
diff --git a/Source/HeatCool/integrate_state_hc_3d.f90 b/Source/HeatCool/integrate_state_hc_3d.f90
deleted file mode 100644
index f2824a78..00000000
--- a/Source/HeatCool/integrate_state_hc_3d.f90
+++ /dev/null
@@ -1,233 +0,0 @@
-subroutine integrate_state_hc(lo, hi, &
-                              state   , s_l1, s_l2, s_l3, s_h1, s_h2, s_h3, &
-                              diag_eos, d_l1, d_l2, d_l3, d_h1, d_h2, d_h3, &
-                              a, half_dt, min_iter, max_iter)
-!
-!   Calculates the sources to be added later on.
-!
-!   Parameters
-!   ----------
-!   lo : double array (3)
-!       The low corner of the current box.
-!   hi : double array (3)
-!       The high corner of the current box.
-!   state_* : double arrays
-!       The state vars
-!   diag_eos_* : double arrays
-!       Temp and Ne
-!   src_* : doubles arrays
-!       The source terms to be added to state (iterative approx.)
-!   double array (3)
-!       The low corner of the entire domain
-!   a : double
-!       The current a
-!   half_dt : double
-!       time step size, in Mpc km^-1 s ~ 10^12 yr.
-!
-!   Returns
-!   -------
-!   state : double array (dims) @todo
-!       The state vars
-!
-    use amrex_fort_module, only : rt => amrex_real
-    use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, &
-                                   TEMP_COMP, NE_COMP, small_pres, gamma_minus_1
-    use eos_params_module
-    use network
-    use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT
-    use fundamental_constants_module
-    use atomic_rates_module, only: tabulate_rates, interp_to_this_z
-    use heating_cooling_module, only: hc_rates
-
-    implicit none
-
-    integer         , intent(in) :: lo(3), hi(3)
-    integer         , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3
-    integer         , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3
-    real(rt), intent(inout) ::    state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR)
-    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2)
-    real(rt), intent(in)    :: a, half_dt
-    integer         , intent(inout) :: max_iter, min_iter
-
-    integer, parameter :: NITERS = 20
-    real(rt), parameter :: xacc = 1.0d-3
-
-    integer :: i, j, k, n, iter, nsteps, cnt
-    real(rt) :: z, rho, T, ne
-    real(rt) :: T_orig, rho_e_orig, ne_orig, e_int_old, De_int
-    real(rt) :: T_first, ne_first, src_first
-    real(rt) :: src_old, src_new, delta_re, delta_t, rho_e, e_int, prev_soln
-    real(rt) :: b_fac
-    logical          :: do_diag, prnt_cell, done_iter
-    logical          :: went_negative, went_negative_at_first
-
-    z = 1.d0/a - 1.d0
-    do_diag   = .false.
-    prnt_cell = .false.
-
-    ! Interpolate from the table to this redshift
-    call interp_to_this_z(z)
-
-    b_fac = 0.0d0
-    max_iter = 0
-    min_iter = NITERS+1
-
-    ! Note that (lo,hi) define the region of the box containing the grow cells
-    ! Do *not* assume this is just the valid region
-    ! apply heating-cooling to UEDEN and UEINT
-
-    do k = lo(3),hi(3)
-        do j = lo(2),hi(2)
-            do i = lo(1),hi(1)
-                ! Original values
-                rho        = state(i,j,k,URHO)
-                rho_e_orig = state(i,j,k,UEINT)
-                T_orig     = diag_eos(i,j,k,TEMP_COMP)
-                ne_orig    = diag_eos(i,j,k,  NE_COMP)
-
-                if (rho_e_orig .lt. 0.d0) then
-                    print *,'(rho e) entering strang integration negative ',i,j,k, rho_e_orig
-                    call bl_abort('bad rho e in strang')
-                end if
-
-                e_int = rho_e_orig/rho
-                call hc_rates(z, rho, e_int, T, ne, src_new, prnt_cell)
-                T_first   = T
-                ne_first  = ne
-                src_first = src_new
-
-                went_negative_at_first = .false.
-
-                prev_soln = HUGE(prev_soln)
-                do iter = 1, NITERS  ! max allowed iterations
-
-                  nsteps  = 2**(iter-1)
-                  delta_t = half_dt / nsteps
-                  rho_e   = rho_e_orig
-                  e_int   = rho_e/rho
-
-                  delta_re = 0.0d0
-                  src_old  = 0.0d0
-
-                  do n = 1, nsteps
-
-                    done_iter = .false.
-                    e_int_old = e_int
-                    e_int     = rho_e/rho
-
-                    if (n.eq.1) then
-                        T      = T_first
-                       ne      = ne_first
-                       src_new = src_first
-                    else
-                       call hc_rates(z, rho, e_int, T, ne, src_new, prnt_cell)
-                    end if
-
-                    if ( (rho_e+delta_t*src_new/a) .gt. 0.0d0) then 
-
-                       went_negative          = .false.     
-
-                       src_new = delta_t * src_new / a
-                       rho_e   = rho_e + src_new
-
-                       if (src_old*src_new .lt. 0.0d0) then ! src=0 in between
-                          if (rho_e .le. 0.0d0) then 
-                             De_int = e_int/2.0d0
-                             e_int  = e_int/2.0d0
-                          else
-                             De_int = abs(e_int_old - e_int)/2.0d0
-                             e_int  = e_int + sign(De_int, src_new)
-                          endif
-                          cnt = 0
-                          do
-                             cnt = cnt + 1
-                             call hc_rates(z, rho, e_int, T, ne, src_new, prnt_cell)
-                             if (abs(delta_t*src_new/a)/rho .lt. xacc) EXIT
-                             if (cnt .gt. 40) then 
-                                print*, 'BISECTION problem in cell:',i,j,k,iter,n
-                                call flush(6)
-                                call bl_error("Problem in bisection in integrate_hc_3d.f90")
-                             endif
-                             De_int = De_int / 2.0d0
-                             e_int = e_int + sign(De_int, src_new)
-                          enddo
-
-                          rho_e     = e_int * rho
-                          delta_re  = rho_e - rho_e_orig
-                          done_iter = .true.
-                          b_fac     = b_fac+1 ! just for diagnostics
-                          EXIT
-                       endif
-
-                       delta_re = delta_re + src_new ! Cumulative update
-                       src_old  = src_new
-
-                    ! Here we just leave rho_e alone and proceed to the next iter
-                    else   ! (rho_e + src_new) <= 0
-                       went_negative = .true.     
-                       went_negative_at_first = .true.     
-                       ! print *,'WENT NEGATIVE n, nsteps, iter ',n, nsteps, iter
-                       ! print *,' at cell ',i,j,k
-                       ! print *,' rho_e_orig       ',rho_e_orig 
-                       ! print *,' src              ',src_new
-                       ! print *,' dt/a             ',delta_t/a
-                       ! print *,' src*dt/a         ',src_new*delta_t/a
-                       ! print *,'  '
-                       ! call flush(6)
-                       exit  ! Exit the n loop to go to higher nsteps
-                    end if
-
-                  enddo ! n loop
-
-                  if (.not. went_negative) then
-                     if (abs(1.0d0-rho_e/prev_soln) .lt. xacc .or. done_iter) EXIT
-                  end if
-
-                  if (iter .ge. NITERS-2) then 
-                     print*, 'INTEGRATE_HC ITERATIONS:', i,j,k, iter, rho_e_orig, rho_e, (1.0d0-rho_e/prev_soln)
-                     call flush(6)
-                  endif
-
-                  if (iter .eq. NITERS) then 
-                     print*, 'MAXITER too small!', i,j,k, rho, T_orig
-                     call bl_abort('too small MAXITER')
-                  endif
-
-                  if (.not. went_negative) &
-                      prev_soln = rho_e
-                enddo ! iter loop
-
-                ! Update cell quantities
-                state(i,j,k,UEINT) = state(i,j,k,UEINT) + delta_re
-                state(i,j,k,UEDEN) = state(i,j,k,UEDEN) + delta_re
-                diag_eos(i,j,k,TEMP_COMP) = T
-                diag_eos(i,j,k,  NE_COMP) = ne
-
-                if (state(i,j,k,UEINT) .lt. 0.d0) then
-                    print *,'(rho e) exiting strang integration negative ',i,j,k, rho, rho_e_orig/rho, state(i,j,k,UEINT)/rho
-                    call bl_abort('negative rho e exiting strang')
-                end if
-
-                if (state(i,j,k,UEINT) .lt. small_pres/gamma_minus_1) then
-                   print *,'!!!!! Pressure and (rho e) are too small coming out of integrate !!!!!'
-                   print *,'!!!!! (i,j,k) !!!!! ' ,i,j,k
-                   print *,'!!!!! pressure      ',state(i,j,k,UEINT) * gamma_minus_1
-                   call flush(6)
-                end if
-
-                if (max_iter .le. iter) max_iter = iter
-                if (min_iter .ge. iter) min_iter = iter
-
-                ! if (went_negative_at_first) print *,'MAX NSTEPS OF NEG AT ',i,j,k,nsteps
-
-            end do ! i
-        end do ! j
-    end do ! k
-
-    if (do_diag) then 
-      print*, 'HC_ITERATIONS: ', z, max_iter, min_iter, b_fac/((hi(3)-lo(3))*(hi(2)-lo(2))*(hi(1)-lo(1)))
-      call flush(6)
-    endif
-
-end subroutine integrate_state_hc
-
diff --git a/Source/HeatCool/integrate_state_vode_3d.f90 b/Source/HeatCool/integrate_state_vode_3d.f90
index 7c9a6b5f..82453734 100644
--- a/Source/HeatCool/integrate_state_vode_3d.f90
+++ b/Source/HeatCool/integrate_state_vode_3d.f90
@@ -31,13 +31,18 @@ subroutine integrate_state_vode(lo, hi, &
 !
     use amrex_fort_module, only : rt => amrex_real
     use meth_params_module, only : NVAR, URHO, UEDEN, UEINT, &
-                                   TEMP_COMP, NE_COMP, gamma_minus_1
+                                   NDIAG, TEMP_COMP, NE_COMP, ZHI_COMP, gamma_minus_1
+    use bl_constants_module, only: M_PI
     use eos_params_module
     use network
-    use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT
+    use eos_module, only: nyx_eos_T_given_Re, nyx_eos_given_RT, iterate_ne
     use fundamental_constants_module
-    use atomic_rates_module, only: tabulate_rates, interp_to_this_z
-    use vode_aux_module    , only: z_vode, i_vode, j_vode, k_vode, T_vode
+    use comoving_module, only: comoving_h, comoving_OmB
+    use comoving_nd_module, only: fort_integrate_comoving_a
+    use atomic_rates_module, only: YHELIUM
+    use vode_aux_module    , only: JH_vode, JHe_vode, z_vode, i_vode, j_vode, k_vode
+    use reion_aux_module   , only: zhi_flash, zheii_flash, flash_h, flash_he, &
+                                   T_zhi, T_zheii, inhomogeneous_on
 
     implicit none
 
@@ -45,21 +50,36 @@ subroutine integrate_state_vode(lo, hi, &
     integer         , intent(in) :: s_l1, s_l2, s_l3, s_h1, s_h2, s_h3
     integer         , intent(in) :: d_l1, d_l2, d_l3, d_h1, d_h2, d_h3
     real(rt), intent(inout) ::    state(s_l1:s_h1, s_l2:s_h2,s_l3:s_h3, NVAR)
-    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, 2)
+    real(rt), intent(inout) :: diag_eos(d_l1:d_h1, d_l2:d_h2,d_l3:d_h3, NDIAG)
     real(rt), intent(in)    :: a, half_dt
     integer         , intent(inout) :: max_iter, min_iter
 
     integer :: i, j, k
-    real(rt) :: z, rho
+    real(rt) :: z, z_end, a_end, rho, H_reion_z, He_reion_z
     real(rt) :: T_orig, ne_orig, e_orig
-    real(rt) :: T_out , ne_out , e_out
+    real(rt) :: T_out , ne_out , e_out, mu, mean_rhob, T_H, T_He
+    real(rt) :: species(5)
 
     z = 1.d0/a - 1.d0
+    call fort_integrate_comoving_a(a, a_end, half_dt)
+    z_end = 1.0d0/a_end - 1.0d0
 
-    z_vode = z
+    mean_rhob = comoving_OmB * 3.d0*(comoving_h*100.d0)**2 / (8.d0*M_PI*Gconst)
 
-    ! Interpolate from the table to this redshift
-    call interp_to_this_z(z)
+    ! Flash reionization?
+    if ((flash_h .eqv. .true.) .and. (z .gt. zhi_flash)) then
+       JH_vode = 0
+    else
+       JH_vode = 1
+    endif
+    if ((flash_he .eqv. .true.) .and. (z .gt. zheii_flash)) then
+       JHe_vode = 0
+    else
+       JHe_vode = 1
+    endif
+
+    if (flash_h ) H_reion_z  = zhi_flash
+    if (flash_he) He_reion_z = zheii_flash
 
     ! Note that (lo,hi) define the region of the box containing the grow cells
     ! Do *not* assume this is just the valid region
@@ -75,8 +95,19 @@ subroutine integrate_state_vode(lo, hi, &
                 T_orig  = diag_eos(i,j,k,TEMP_COMP)
                 ne_orig = diag_eos(i,j,k,  NE_COMP)
 
+                if (inhomogeneous_on) then
+                   H_reion_z = diag_eos(i,j,k,ZHI_COMP)
+                   if (z .gt. H_reion_z) then
+                      JH_vode = 0
+                   else
+                      JH_vode = 1
+                   endif
+                endif
+
                 if (e_orig .lt. 0.d0) then
-                    print *,'negative e entering strang integration ',i,j,k, e_orig
+                    print *,'negative e entering strang integration ', z, i,j,k, e_orig
+                    print *, 'state(i,j,k,UEINT) = ', state(i,j,k,UEINT)
+                    print *, 'rho / mean_rhob = ', rho / mean_rhob
                     call bl_abort('bad e in strang')
                 end if
 
@@ -88,16 +119,43 @@ subroutine integrate_state_vode(lo, hi, &
                                               T_out ,ne_out ,e_out)
 
                 if (e_out .lt. 0.d0) then
-                    print *,'negative e entering strang integration ',i,j,k, e_out
-                    call bl_abort('bad e out of strang')
+                    print *,'negative e exiting strang integration ', z, i,j,k, e_out
+                    T_out  = 10.0
+                    ne_out = 0.0
+                    mu     = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out)
+                    e_out  = T_out / (gamma_minus_1 * mp_over_kB * mu)
+                    call flush(6)
+                    !call bl_abort('bad e out of strang')
                 end if
 
+                ! Update T and ne (do not use stuff computed in f_rhs, per vode manual)
+                call nyx_eos_T_given_Re(JH_vode, JHe_vode, T_out, ne_out, rho, e_out, a, species)
+
+                !  Flash heating in reionization:
+                T_H = 0.0d0
+                if (inhomogeneous_on .or. flash_h) then
+                   if ((H_reion_z  .lt. z) .and. (H_reion_z  .ge. z_end)) T_H  = (1.0d0 - species(2))*T_zhi
+                endif
+
+                T_He = 0.0d0
+                if (flash_he) then
+                   if ((He_reion_z .lt. z) .and. (He_reion_z .ge. z_end)) T_He = (1.0d0 - species(5))*T_zheii
+                endif
+
+                if ((T_H .gt. 0.0d0) .or. (T_He .gt. 0.0d0)) then
+                   T_out = T_orig + T_H + T_He
+                   ne_out = 1.0d0 + YHELIUM
+                   if (T_He .gt. 0.0d0) ne_out = ne_out + YHELIUM
+                   mu = (1.0d0+4.0d0*YHELIUM) / (1.0d0+YHELIUM+ne_out)
+                   e_out  = T_out / (gamma_minus_1 * mp_over_kB * mu)
+                   call nyx_eos_T_given_Re(JH_vode, JHe_vode, T_out, ne_out, rho, e_out, a, species)
+                endif
+
                 ! Update (rho e) and (rho E)
                 state(i,j,k,UEINT) = state(i,j,k,UEINT) + rho * (e_out-e_orig)
                 state(i,j,k,UEDEN) = state(i,j,k,UEDEN) + rho * (e_out-e_orig)
 
-                ! Update T and ne (do not use stuff computed in f_rhs, per vode manual)
-                call nyx_eos_T_given_Re(T_out, ne_out, rho, e_out, a)
+                ! Update T and ne
                 diag_eos(i,j,k,TEMP_COMP) = T_out
                 diag_eos(i,j,k,  NE_COMP) = ne_out
 
@@ -107,8 +165,10 @@ subroutine integrate_state_vode(lo, hi, &
 
 end subroutine integrate_state_vode
 
+
 subroutine vode_wrapper(dt, rho_in, T_in, ne_in, e_in, T_out, ne_out, e_out)
 
+    use amrex_fort_module, only : rt => amrex_real
     use vode_aux_module, only: rho_vode, T_vode, ne_vode, &
                                i_vode, j_vode, k_vode
 
@@ -188,7 +248,7 @@ subroutine vode_wrapper(dt, rho_in, T_in, ne_in, e_in, T_out, ne_out, e_out)
     iwork(:) = 0
     
     ! Set the maximum number of steps allowed (the VODE default is 500)
-    iwork(6) = 1000
+    iwork(6) = 2000
     
     ! Initialize the integration time
     time = 0.d0
diff --git a/Source/HeatCool/vode_aux.f90 b/Source/HeatCool/vode_aux.f90
index 0ac68646..0696c38b 100644
--- a/Source/HeatCool/vode_aux.f90
+++ b/Source/HeatCool/vode_aux.f90
@@ -8,8 +8,9 @@ module vode_aux_module
 
   real(rt), save :: z_vode
   real(rt), save :: rho_vode, T_vode, ne_vode
-  integer , save :: i_vode, j_vode, k_vode
+  real(rt), dimension(:), allocatable, save :: rho_vode_vec, T_vode_vec, ne_vode_vec
+  integer , save :: JH_vode, JHe_vode, i_vode, j_vode, k_vode
   logical,  save :: firstcall
-  !$OMP THREADPRIVATE (rho_vode, T_vode, ne_vode, i_vode, j_vode, k_vode, firstcall)
+  !$OMP THREADPRIVATE (rho_vode, rho_vode_vec, T_vode, T_vode_vec, ne_vode, ne_vode_vec, JH_vode, JHe_vode, i_vode, j_vode, k_vode, firstcall)
 
 end module vode_aux_module
diff --git a/Source/Initialization/Make.package b/Source/Initialization/Make.package
index f9337fa1..194cbb2b 100644
--- a/Source/Initialization/Make.package
+++ b/Source/Initialization/Make.package
@@ -6,3 +6,6 @@ CEXE_sources   += Nyx_setup.cpp
 CEXE_sources   += Nyx_initdata.cpp
 CEXE_sources   += Nyx_initcosmo.cpp
 CEXE_sources   += read_plotfile.cpp
+ifeq ($(USE_CVODE), TRUE)
+  f90EXE_sources += cvode_simd.f90
+endif
diff --git a/Source/Initialization/Nyx_initcosmo.cpp b/Source/Initialization/Nyx_initcosmo.cpp
index 8dc1b802..244771f5 100644
--- a/Source/Initialization/Nyx_initcosmo.cpp
+++ b/Source/Initialization/Nyx_initcosmo.cpp
@@ -124,8 +124,8 @@ void Nyx::initcosmo()
     	MultiFab& S_new = get_level(level).get_new_data(State_Type);
     	MultiFab& D_new = get_level(level).get_new_data(DiagEOS_Type);
 
-        FillCoarsePatch(S_new, 0, 0,   State_Type, 0, NUM_STATE);
-        FillCoarsePatch(D_new, 0, 0, DiagEOS_Type, 0, 2);
+        FillCoarsePatch(S_new, 0, 0,   State_Type, 0, S_new.nComp());
+        FillCoarsePatch(D_new, 0, 0, DiagEOS_Type, 0, D_new.nComp());
 	return;
     }
 
@@ -378,8 +378,8 @@ void Nyx::initcosmo()
 	//seems to have no effect...
 	if (level > 0)
 	{
-           FillCoarsePatch(S_new, 0, 0,   State_Type, 0, NUM_STATE);
-	   FillCoarsePatch(D_new, 0, 0, DiagEOS_Type, 0, 2);
+           FillCoarsePatch(S_new, 0, 0,   State_Type, 0, S_new.nComp());
+	   FillCoarsePatch(D_new, 0, 0, DiagEOS_Type, 0, D_new.nComp());
 	}
 
      	//copy density 
@@ -421,6 +421,8 @@ void Nyx::initcosmo()
 
         D_new.setVal(tempInit, Temp_comp);
         D_new.setVal(0.0, Ne_comp);
+        if (inhomo_reion > 0)
+            D_new.setVal(0.0, Zhi_comp);
 
 #ifdef _OPENMP
 #pragma omp parallel
diff --git a/Source/Initialization/Nyx_initdata.cpp b/Source/Initialization/Nyx_initdata.cpp
index 3147be56..4761a3d9 100644
--- a/Source/Initialization/Nyx_initdata.cpp
+++ b/Source/Initialization/Nyx_initdata.cpp
@@ -83,10 +83,11 @@ Nyx::read_init_params ()
 
     // Input error check
     if (!binary_particle_file.empty() && (particle_init_type != "BinaryFile" &&
-                                          particle_init_type != "BinaryMetaFile"))
+                                          particle_init_type != "BinaryMetaFile" && 
+					  particle_init_type != "BinaryMortonFile"))
     {
         if (ParallelDescriptor::IOProcessor())
-            std::cerr << "ERROR::particle_init_type is not BinaryFile or BinaryMetaFile but you specified binary_particle_file" << std::endl;
+            std::cerr << "ERROR::particle_init_type is not BinaryFile, BinaryMetaFile, or BinaryMortonFile but you specified binary_particle_file" << std::endl;
         amrex::Error();
     }
 
@@ -109,6 +110,21 @@ Nyx::read_init_params ()
         amrex::Error();
     }
 #endif
+
+#ifdef HEATCOOL
+    Real eos_nr_eps = 1.0e-6;
+    Real vode_rtol = 1.0e-4;
+    Real vode_atol_scaled = 1.0e-4;
+
+    // Tolerance for Newton-Raphson iteration of iterate_ne() in the EOS
+    pp.query("eos_nr_eps", eos_nr_eps);
+    // Relative tolerance of VODE integration
+    pp.query("vode_rtol", vode_rtol);
+    // Absolute tolerance of VODE integration (scaled by initial value of ODE)
+    pp.query("vode_atol_scaled", vode_atol_scaled);
+
+    fort_setup_eos_params(&eos_nr_eps, &vode_rtol, &vode_atol_scaled);
+#endif
 }
 
 void
diff --git a/Source/Initialization/Nyx_setup.cpp b/Source/Initialization/Nyx_setup.cpp
index 555b0011..e689ac60 100644
--- a/Source/Initialization/Nyx_setup.cpp
+++ b/Source/Initialization/Nyx_setup.cpp
@@ -197,8 +197,16 @@ Nyx::hydro_setup()
         cnt += NumAdv;
     }
 
+    int NDIAG_C;
     Temp_comp = 0;
       Ne_comp = 1;
+    if (inhomo_reion > 0)
+    {
+        NDIAG_C  = 3;
+        Zhi_comp = 2;
+    } else {
+        NDIAG_C  = 2;
+    }
 
     int dm = BL_SPACEDIM;
 
@@ -228,12 +236,18 @@ Nyx::hydro_setup()
     // Define NUM_GROW from the f90 module.
     fort_get_method_params(&NUM_GROW);
 
+    // Note that we must set NDIAG_C before we call set_method_params because
+    // we use the C++ value to set the Fortran value
     fort_set_method_params
-        (dm, NumAdv, do_hydro, ppm_type, ppm_reference,
+        (dm, NumAdv, NDIAG_C, do_hydro, ppm_type, ppm_reference,
          ppm_flatten_before_integrals,
          use_colglaz, use_flattening, corner_coupling, version_2,
          use_const_species, gamma, normalize_species,
-         heat_cool_type, ParallelDescriptor::Communicator());
+         heat_cool_type, inhomo_reion);
+
+#ifdef HEATCOOL
+    fort_tabulate_rates();
+#endif
 
     if (use_const_species == 1)
         fort_set_eos_params(h_species, he_species);
@@ -258,7 +272,7 @@ Nyx::hydro_setup()
 
     // This has two components: Temperature and Ne
     desc_lst.addDescriptor(DiagEOS_Type, IndexType::TheCellType(),
-                           StateDescriptor::Point, 1, 2, interp,
+                           StateDescriptor::Point, 1, NDIAG_C, interp,
                            state_data_extrap, store_in_checkpoint);
 
 #ifdef GRAVITY
@@ -442,17 +456,6 @@ Nyx::hydro_setup()
     //derive_lst.addComponent("rhog",desc_lst,Gravity_Type,0,BL_SPACEDIM);
 #endif
 
-    //
-    // Entropy (S)
-    //
-    derive_lst.add("entropy", IndexType::TheCellType(), 1,
-                   BL_FORT_PROC_CALL(DERENTROPY, derentropy),
-                   the_same_box);
-    // We add exactly (Density,Xmom,Ymom,Zmom,Eden,Eint) from State and
-    //                (Temp   ,Ne) from Diag_EOS
-    derive_lst.addComponent("entropy", desc_lst, State_Type, Density, 6);
-    derive_lst.addComponent("entropy", desc_lst, DiagEOS_Type,     0, 2);
-
     //
     // Div(u)
     //
@@ -652,15 +655,19 @@ Nyx::no_hydro_setup()
     Density = 0;
     NUM_STATE = 1;
 
+    int NDIAG_C = -1;
+
     // Define NUM_GROW from the f90 module.
     fort_get_method_params(&NUM_GROW);
 
     fort_set_method_params
-        (dm, NumAdv, do_hydro, ppm_type, ppm_reference,
+        (dm, NumAdv, NDIAG_C, do_hydro, ppm_type, ppm_reference,
          ppm_flatten_before_integrals,
          use_colglaz, use_flattening, corner_coupling, version_2,
          use_const_species, gamma, normalize_species,
-         heat_cool_type, ParallelDescriptor::Communicator());
+         heat_cool_type, inhomo_reion);
+
+    fort_tabulate_rates();
 
     int coord_type = Geometry::Coord();
     fort_set_problem_params(dm, phys_bc.lo(), phys_bc.hi(), Outflow, Symmetry, coord_type);
@@ -766,3 +773,22 @@ Nyx::no_hydro_setup()
 }
 #endif
 
+#ifdef USE_CVODE  // thin wrappers around the SIMD helper routines defined in cvode_simd.f90
+void
+Nyx::set_simd_width(const int simd_width)
+{
+    set_simd(&simd_width);  // set_simd takes the width by address and stores it on the Fortran side
+}
+// Allocate the VODE SIMD work vectors by calling fort_alloc_simd_vec().
+void
+Nyx::alloc_simd_vec()
+{
+    fort_alloc_simd_vec();
+}
+// Release the VODE SIMD work vectors by calling fort_dealloc_simd_vec().
+void
+Nyx::dealloc_simd_vec()
+{
+    fort_dealloc_simd_vec();
+}
+#endif  // USE_CVODE
diff --git a/Source/Initialization/check_initial_species_3d.f90 b/Source/Initialization/check_initial_species_3d.f90
index 70469ff0..997865d3 100644
--- a/Source/Initialization/check_initial_species_3d.f90
+++ b/Source/Initialization/check_initial_species_3d.f90
@@ -7,6 +7,7 @@ subroutine fort_check_initial_species(lo,hi,&
       use  eos_params_module
 
       use amrex_fort_module, only : rt => amrex_real
+      use amrex_error_module, only : amrex_abort
       implicit none
 
       integer  :: lo(3), hi(3)
@@ -16,6 +17,7 @@ subroutine fort_check_initial_species(lo,hi,&
       ! Local variables
       integer  :: i,j,k,n
       real(rt) :: sum
+      character(len=256) :: errmsg_pt1, errmsg_pt2
 
       if (UFS .gt. 0) then
 
@@ -30,13 +32,11 @@ subroutine fort_check_initial_species(lo,hi,&
                    end do
 
                    if (abs(state(i,j,k,URHO)-sum).gt. 1.d-8 * state(i,j,k,URHO)) then
-                      !
-                      ! A critical region since we usually can't write from threads.
-                      !
-                      !$OMP CRITICAL
-                      print *,'Sum of (rho X)_i vs rho at (i,j,k): ',i,j,k,sum,state(i,j,k,URHO)
-                      call bl_error("Error:: Failed check of initial species summing to 1")
-                      !$OMP END CRITICAL
+                      write(errmsg_pt1, *) 'Sum of (rho X)_i vs rho at (i,j,k): ', &
+                        i,j,k,sum,state(i,j,k,URHO)
+                      write(errmsg_pt2, *) trim(errmsg_pt1) // new_line('a') // &
+                        'Failed check of initial species summing to 1'
+                      call amrex_abort(errmsg_pt2)
                    end if
     
                 enddo
diff --git a/Source/Initialization/cvode_simd.f90 b/Source/Initialization/cvode_simd.f90
new file mode 100644
index 00000000..fc5578b4
--- /dev/null
+++ b/Source/Initialization/cvode_simd.f90
@@ -0,0 +1,44 @@
+subroutine set_simd (simd_width_in) bind(C, name='set_simd')
+   ! Copy the SIMD width passed in from C (by reference) into the module
+   ! variable misc_params::simd_width, where other Fortran routines read it.
+   use, intrinsic :: iso_c_binding, only: c_int
+   use misc_params, only: simd_width
+   implicit none
+
+   integer(c_int), intent(in) :: simd_width_in  ! C int, as required for a bind(C) dummy argument
+   simd_width = simd_width_in
+end subroutine set_simd
+
+subroutine fort_alloc_simd_vec() bind(C, name='fort_alloc_simd_vec')
+  use misc_params, only: simd_width
+  use vode_aux_module, only: T_vode_vec, ne_vode_vec, rho_vode_vec
+  use amrex_error_module, only: amrex_abort
+  implicit none
+  ! Allocate the VODE work vectors (simd_width entries each); abort if any is already allocated.
+  !$omp parallel
+  if (allocated(T_vode_vec) .or. allocated(ne_vode_vec) .or. allocated(rho_vode_vec)) then
+    !$omp single
+    call amrex_abort("Why are VODE SIMD vectors already allocated??")
+    !$omp end single
+  end if
+  ! The vectors are THREADPRIVATE in vode_aux_module, so each thread in the region allocates its own copy.
+  allocate(T_vode_vec(simd_width), ne_vode_vec(simd_width), rho_vode_vec(simd_width))
+  !$omp end parallel
+end subroutine fort_alloc_simd_vec
+
+
+subroutine fort_dealloc_simd_vec() bind(C, name='fort_dealloc_simd_vec')
+  use vode_aux_module, only: T_vode_vec, ne_vode_vec, rho_vode_vec
+  use amrex_error_module, only: amrex_abort
+  implicit none
+  ! Release the VODE work vectors; abort unless all three are currently allocated.
+  !$omp parallel
+  if (.not. (allocated(T_vode_vec) .and. allocated(ne_vode_vec) .and. allocated(rho_vode_vec))) then
+    !$omp single
+    call amrex_abort("Why are VODE SIMD vectors already deallocated??")
+    !$omp end single
+  end if
+  ! The vectors are THREADPRIVATE in vode_aux_module, so each thread in the region frees its own copy.
+  deallocate(T_vode_vec, ne_vode_vec, rho_vode_vec)
+  !$omp end parallel
+end subroutine fort_dealloc_simd_vec
diff --git a/Source/Make.package b/Source/Make.package
index 2d818e42..22c32a48 100644
--- a/Source/Make.package
+++ b/Source/Make.package
@@ -35,6 +35,7 @@ f90EXE_headers += dm_F.H
 f90EXE_sources += Nyx_nd.f90
 f90EXE_sources +=  eos_params.f90
 f90EXE_sources += meth_params.f90
+f90EXE_sources += misc_params.f90
 f90EXE_sources += prob_params.f90
 f90EXE_sources += comoving_params.f90
 f90EXE_sources += comoving_nd.f90
diff --git a/Source/Nyx.H b/Source/Nyx.H
index d7f0c8a2..976c6c56 100644
--- a/Source/Nyx.H
+++ b/Source/Nyx.H
@@ -476,7 +476,7 @@ public:
 
     void compute_new_temp();
 
-    void compute_rho_temp(amrex::Real& rho_T_avg, amrex::Real& T_avg, amrex::Real& T_meanrho);
+    void compute_rho_temp(amrex::Real& rho_T_avg, amrex::Real& T_avg, amrex::Real& Tinv_avg, amrex::Real& T_meanrho);
 
     void get_old_source(amrex::Real old_time, amrex::Real dt, amrex::MultiFab& Rhs);
     void get_new_source(amrex::Real old_time, amrex::Real new_time, amrex::Real dt, amrex::MultiFab& Rhs);
@@ -520,7 +520,8 @@ public:
 
     static int NUM_STATE;
     static int Density, Xmom, Ymom, Zmom, Eden, Eint;
-    static int Temp_comp, Ne_comp;
+
+    static int Temp_comp, Ne_comp, Zhi_comp;
 
     static int FirstSpec, FirstAux, FirstAdv;
     static int NumSpec, NumAux, NumAdv;
@@ -552,6 +553,10 @@ public:
     static void InitErrorList();
     static void InitDeriveList();
 
+    static void set_simd_width(const int simd_width);
+    static void alloc_simd_vec();
+    static void dealloc_simd_vec();
+
 protected:
 
     //
@@ -648,6 +653,7 @@ protected:
     static long particle_initrandom_count_per_box;
     static amrex::Real particle_initrandom_mass;
     static int particle_initrandom_iseed;
+    static int particle_skip_factor;
 
     static amrex::IntVect Nrep;  // how many times the initial conditions are replicated in each direction
 
@@ -668,6 +674,9 @@ protected:
     // specifies the heating/cooling source term
     static int heat_cool_type;
 
+    // specifies inhomogeneous reionization type
+    static int inhomo_reion;
+
     // permits forcing to be switched on and off
     static int do_forcing;
 
@@ -711,7 +720,6 @@ protected:
 
   static amrex::Real getCPUTime();
 
-
 };
 
 // time step interval for finding halos
diff --git a/Source/Nyx.cpp b/Source/Nyx.cpp
index ccf5057c..a94cfc35 100644
--- a/Source/Nyx.cpp
+++ b/Source/Nyx.cpp
@@ -21,6 +21,7 @@ using std::string;
 #include <AMReX_TagBox.H>
 #include <AMReX_Particles_F.H>
 #include <AMReX_Utility.H>
+#include <AMReX_Print.H>
 
 #if BL_USE_MPI
 #include "MemInfo.H"
@@ -63,6 +64,8 @@ static Real fixed_dt    = -1.0;
 static Real initial_dt  = -1.0;
 static Real dt_cutoff   =  0;
 
+int simd_width = 1;
+
 int Nyx::strict_subcycling = 0;
 
 Real Nyx::old_a      = -1.0;
@@ -83,7 +86,7 @@ Real Nyx::change_max  = 1.1;
 BCRec Nyx::phys_bc;
 int Nyx::do_reflux = 1;
 int Nyx::NUM_STATE = -1;
-int Nyx::NUM_GROW = -1;
+int Nyx::NUM_GROW  = -1;
 
 int Nyx::nsteps_from_plotfile = -1;
 
@@ -98,6 +101,7 @@ int Nyx::Zmom = -1;
 
 int Nyx::Temp_comp = -1;
 int Nyx::  Ne_comp = -1;
+int Nyx:: Zhi_comp = -1;
 
 int Nyx::NumSpec  = 0;
 int Nyx::NumAux   = 0;
@@ -118,6 +122,7 @@ Real Nyx::comoving_h;
 int Nyx::do_hydro = -1;
 int Nyx::add_ext_src = 0;
 int Nyx::heat_cool_type = 0;
+int Nyx::inhomo_reion = 0;
 int Nyx::strang_split = 0;
 
 Real Nyx::average_gas_density = 0;
@@ -253,6 +258,16 @@ Nyx::read_params ()
 
     pp.query("strict_subcycling",strict_subcycling);
 
+#ifdef USE_CVODE
+    pp.query("simd_width", simd_width);
+    if (simd_width < 1) amrex::Abort("simd_width must be a positive integer");
+    set_simd_width(simd_width);
+
+    if (verbose > 1) amrex::Print()
+        << "SIMD width (# zones) for heating/cooling integration: "
+        << simd_width << std::endl;
+#endif
+
     // Get boundary conditions
     Array<int> lo_bc(BL_SPACEDIM), hi_bc(BL_SPACEDIM);
     pp.getarr("lo_bc", lo_bc, 0, BL_SPACEDIM);
@@ -354,14 +369,32 @@ Nyx::read_params ()
 #endif
 
     pp.query("heat_cool_type", heat_cool_type);
+    if (heat_cool_type == 7)
+    {
+      amrex::Print() << "----- WARNING WARNING WARNING WARNING WARNING -----" << std::endl;
+      amrex::Print() << "                                                   " << std::endl;
+      amrex::Print() << "      SIMD CVODE is currently EXPERIMENTAL.        " << std::endl;
+      amrex::Print() << "      Use at your own risk.                        " << std::endl;
+      amrex::Print() << "                                                   " << std::endl;
+      amrex::Print() << "----- WARNING WARNING WARNING WARNING WARNING -----" << std::endl;
+      // Read amr.n_cell through a separately named ParmParse so the enclosing "pp" is not shadowed.
+      Array<int> n_cell(BL_SPACEDIM);
+      ParmParse pp_amr("amr");
+      pp_amr.getarr("n_cell", n_cell, 0, BL_SPACEDIM);
+      if (n_cell[0] % simd_width) {
+        amrex::Abort("Currently the SIMD CVODE solver requires that n_cell[0] % simd_width = 0");
+      }
+    }
 
     pp.query("use_exact_gravity", use_exact_gravity);
 
+    pp.query("inhomo_reion", inhomo_reion);
+
 #ifdef HEATCOOL
     if (heat_cool_type > 0 && add_ext_src == 0)
        amrex::Error("Nyx::must set add_ext_src to 1 if heat_cool_type > 0");
-    if (heat_cool_type != 1 && heat_cool_type != 3 && heat_cool_type != 5)
-       amrex::Error("Nyx:: nonzero heat_cool_type must equal 1 or 3 or 5");
+    if (heat_cool_type != 1 && heat_cool_type != 3 && heat_cool_type != 5 && heat_cool_type != 7)
+       amrex::Error("Nyx:: nonzero heat_cool_type must equal 1 or 3 or 5 or 7");
     if (heat_cool_type == 0)
        amrex::Error("Nyx::contradiction -- HEATCOOL is defined but heat_cool_type == 0");
 
@@ -377,18 +410,23 @@ Nyx::read_params ()
         case 5:
           std::cout << "CVODE";
           break;
+        case 7:
+          std::cout << "SIMD CVODE";
+          break;
       }
       std::cout << std::endl;
     }
 
 #ifndef USE_CVODE
-    if (heat_cool_type == 5)
-        amrex::Error("Nyx:: cannot set heat_cool_type = 5 unless USE_CVODE=TRUE");
+    if (heat_cool_type == 5 || heat_cool_type == 7)
+        amrex::Error("Nyx:: cannot set heat_cool_type = 5 or 7 unless USE_CVODE=TRUE");
 #endif
 
 #else
     if (heat_cool_type > 0)
        amrex::Error("Nyx::you set heat_cool_type > 0 but forgot to set USE_HEATCOOL = TRUE");
+    if (inhomo_reion > 0)
+       amrex::Error("Nyx::you set inhomo_reion > 0 but forgot to set USE_HEATCOOL = TRUE");
 #endif
 
     pp.query("allow_untagging", allow_untagging);
@@ -563,9 +601,11 @@ Nyx::Nyx (Amr&            papa,
        new_a = old_a;
     }
 
+#ifdef HEATCOOL
      // Initialize "this_z" in the atomic_rates_module
-     if (heat_cool_type == 1 || heat_cool_type == 3 || heat_cool_type == 5)
-         fort_init_this_z(&old_a);
+    if (heat_cool_type == 1 || heat_cool_type == 3 || heat_cool_type == 5 || heat_cool_type == 7)
+         fort_interp_to_this_z(&initial_z);
+#endif
 
 #ifdef AGN
      // Initialize the uniform(0,1) random number generator.
@@ -685,7 +725,7 @@ Nyx::init (AmrLevel& old)
 
         for (FillPatchIterator
                  fpi(old, S_new, 0, cur_time,   State_Type, 0, NUM_STATE),
-                dfpi(old, D_new, 0, cur_time, DiagEOS_Type, 0, 2);
+                dfpi(old, D_new, 0, cur_time, DiagEOS_Type, 0, D_new.nComp());
                 fpi.isValid() && dfpi.isValid();
                 ++fpi,++dfpi)
         {
@@ -2070,6 +2110,13 @@ Nyx::compute_new_temp ()
 
     Real a = get_comoving_a(cur_time);
 
+#ifdef HEATCOOL
+    if (heat_cool_type == 1 || heat_cool_type == 3 || heat_cool_type == 5 || heat_cool_type == 7) {
+       const Real z = 1.0/a - 1.0;
+       fort_interp_to_this_z(&z);
+    }
+#endif
+
 #ifdef _OPENMP
 #pragma omp parallel
 #endif
@@ -2077,11 +2124,19 @@ Nyx::compute_new_temp ()
     {
         const Box& bx = mfi.tilebox();
 
-        fort_compute_temp
-            (bx.loVect(), bx.hiVect(),
-            BL_TO_FORTRAN(S_new[mfi]),
-            BL_TO_FORTRAN(D_new[mfi]), &a,
-             &print_fortran_warnings);
+        if (heat_cool_type == 7) {
+          fort_compute_temp_vec
+              (bx.loVect(), bx.hiVect(),
+              BL_TO_FORTRAN(S_new[mfi]),
+              BL_TO_FORTRAN(D_new[mfi]), &a,
+               &print_fortran_warnings);
+        } else {
+            fort_compute_temp
+              (bx.loVect(), bx.hiVect(),
+              BL_TO_FORTRAN(S_new[mfi]),
+              BL_TO_FORTRAN(D_new[mfi]), &a,
+               &print_fortran_warnings);
+        }
     }
 
     // Compute the maximum temperature
@@ -2123,17 +2178,17 @@ Nyx::compute_new_temp ()
 
 #ifndef NO_HYDRO
 void
-Nyx::compute_rho_temp (Real& rho_T_avg, Real& T_avg, Real& T_meanrho)
+Nyx::compute_rho_temp (Real& rho_T_avg, Real& T_avg, Real& Tinv_avg, Real& T_meanrho)
 {
     BL_PROFILE("Nyx::compute_rho_temp()");
     MultiFab& S_new = get_new_data(State_Type);
     MultiFab& D_new = get_new_data(DiagEOS_Type);
 
-    Real rho_T_sum=0.0,   T_sum=0.0, T_meanrho_sum=0.0;
+    Real rho_T_sum=0.0,   T_sum=0.0, Tinv_sum=0.0, T_meanrho_sum=0.0;
     Real   rho_sum=0.0, vol_sum=0.0,    vol_mn_sum=0.0;
 
 #ifdef _OPENMP
-#pragma omp parallel reduction(+:rho_T_sum, rho_sum, T_sum, T_meanrho_sum, vol_sum, vol_mn_sum)
+#pragma omp parallel reduction(+:rho_T_sum, rho_sum, T_sum, Tinv_sum, T_meanrho_sum, vol_sum, vol_mn_sum)
 #endif
     for (MFIter mfi(S_new,true); mfi.isValid(); ++mfi)
     {
@@ -2143,15 +2198,16 @@ Nyx::compute_rho_temp (Real& rho_T_avg, Real& T_avg, Real& T_meanrho)
             (bx.loVect(), bx.hiVect(), geom.CellSize(),
              BL_TO_FORTRAN(S_new[mfi]),
              BL_TO_FORTRAN(D_new[mfi]), &average_gas_density,
-             &rho_T_sum, &T_sum, &T_meanrho_sum, &rho_sum, &vol_sum, &vol_mn_sum);
+             &rho_T_sum, &T_sum, &Tinv_sum, &T_meanrho_sum, &rho_sum, &vol_sum, &vol_mn_sum);
     }
-    Real sums[6] = {rho_T_sum, rho_sum, T_sum, T_meanrho_sum, vol_sum, vol_mn_sum};
-    ParallelDescriptor::ReduceRealSum(sums,6);
+    Real sums[7] = {rho_T_sum, rho_sum, T_sum, Tinv_sum, T_meanrho_sum, vol_sum, vol_mn_sum};
+    ParallelDescriptor::ReduceRealSum(sums,7);
 
     rho_T_avg = sums[0] / sums[1];  // density weighted T
-        T_avg = sums[2] / sums[4];  // volume weighted T
-    if (sums[5] > 0) {
-       T_meanrho = sums[3] / sums[5];  // T at mean density
+        T_avg = sums[2] / sums[5];  // volume weighted T
+     Tinv_avg = sums[3] / sums[1];  // 21cm T
+    if (sums[6] > 0) {
+       T_meanrho = sums[4] / sums[6];  // T at mean density
        T_meanrho = pow(10.0, T_meanrho);
     }
 }
@@ -2227,6 +2283,7 @@ Nyx::AddProcsToComp(Amr *aptr, int nSidecarProcs, int prevSidecarProcs,
         allInts.push_back(Eint);
         allInts.push_back(Temp_comp);
         allInts.push_back(Ne_comp);
+        allInts.push_back(Zhi_comp);
         allInts.push_back(FirstSpec);
         allInts.push_back(FirstAux);
         allInts.push_back(FirstAdv);
@@ -2256,6 +2313,7 @@ Nyx::AddProcsToComp(Amr *aptr, int nSidecarProcs, int prevSidecarProcs,
         allInts.push_back(do_grav);
         allInts.push_back(add_ext_src);
         allInts.push_back(heat_cool_type);
+        allInts.push_back(inhomo_reion);
         allInts.push_back(strang_split);
         allInts.push_back(reeber_int);
         allInts.push_back(gimlet_int);
@@ -2282,6 +2340,7 @@ Nyx::AddProcsToComp(Amr *aptr, int nSidecarProcs, int prevSidecarProcs,
         Eint = allInts[count++];
         Temp_comp = allInts[count++];
         Ne_comp = allInts[count++];
+        Zhi_comp = allInts[count++];
         FirstSpec = allInts[count++];
         FirstAux = allInts[count++];
         FirstAdv = allInts[count++];
@@ -2311,6 +2370,7 @@ Nyx::AddProcsToComp(Amr *aptr, int nSidecarProcs, int prevSidecarProcs,
         do_grav = allInts[count++];
         add_ext_src = allInts[count++];
         heat_cool_type = allInts[count++];
+        inhomo_reion = allInts[count++];
         strang_split = allInts[count++];
         reeber_int = allInts[count++];
         gimlet_int = allInts[count++];
diff --git a/Source/NyxParticleContainer.H b/Source/NyxParticleContainer.H
index 640581c0..68369c30 100644
--- a/Source/NyxParticleContainer.H
+++ b/Source/NyxParticleContainer.H
@@ -9,6 +9,10 @@
 class NyxParticleContainerBase
 {
 public:
+
+    using MyParIter = amrex::ParIter<1+BL_SPACEDIM>;
+    using MyConstParIter = amrex::ParConstIter<1+BL_SPACEDIM>;
+
     virtual ~NyxParticleContainerBase() {}
 
     virtual void moveKickDrift (amrex::MultiFab& acceleration, int level, amrex::Real timestep, 
@@ -71,9 +75,9 @@ public:
     amrex::Real estTimestep (amrex::MultiFab& acceleration, amrex::Real a, int level, amrex::Real cfl) const;
 
     virtual int finestLevel() const override
-        {
-            return amrex::AmrParticleContainer<NSR,NSI,NAR,NAI>::finestLevel();
-        }
+    {
+        return amrex::AmrParticleContainer<NSR,NSI,NAR,NAI>::finestLevel();
+    }
 
     virtual void Redistribute (int lev_min              = 0,
                                int lev_max              =-1,
@@ -188,8 +192,8 @@ NyxParticleContainer<NSR,NSI,NAR,NAI>::SetParticleVelocities (amrex::Array<amrex
 
 template <int NSR,int NSI,int NAR,int NAI>
 void
-NyxParticleContainer<NSR,NSI,NAR,NAI>::sumParticleMomentum (int   lev,
-					       amrex::Real* mom) const
+NyxParticleContainer<NSR,NSI,NAR,NAI>::sumParticleMomentum (int          lev,
+							    amrex::Real* mom) const
 {
     BL_PROFILE("NyxParticleContainer<NSR,NSI,NAR,NAI>::sumParticleMomentum()");
     BL_ASSERT(NSR >= BL_SPACEDIM+1);
@@ -231,8 +235,8 @@ NyxParticleContainer<NSR,NSI,NAR,NAI>::sumParticleMomentum (int   lev,
 template <int NSR,int NSI,int NAR,int NAI>
 amrex::Real
 NyxParticleContainer<NSR,NSI,NAR,NAI>::estTimestep (amrex::MultiFab&       acceleration,
-				       int             lev,
-				       amrex::Real            cfl) const
+						    int             lev,
+						    amrex::Real            cfl) const
 {
     return estTimestep(acceleration,1.0,lev,cfl);
 }
@@ -240,9 +244,9 @@ NyxParticleContainer<NSR,NSI,NAR,NAI>::estTimestep (amrex::MultiFab&       accel
 template <int NSR,int NSI,int NAR,int NAI>
 amrex::Real
 NyxParticleContainer<NSR,NSI,NAR,NAI>::estTimestep (amrex::MultiFab&       acceleration,
-				       amrex::Real            a,
-				       int             lev,
-				       amrex::Real            cfl) const
+						    amrex::Real            a,
+						    int                    lev,
+						    amrex::Real            cfl) const
 {
     BL_PROFILE("NyxParticleContainer<NSR,NSI,NAR,NAI>::estTimestep(lev)");
     amrex::Real            dt               = 1e50;
@@ -282,21 +286,17 @@ NyxParticleContainer<NSR,NSI,NAR,NAI>::estTimestep (amrex::MultiFab&       accel
         ac_pointer->FillBoundary(geom.periodicity()); // DO WE NEED GHOST CELLS FILLED ???
     }
 
-    for (typename ParticleLevel::const_iterator pmap_it = pmap.begin(), pmapEnd = pmap.end(); pmap_it != pmapEnd; ++pmap_it)
-    {
-        const int        grid = pmap_it->first.first;
-        const AoS&       pbox = pmap_it->second.GetArrayOfStructs();
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+    for (MyConstParIter pti(*this, lev); pti.isValid(); ++pti) {
+        const int grid = pti.index();
+        const AoS&       pbox = pti.GetArrayOfStructs();
         const int        n    = pbox.size();
         const amrex::FArrayBox& gfab = (ac_pointer) ? (*ac_pointer)[grid] : acceleration[grid];
 
         num_particles_at_level += n;
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-
-        for (int i = 0; i < n; i++)
-        {
+        for (int i = 0; i < n; i++) {
             const ParticleType& p = pbox[i];
 
             if (p.id() <= 0) continue;
diff --git a/Source/NyxParticles.cpp b/Source/NyxParticles.cpp
index 3d73594d..30a9993d 100644
--- a/Source/NyxParticles.cpp
+++ b/Source/NyxParticles.cpp
@@ -112,6 +112,7 @@ namespace
 
 bool Nyx::do_dm_particles = false;
 int Nyx::num_particle_ghosts = 1;
+int Nyx::particle_skip_factor = 1;
 
 std::string Nyx::particle_init_type = "";
 std::string Nyx::particle_move_type = "";
@@ -256,7 +257,7 @@ Nyx::read_particle_params ()
     pp.query("particle_initrandom_count_per_box", particle_initrandom_count_per_box);
     pp.query("particle_initrandom_mass", particle_initrandom_mass);
     pp.query("particle_initrandom_iseed", particle_initrandom_iseed);
-
+    pp.query("particle_skip_factor", particle_skip_factor);
     pp.query("ascii_particle_file", ascii_particle_file);
 
     // Input error check
@@ -289,10 +290,11 @@ Nyx::read_particle_params ()
 
     // Input error check
     if (!binary_particle_file.empty() && (particle_init_type != "BinaryFile" &&
-                                          particle_init_type != "BinaryMetaFile"))
+                                          particle_init_type != "BinaryMetaFile" && 
+					  particle_init_type != "BinaryMortonFile"))
     {
         if (ParallelDescriptor::IOProcessor())
-            std::cerr << "ERROR::particle_init_type is not BinaryFile or BinaryMetaFile but you specified binary_particle_file" << std::endl;
+            std::cerr << "ERROR::particle_init_type is not BinaryFile, BinaryMetaFile, or BinaryMortonFile but you specified binary_particle_file" << std::endl;
         amrex::Error();
     }
 
@@ -482,6 +484,24 @@ Nyx::init_particles ()
             if (init_with_sph_particles == 1)
                 SPHPC->InitFromBinaryMetaFile(sph_particle_file, BL_SPACEDIM + 1);
         }
+        else if (particle_init_type == "BinaryMortonFile")
+        {
+            // Morton-ordered input only supports DM particles; reject SPH
+            // input unconditionally, not only when verbose output is on.
+            if (init_with_sph_particles == 1)
+                amrex::Error("Morton-ordered input is not supported for sph particles.");
+            if (verbose)
+                amrex::Print() << "\nInitializing DM particles from morton-ordered binary file \""
+                               << binary_particle_file << "\" ...\n\n";
+            //
+            // The second argument is how many Reals we read into `m_data[]`
+            // after reading in `m_pos[]` in each of the binary particle files.
+            // Here we're reading in the particle mass and velocity.
+            //
+            DMPC->InitFromBinaryMortonFile(binary_particle_file,
+                                           BL_SPACEDIM + 1,
+                                           particle_skip_factor);
+        }
         else
         {
             amrex::Error("not a valid input for nyx.particle_init_type");
diff --git a/Source/Nyx_F.H b/Source/Nyx_F.H
index 6049f962..f01c1cd8 100644
--- a/Source/Nyx_F.H
+++ b/Source/Nyx_F.H
@@ -8,12 +8,18 @@
 extern "C"
 {
 #endif
-  void fort_integrate_comoving_a 
+
+  void fort_alloc_simd_vec();
+  void fort_dealloc_simd_vec();
+
+  void fort_integrate_comoving_a
     (amrex::Real* old_a, amrex::Real* new_a, amrex::Real* dt);
 
   void fort_integrate_comoving_a_to_z 
     (amrex::Real* old_a, amrex::Real* z_value, amrex::Real* dt);
 
+  void set_simd(const int *simd_width);
+
   //  void fort_get_omm    (amrex::Real* omm);
   //  void fort_get_omb    (amrex::Real* frac);
   //  void fort_get_hubble (amrex::Real* hubble);
@@ -34,14 +40,16 @@ extern "C"
   void fort_get_method_params(int* HYP_GROW);
 
   void fort_set_method_params
-    (const int& dm, const int& NumAdv, const int& do_hydro,
+    (const int& dm, const int& NumAdv, const int& Ndiag, const int& do_hydro,
      const int& ppm_type, const int& ppm_ref,
      const int& ppm_flatten_before_integrals,
      const int& use_colglaz, const int& use_flattening,
      const int& corner_coupling,
      const int& version_2, const int& use_const_species,
      const amrex::Real& gamma_in, const int& normalize_species,
-     const int& heat_cool_type, const MPI_Comm& comm);
+     const int& heat_cool_type, const int& inhomo_reion);
+
+  void fort_tabulate_rates();
 
   void filcc
     (const amrex::Real * q, ARLIM_P(q_lo), ARLIM_P(q_hi),
@@ -246,8 +254,20 @@ extern "C"
      amrex::Real* comoving_a,
      const int* print_fortran_warnings);
 
-  void fort_init_this_z
-    (amrex::Real* comoving_a);
+  void fort_compute_temp_vec
+    (const int lo[], const int hi[],
+     const BL_FORT_FAB_ARG(state),
+     const BL_FORT_FAB_ARG(diag_eos),
+     amrex::Real* comoving_a,
+     const int* print_fortran_warnings);
+
+  void fort_interp_to_this_z
+    (const amrex::Real* z);
+
+  void fort_setup_eos_params
+    (amrex::Real* eos_nr_eps,
+     amrex::Real* vode_rtol,
+     amrex::Real* vode_atol_scaled);
 
   void fort_compute_max_temp_loc
     (const int lo[], const int hi[],
@@ -261,7 +281,7 @@ extern "C"
      const BL_FORT_FAB_ARG(state),
      const BL_FORT_FAB_ARG(diag_eos),
      amrex::Real* rho_ave, amrex::Real* rho_T_sum, amrex::Real* T_sum,
-     amrex::Real* T_meanrho_sum, amrex::Real* rho_sum,
+     amrex::Real* Tinv_sum, amrex::Real* T_meanrho_sum, amrex::Real* rho_sum,
      amrex::Real* vol_sum, amrex::Real* vol_mn_sum);
 
 #ifdef AUX_UPDATE
@@ -281,4 +301,5 @@ extern "C"
 #ifdef __cplusplus
 }
 #endif
+
 #endif
diff --git a/Source/Nyx_halos.cpp b/Source/Nyx_halos.cpp
index 57cb7524..66c5b925 100644
--- a/Source/Nyx_halos.cpp
+++ b/Source/Nyx_halos.cpp
@@ -170,9 +170,6 @@ Nyx::halo_find (Real dt)
        for (BoxIterator bit(vertBox); bit.ok(); ++bit)
          {
            IntVect vert = bit();
-           int i = vert[0];
-           int j = vert[1];
-           int k = vert[2];
            IntVect iv(D_DECL(vertices[vert[0]][0],
                              vertices[vert[1]][1],
                              vertices[vert[2]][2]));
@@ -190,6 +187,27 @@ Nyx::halo_find (Real dt)
        std::cout << "  " << std::endl;
        std::cout << " *************************************** " << std::endl;
 
+       // agn_density_old will hold the density from depositing the
+       // mass of existing particles.
+       MultiFab agn_density_old(simBA, simDM, ncomp1, nghost1);
+       agn_density_old.setVal(0.0);
+
+       // Deposit the mass now in the particles onto agn_density_old, on grid.
+       // (No change to mass of particles.)
+       Nyx::theAPC()->AssignDensitySingleLevel(agn_density_old, level);
+
+       // Make sure the density put into ghost cells is added to valid regions
+       agn_density_old.SumBoundary(geom.periodicity());
+
+       // Convert new_state to primitive variables: rho, velocity, energy/rho.
+       conserved_to_primitive(new_state);
+
+       // Add agn_density_old to new_state, which holds primitive variables.
+       // This is from depositing mass of existing particles.
+       // Later, we'll subtract the deposited mass of all particles, old & new.
+       amrex::MultiFab::Add(new_state, agn_density_old,
+                            comp0, Density, ncomp1, nghost0);
+
 #ifdef REEBER
        for (const Halo& h : reeber_halos)
        {
@@ -243,10 +261,17 @@ Nyx::halo_find (Real dt)
        // Call Redistribute so that the new particles get their cell, grid and process defined
        Nyx::theAPC()->Redistribute(lev_min, lev_max, ngrow);
 
+       // Fill the "ghosts" vector with particles in ghost cells of each grid
        Nyx::theAPC()->fillNeighbors(level);
+
+       // ComputeOverlap sets the ID of a particle to -1 if it is less than "cutoff" away from another
+       //   particle and if it is newer than that particle
        Nyx::theAPC()->ComputeOverlap(level);
+
+       // Clear the Neighbor Particle data structure
        Nyx::theAPC()->clearNeighbors(level);
 
+       // This Redistribute is used to remove particles whose IDs have been set to -1 in ComputeOverlap
        Nyx::theAPC()->Redistribute(lev_min, lev_max, ngrow);
 
        // agn_density will hold the density we're going to remove from the grid.
@@ -260,21 +285,26 @@ Nyx::halo_find (Real dt)
        // Make sure the density put into ghost cells is added to valid regions
        agn_density.SumBoundary(geom.periodicity());
 
-       // Take away the density from the gas that was added to the AGN particle.
+       // Take away the density from the gas that was added to the AGN particle:
+       // density is in new_state, which holds primitive variables.
        amrex::MultiFab::Subtract(new_state, agn_density,
                                  comp0, Density, ncomp1, nghost0);
 
-       cout << "Going into ComputeParticleVelocity (no energy), number of AGN particles on this proc is "
-            << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl;
+       // Convert new_state to conserved variables: rho, momentum, energy.
+       primitive_to_conserved(new_state);
+
+       pout() << "Going into ComputeParticleVelocity (no energy), number of AGN particles on this proc is "
+              << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl;
 
        // Re-set the particle velocity (but not energy) after accretion,
-       // using change of momentum density in state.
+       // using change of momentum density from orig_state to new_state,
+       // which hold conserved variables.
        // No change to state, other than filling ghost cells.
        int add_energy = 0;
        Nyx::theAPC()->ComputeParticleVelocity(level, orig_state, new_state, add_energy);
 
-       cout << "Going into ReleaseEnergy, number of AGN particles on this proc is "
-            << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl;
+       pout() << "Going into ReleaseEnergy, number of AGN particles on this proc is "
+              << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl;
        // AGN particles: may zero out energy.
        // new_state: may increase internal and total energy.
        MultiFab& D_new = get_new_data(DiagEOS_Type);
@@ -361,7 +391,7 @@ Nyx::halo_accrete (Real dt)
    const DistributionMapping& simDM = new_state.DistributionMap();
    int ncomp = new_state.nComp();
 
-   // First copy the existing state into orig_state.
+   // First copy the existing state (new_state) into orig_state.
    MultiFab orig_state(simBA, simDM, ncomp, nghost1);
    MultiFab::Copy(orig_state, new_state,
                   comp0, comp0, ncomp, nghost1);
@@ -373,8 +403,8 @@ Nyx::halo_accrete (Real dt)
    MultiFab agn_density_lost(simBA, simDM, ncomp1, nghost1);
    agn_density_lost.setVal(0.0);
 
-   cout << "Going into AccreteMass, number of AGN particles on this proc is "
-        << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl;
+   pout() << "Going into AccreteMass, number of AGN particles on this proc is "
+          << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl;
    // AGN particles: increase mass and energy.
    // new_state: no change, other than filling in ghost cells.
    // agn_density_lost: gets filled in.
@@ -394,8 +424,8 @@ Nyx::halo_accrete (Real dt)
    // using change of momentum density in state.
    // No change to state, other than filling ghost cells.
    int add_energy = 1;
-   cout << "Going into ComputeParticleVelocity (and energy), number of AGN particles on this proc is "
-        << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl;
+   pout() << "Going into ComputeParticleVelocity (and energy), number of AGN particles on this proc is "
+          << Nyx::theAPC()->TotalNumberOfParticles(true, true) << endl;
    Nyx::theAPC()->ComputeParticleVelocity(level, orig_state, new_state, add_energy);
    // Now new_state = get_new_data(State_Type) has been updated.
 }
diff --git a/Source/Nyx_hydro.cpp b/Source/Nyx_hydro.cpp
index 604ac4a4..c28b1d3b 100644
--- a/Source/Nyx_hydro.cpp
+++ b/Source/Nyx_hydro.cpp
@@ -122,14 +122,15 @@ Nyx::just_the_hydro (Real time,
     // Create FAB for extended grid values (including boundaries) and fill.
     MultiFab S_old_tmp(S_old.boxArray(), S_old.DistributionMap(), NUM_STATE, NUM_GROW);
     FillPatch(*this, S_old_tmp, NUM_GROW, time, State_Type, 0, NUM_STATE);
-    MultiFab D_old_tmp(D_old.boxArray(), D_old.DistributionMap(), 2, NUM_GROW);
-    FillPatch(*this, D_old_tmp, NUM_GROW, time, DiagEOS_Type, 0, 2);
+
+    MultiFab D_old_tmp(D_old.boxArray(), D_old.DistributionMap(), D_old.nComp(), NUM_GROW);
+    FillPatch(*this, D_old_tmp, NUM_GROW, time, DiagEOS_Type, 0, D_old.nComp());
 
     if (add_ext_src && strang_split) 
         strang_first_step(time,dt,S_old_tmp,D_old_tmp);
 
 #ifdef _OPENMP
-#pragma omp parallel reduction(max:courno)
+#pragma omp parallel reduction(max:courno) reduction(+:e_added,ke_added)
 #endif
        {
        FArrayBox flux[BL_SPACEDIM], u_gdnv[BL_SPACEDIM];
diff --git a/Source/Nyx_nd.f90 b/Source/Nyx_nd.f90
index f70929eb..9c48bc4d 100644
--- a/Source/Nyx_nd.f90
+++ b/Source/Nyx_nd.f90
@@ -186,11 +186,12 @@ end subroutine fort_set_small_values
 ! :::
 
       subroutine fort_set_method_params( &
-                 dm, numadv, do_hydro, ppm_type_in, ppm_ref_in, &
+                 dm, numadv, ndiag_in, do_hydro, ppm_type_in, ppm_ref_in, &
                  ppm_flatten_before_integrals_in, &
                  use_colglaz_in, use_flattening_in, &
                  corner_coupling_in, version_2_in, &
-                 use_const_species_in, gamma_in, normalize_species_in, heat_cool_in, comm) &
+                 use_const_species_in, gamma_in, normalize_species_in, &
+                 heat_cool_in, inhomo_reion_in) &
                  bind(C, name = "fort_set_method_params")
 
         ! Passing data from C++ into f90
@@ -202,12 +203,12 @@ subroutine fort_set_method_params( &
         use comoving_module, only : comoving_type
         use network, only : nspec, naux
         use eos_module
-        use parallel
 
         implicit none
 
         integer,  intent(in) :: dm
         integer,  intent(in) :: numadv
+        integer,  intent(in) :: ndiag_in
         integer,  intent(in) :: do_hydro
         integer,  intent(in) :: ppm_type_in
         integer,  intent(in) :: ppm_ref_in
@@ -220,19 +221,13 @@ subroutine fort_set_method_params( &
         integer,  intent(in) :: use_const_species_in
         integer,  intent(in) :: normalize_species_in
         integer,  intent(in) :: heat_cool_in
-        integer,  intent(in), optional :: comm
+        integer,  intent(in) :: inhomo_reion_in
 
         integer             :: QNEXT
         integer             :: UNEXT
 
         integer             :: iadv, ispec
 
-        if (present(comm)) then
-          call parallel_initialize(comm=comm)
-        else
-          call parallel_initialize()
-        end if
-
         use_const_species = use_const_species_in
 
         iorder = 2
@@ -242,6 +237,8 @@ subroutine fort_set_method_params( &
 
         comoving_type = 1
 
+        NDIAG = ndiag_in
+
         if (do_hydro .eq. 0) then
 
            NVAR = 1
@@ -256,11 +253,17 @@ subroutine fort_set_method_params( &
 
            TEMP_COMP = -1
              NE_COMP = -1
+            ZHI_COMP = -1
 
         else
 
            TEMP_COMP = 1
              NE_COMP = 2
+            if (inhomo_reion_in .gt. 0) then
+               ZHI_COMP = 3
+            else
+               ZHI_COMP = -1
+            endif
 
            !---------------------------------------------------------------------
            ! conserved state components
@@ -362,13 +365,10 @@ subroutine fort_set_method_params( &
            normalize_species            = normalize_species_in
 
            heat_cool_type               = heat_cool_in
+           inhomo_reion                 = inhomo_reion_in
 
         end if
 
-        if (heat_cool_type .eq. 1 .or. heat_cool_type .eq. 3 .or. heat_cool_type .eq. 5) then
-           call tabulate_rates()
-        end if
-
         ! Easy indexing for the passively advected quantities.  
         ! This lets us loop over all four groups (advected, species, aux)
         ! in a single loop.
diff --git a/Source/SourceTerms/Nyx_sources.cpp b/Source/SourceTerms/Nyx_sources.cpp
index dd75d435..fe64c711 100644
--- a/Source/SourceTerms/Nyx_sources.cpp
+++ b/Source/SourceTerms/Nyx_sources.cpp
@@ -27,12 +27,14 @@ Nyx::get_old_source (Real      old_time,
     Dborder.define(grids, D_old.DistributionMap(), D_old.nComp(), 4);
 
     FillPatch(*this, Sborder, 4, old_time, State_Type, Density, Sborder.nComp());
-    FillPatch(*this, Dborder, 4, old_time, DiagEOS_Type, 0, 2);
+    FillPatch(*this, Dborder, 4, old_time, DiagEOS_Type, 0, D_old.nComp());
+
+    fort_interp_to_this_z(&z);
 
 #ifdef _OPENMP
 #pragma omp parallel
 #endif
-    for (MFIter mfi(S_old,true); mfi.isValid(); ++mfi)
+    for (MFIter mfi(S_old, MFItInfo().SetDynamic(true).EnableTiling()); mfi.isValid(); ++mfi)
     {
         // We explicitly want to fill the ghost regions of the ext_src array
         const Box& bx = mfi.growntilebox(ext_src.nGrow());
@@ -84,13 +86,15 @@ Nyx::get_new_source (Real      old_time,
 
     FillPatch(*this, Sborder_old, 4, old_time, State_Type  , Density, Sborder_old.nComp());
     FillPatch(*this, Sborder_new, 4, new_time, State_Type  , Density, Sborder_new.nComp());
-    FillPatch(*this, Dborder_old, 4, old_time, DiagEOS_Type, 0      , 2);
-    FillPatch(*this, Dborder_new, 4, new_time, DiagEOS_Type, 0      , 2);
+    FillPatch(*this, Dborder_old, 4, old_time, DiagEOS_Type, 0      , Dborder_old.nComp());
+    FillPatch(*this, Dborder_new, 4, new_time, DiagEOS_Type, 0      , Dborder_new.nComp());
+
+    fort_interp_to_this_z(&z);
 
 #ifdef _OPENMP
 #pragma omp parallel
 #endif
-    for (MFIter mfi(S_old,true); mfi.isValid(); ++mfi)
+    for (MFIter mfi(S_old, MFItInfo().SetDynamic(true).EnableTiling()); mfi.isValid(); ++mfi)
     {
         // We explicitly only want to fill the valid region
         const Box& bx = mfi.tilebox();
diff --git a/Source/Src_3d/compute_temp_3d.f90 b/Source/Src_3d/compute_temp_3d.f90
index ae301faa..4fa99ffb 100644
--- a/Source/Src_3d/compute_temp_3d.f90
+++ b/Source/Src_3d/compute_temp_3d.f90
@@ -9,9 +9,12 @@ subroutine fort_compute_temp(lo,hi, &
 
       use amrex_fort_module, only : rt => amrex_real
       use eos_module
-      use atomic_rates_module, only: this_z, interp_to_this_z
+      use atomic_rates_module, only: this_z
       use meth_params_module, only : NVAR, URHO, UMX, UMY, UMZ, UEINT, UEDEN, &
-                                     TEMP_COMP, NE_COMP, small_temp, heat_cool_type
+                                     NDIAG, TEMP_COMP, NE_COMP, ZHI_COMP, &
+                                     small_temp, heat_cool_type
+      use reion_aux_module,    only: zhi_flash, zheii_flash, flash_h, flash_he, &
+                                     inhomogeneous_on
       use  eos_params_module
 
       implicit none
@@ -20,20 +23,27 @@ subroutine fort_compute_temp(lo,hi, &
       integer         , intent(in   ) :: d_l1,d_l2,d_l3,d_h1,d_h2,d_h3
       integer         , intent(in   ) :: print_fortran_warnings
       real(rt), intent(inout) ::    state(s_l1:s_h1,s_l2:s_h2,s_l3:s_h3,NVAR)
-      real(rt), intent(inout) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,2)
+      real(rt), intent(inout) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,NDIAG)
       real(rt), intent(in   ) :: comoving_a
 
-      integer          :: i,j,k
+      integer          :: i,j,k, JH, JHe
       real(rt) :: rhoInv,eint
       real(rt) :: ke,dummy_pres
       real(rt) :: z
 
       z = 1.d0/comoving_a - 1.d0
 
-      if (heat_cool_type.gt.0) then
-          if (z .ne. this_z) &
-             call interp_to_this_z(z)
-      end if
+      ! Flash reionization?
+      if ((flash_h .eqv. .true.) .and. (z .gt. zhi_flash)) then
+         JH = 0
+      else
+         JH = 1
+      endif
+      if ((flash_he .eqv. .true.) .and. (z .gt. zheii_flash)) then
+         JHe = 0
+      else
+         JHe = 1
+      endif
 
       do k = lo(3),hi(3)
          do j = lo(2),hi(2)
@@ -59,7 +69,13 @@ subroutine fort_compute_temp(lo,hi, &
 
                    eint = state(i,j,k,UEINT) * rhoInv
 
-                   call nyx_eos_T_given_Re(diag_eos(i,j,k,TEMP_COMP), diag_eos(i,j,k,NE_COMP), &
+                   if ((inhomogeneous_on) .and. (z .gt. diag_eos(i,j,k,ZHI_COMP))) then
+                       JH = 0
+                   else
+                       JH = 1
+                   endif
+
+                   call nyx_eos_T_given_Re(JH, JHe, diag_eos(i,j,k,TEMP_COMP), diag_eos(i,j,k,NE_COMP), &
                                            state(i,j,k,URHO), eint, comoving_a)
 
                else
@@ -85,14 +101,130 @@ subroutine fort_compute_temp(lo,hi, &
 
       end subroutine fort_compute_temp
 
+      ! Row-vectorized variant of fort_compute_temp.  For every (j,k) row, cells are
+      ! gathered by the sign of (rho e) and each group is passed to a vector EOS routine
+      ! in one call.  Cells with (rho e) > 0 get TEMP_COMP and NE_COMP back from
+      ! nyx_eos_T_given_Re_vec; all others get TEMP_COMP = small_temp, with UEINT and UEDEN
+      ! rebuilt from the energy slot filled by nyx_eos_given_RT_vec (presumably specific e).
+      subroutine fort_compute_temp_vec(lo,hi, &
+                                   state   ,s_l1,s_l2,s_l3, s_h1,s_h2,s_h3, &
+                                   diag_eos,d_l1,d_l2,d_l3, d_h1,d_h2,d_h3, &
+                                   comoving_a, print_fortran_warnings) &
+      bind(C, name = "fort_compute_temp_vec")
+
+      use amrex_fort_module, only : rt => amrex_real
+      use eos_module
+      use atomic_rates_module, only: this_z
+      use meth_params_module, only : NVAR, URHO, UMX, UMY, UMZ, UEINT, UEDEN, &
+                                     NDIAG, TEMP_COMP, NE_COMP, small_temp, heat_cool_type
+      use  eos_params_module
+
+      implicit none
+      integer         , intent(in   ) :: lo(3),hi(3)
+      integer         , intent(in   ) :: s_l1,s_l2,s_l3,s_h1,s_h2,s_h3
+      integer         , intent(in   ) :: d_l1,d_l2,d_l3,d_h1,d_h2,d_h3
+      integer         , intent(in   ) :: print_fortran_warnings
+      real(rt), intent(inout) ::    state(s_l1:s_h1,s_l2:s_h2,s_l3:s_h3,NVAR)
+      real(rt), intent(inout) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,NDIAG)
+      real(rt), intent(in   ) :: comoving_a
+
+      integer          :: i,j,k
+      real(rt) :: rhoInv,eint
+      real(rt), dimension(hi(1)-lo(1)+1) :: ke,dummy_pres,small_temp_vec
+      real(rt) :: z
+      real(rt), dimension(hi(1)-lo(1)+1,4) :: eos_inputs_pos_ueint, eos_inputs_neg_ueint
+      ! i-indices of the gathered cells, one list per group so they cannot clobber each other
+      integer :: pos_indices(hi(1)-lo(1)+1), neg_indices(hi(1)-lo(1)+1)
+      integer :: pos_eos_count, neg_eos_count
+
+      z = 1.d0/comoving_a - 1.d0
+
+      ! Abort on any non-positive density before it is inverted below
+      do k = lo(3),hi(3)
+         do j = lo(2),hi(2)
+            do i = lo(1),hi(1)
+               if (state(i,j,k,URHO) <= 0.d0) then
+                  print *,'   '
+                  print *,'>>> Error: compute_temp ',i,j,k
+                  print *,'>>> ... negative density ',state(i,j,k,URHO)
+                  print *,'    '
+                  call bl_error("Error:: compute_temp_3d.f90 :: compute_temp")
+               end if
+            enddo
+         enddo
+      enddo
+
+      do k = lo(3),hi(3)
+         do j = lo(2),hi(2)
+
+            pos_eos_count = 0
+            neg_eos_count = 0
+
+            do i = lo(1),hi(1)
+               rhoInv = 1.d0 / state(i,j,k,URHO)
+               if (state(i,j,k,UEINT) > 0.d0) then
+                   pos_eos_count = pos_eos_count + 1
+                   eos_inputs_pos_ueint(pos_eos_count,1) = diag_eos(i,j,k,TEMP_COMP)
+                   eos_inputs_pos_ueint(pos_eos_count,2) = diag_eos(i,j,k,NE_COMP)
+                   eos_inputs_pos_ueint(pos_eos_count,3) = state(i,j,k,URHO)
+                   eos_inputs_pos_ueint(pos_eos_count,4) = state(i,j,k,UEINT)*rhoInv
+                   pos_indices(pos_eos_count) = i
+               else
+                   neg_eos_count = neg_eos_count + 1
+                   eos_inputs_neg_ueint(neg_eos_count,1) = diag_eos(i,j,k,TEMP_COMP) ! DON'T NEED THIS; GET RID OF IT
+                   eos_inputs_neg_ueint(neg_eos_count,2) = diag_eos(i,j,k,NE_COMP)
+                   eos_inputs_neg_ueint(neg_eos_count,3) = state(i,j,k,URHO)
+                   eos_inputs_neg_ueint(neg_eos_count,4) = state(i,j,k,UEINT)
+                   neg_indices(neg_eos_count) = i
+               end if
+             end do
+
+             ! For cells with positive E_int
+             call nyx_eos_T_given_Re_vec(eos_inputs_pos_ueint(1:pos_eos_count,1), &
+                                         eos_inputs_pos_ueint(1:pos_eos_count,2), &
+                                         eos_inputs_pos_ueint(1:pos_eos_count,3), &
+                                         eos_inputs_pos_ueint(1:pos_eos_count,4), &
+                                         comoving_a, &
+                                         pos_eos_count)
+             diag_eos(pos_indices(1:pos_eos_count),j,k,TEMP_COMP) = eos_inputs_pos_ueint(1:pos_eos_count,1)
+             diag_eos(pos_indices(1:pos_eos_count),j,k,NE_COMP)   = eos_inputs_pos_ueint(1:pos_eos_count,2)
+
+             ! For cells with negative E_int: impose small_temp and get the matching energy
+             small_temp_vec(1:neg_eos_count) = small_temp
+             call nyx_eos_given_RT_vec(eos_inputs_neg_ueint(1:neg_eos_count,4), &
+                                   dummy_pres(1:neg_eos_count), &
+                                   eos_inputs_neg_ueint(1:neg_eos_count,3), &
+                                   small_temp_vec(1:neg_eos_count), &
+                                   eos_inputs_neg_ueint(1:neg_eos_count,2), &
+                                   comoving_a, &
+                                   neg_eos_count)
+
+             ! Use each cell's own density; the scalar rhoInv only holds the row's last cell
+             ke(1:neg_eos_count) = 0.5d0 * (state(neg_indices(1:neg_eos_count),j,k,UMX)**2 + &
+                                            state(neg_indices(1:neg_eos_count),j,k,UMY)**2 + &
+                                            state(neg_indices(1:neg_eos_count),j,k,UMZ)**2) / &
+                                   state(neg_indices(1:neg_eos_count),j,k,URHO)
+
+             diag_eos(neg_indices(1:neg_eos_count),j,k,TEMP_COMP) = small_temp_vec(1:neg_eos_count)
+             state(neg_indices(1:neg_eos_count),j,k,UEINT) = eos_inputs_neg_ueint(1:neg_eos_count,3) * &
+                                                             eos_inputs_neg_ueint(1:neg_eos_count,4)
+             ! Total energy density = internal energy density + kinetic energy density
+             state(neg_indices(1:neg_eos_count),j,k,UEDEN) = state(neg_indices(1:neg_eos_count),j,k,UEINT) + &
+                                                             ke(1:neg_eos_count)
+
+         enddo
+      enddo
+
+      end subroutine fort_compute_temp_vec
+
       subroutine fort_compute_rho_temp(lo,hi,dx, &
                                      state,s_l1,s_l2,s_l3,s_h1,s_h2,s_h3, &
                                   diag_eos,d_l1,d_l2,d_l3,d_h1,d_h2,d_h3, &
                                   rho_ave,rho_T_sum, &
-                                  T_sum,T_meanrho_sum,rho_sum,vol_sum,vol_mn_sum) &
+                                  T_sum,Tinv_sum,T_meanrho_sum,rho_sum,vol_sum,vol_mn_sum) &
       bind(C, name = "fort_compute_rho_temp")
 
-      use meth_params_module, only : NVAR, URHO, TEMP_COMP
+      use meth_params_module, only : NVAR, URHO, NDIAG, TEMP_COMP
 
       use amrex_fort_module, only : rt => amrex_real
       implicit none
@@ -102,8 +234,8 @@ subroutine fort_compute_rho_temp(lo,hi,dx, &
       real(rt), intent(in   ) :: dx(3)
       real(rt), intent(in   ) :: rho_ave
       real(rt), intent(in   ) ::    state(s_l1:s_h1,s_l2:s_h2,s_l3:s_h3,NVAR)
-      real(rt), intent(in   ) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,2)
-      real(rt), intent(inout) :: rho_T_sum, rho_sum, T_sum, T_meanrho_sum
+      real(rt), intent(inout) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,NDIAG)
+      real(rt), intent(inout) :: rho_T_sum, rho_sum, T_sum, Tinv_sum, T_meanrho_sum
       real(rt), intent(inout) :: vol_sum, vol_mn_sum
 
       integer          :: i,j,k
@@ -116,6 +248,7 @@ subroutine fort_compute_rho_temp(lo,hi,dx, &
          do j = lo(2),hi(2)
             do i = lo(1),hi(1)
                    T_sum =     T_sum + vol*diag_eos(i,j,k,TEMP_COMP)
+                Tinv_sum =  Tinv_sum + state(i,j,k,URHO)/diag_eos(i,j,k,TEMP_COMP)
                rho_T_sum = rho_T_sum + state(i,j,k,URHO)*diag_eos(i,j,k,TEMP_COMP)
                  rho_sum =   rho_sum + state(i,j,k,URHO)
                  if ( (state(i,j,k,URHO) .lt. rho_hi) .and. &
@@ -137,7 +270,7 @@ subroutine fort_compute_max_temp_loc(lo,hi, &
                                            max_temp, den_maxt, imax, jmax, kmax) &
       bind(C, name = "fort_compute_max_temp_loc")
 
-      use meth_params_module, only : TEMP_COMP, NVAR, URHO
+      use meth_params_module, only : TEMP_COMP, NVAR, URHO, NDIAG
 
       use amrex_fort_module, only : rt => amrex_real
       implicit none
@@ -145,7 +278,7 @@ subroutine fort_compute_max_temp_loc(lo,hi, &
       integer         , intent(in   ) :: s_l1,s_l2,s_l3,s_h1,s_h2,s_h3
       integer         , intent(in   ) :: d_l1,d_l2,d_l3,d_h1,d_h2,d_h3
       real(rt), intent(inout) ::    state(s_l1:s_h1,s_l2:s_h2,s_l3:s_h3,NVAR)
-      real(rt), intent(inout) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,2)
+      real(rt), intent(inout) :: diag_eos(d_l1:d_h1,d_l2:d_h2,d_l3:d_h3,NDIAG)
       real(rt), intent(in   ) :: max_temp
       real(rt), intent(  out) :: den_maxt
       integer         , intent(inout) :: imax,jmax,kmax
diff --git a/Source/comoving.cpp b/Source/comoving.cpp
index 28e450ca..42f233a7 100644
--- a/Source/comoving.cpp
+++ b/Source/comoving.cpp
@@ -230,9 +230,13 @@ Nyx::comoving_a_post_restart (const std::string& restart_file)
         std::cout << "...setting old_a_time to " << old_a_time << std::endl;
     }
 
+#ifdef HEATCOOL
      // Initialize "this_z" in the atomic_rates_module
-     if (heat_cool_type == 1 || heat_cool_type == 3 || heat_cool_type == 5)
-         fort_init_this_z(&old_a);
+     if (heat_cool_type == 1 || heat_cool_type == 3 || heat_cool_type == 5 || heat_cool_type == 7) {
+         Real old_z = 1.0/old_a - 1.0;
+         fort_interp_to_this_z(&old_z);
+     }
+#endif
 }
 
 void
diff --git a/Source/main.cpp b/Source/main.cpp
index 03766423..e7062a37 100644
--- a/Source/main.cpp
+++ b/Source/main.cpp
@@ -411,6 +411,10 @@ main (int argc, char* argv[])
 #endif
     const Real time_before_main_loop = ParallelDescriptor::second();
 
+#ifdef USE_CVODE
+    Nyx::alloc_simd_vec();
+#endif
+
     bool finished(false);
 
     while ( ! finished) {
@@ -512,6 +516,10 @@ main (int argc, char* argv[])
 #endif
     }  // ---- end while( ! finished)
 
+#ifdef USE_CVODE
+    Nyx::dealloc_simd_vec();
+#endif
+
     const Real time_without_init = ParallelDescriptor::second() - time_before_main_loop;
     if (ParallelDescriptor::IOProcessor()) std::cout << "Time w/o init: " << time_without_init << std::endl;
 
diff --git a/Source/meth_params.f90 b/Source/meth_params.f90
index 51fd612f..89874091 100644
--- a/Source/meth_params.f90
+++ b/Source/meth_params.f90
@@ -17,9 +17,9 @@ module meth_params_module
   integer, parameter     :: MAXADV  = 5
 
   ! NTHERM: number of thermodynamic variables
-  integer         , save :: NTHERM, NVAR
+  integer         , save :: NTHERM, NVAR, NDIAG
   integer         , save :: URHO, UMX, UMY, UMZ, UEDEN, UEINT, UFA, UFS, UFX
-  integer         , save :: TEMP_COMP, NE_COMP
+  integer         , save :: TEMP_COMP, NE_COMP, ZHI_COMP
 
   ! QTHERM: number of primitive variables
   integer         , save :: QTHERM, QVAR
@@ -39,6 +39,7 @@ module meth_params_module
   integer         , save :: use_const_species
   integer         , save :: normalize_species
   integer         , save :: heat_cool_type
+  integer         , save :: inhomo_reion
   integer         , save :: grav_source_type
 
   integer, save :: npassive
diff --git a/Source/misc_params.f90 b/Source/misc_params.f90
new file mode 100644
index 00000000..fdaac1d7
--- /dev/null
+++ b/Source/misc_params.f90
@@ -0,0 +1,7 @@
+module misc_params
+  ! Holds miscellaneous run-time parameters as module-level variables.
+  implicit none
+  ! NOTE(review): presumably the value of the nyx.simd_width input (cells per SIMD ODE batch) - confirm where it is set
+  integer :: simd_width
+
+end module misc_params
diff --git a/Source/strang_splitting.cpp b/Source/strang_splitting.cpp
index 32b79a42..581b5437 100644
--- a/Source/strang_splitting.cpp
+++ b/Source/strang_splitting.cpp
@@ -14,6 +14,11 @@ Nyx::strang_first_step (Real time, Real dt, MultiFab& S_old, MultiFab& D_old)
     const Real a = get_comoving_a(time);
     const Real* dx = geom.CellSize();
 
+    {
+      const Real z = 1.0/a - 1.0;
+      fort_interp_to_this_z(&z);
+    }
+
 #ifdef _OPENMP
 #pragma omp parallel
 #endif
@@ -50,13 +55,19 @@ Nyx::strang_second_step (Real time, Real dt, MultiFab& S_new, MultiFab& D_new)
     int min_iter_grid;
     int max_iter_grid;
 
-    const Real a = get_comoving_a(time);
+    // Set a at the half of the time step in the second strang
+    const Real a = get_comoving_a(time-half_dt);
     const Real* dx = geom.CellSize();
 
     compute_new_temp();
 
+    {
+      const Real z = 1.0/a - 1.0;
+      fort_interp_to_this_z(&z);
+    }
+
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel private(min_iter_grid,max_iter_grid) reduction(min:min_iter) reduction(max:max_iter)
 #endif
     for (MFIter mfi(S_new,true); mfi.isValid(); ++mfi)
     {
diff --git a/Source/write_info.cpp b/Source/write_info.cpp
index 74ee74b9..1c1a06df 100644
--- a/Source/write_info.cpp
+++ b/Source/write_info.cpp
@@ -14,12 +14,12 @@ Nyx::write_info ()
         MultiFab& D_new = get_new_data(DiagEOS_Type);
 	Real      max_t = 0;
 
-        Real rho_T_avg=0.0, T_avg=0.0, T_meanrho=0.0;
+        Real rho_T_avg=0.0, T_avg=0.0, Tinv_avg=0.0, T_meanrho=0.0;
 	if (do_hydro)
         {
             compute_new_temp();
             max_t = D_new.norm0(Temp_comp);
-            compute_rho_temp(rho_T_avg, T_avg, T_meanrho);
+            compute_rho_temp(rho_T_avg, T_avg, Tinv_avg, T_meanrho);
 	}
 #endif
 
@@ -37,18 +37,20 @@ Nyx::write_info ()
 
             if (time == 0.0)
             {
-                data_loga << std::setw( 8) <<  "   nstep";
+                data_loga << std::setw( 8) <<  "#  nstep";
                 data_loga << std::setw(14) <<  "       time    ";
-                data_loga << std::setw(14) <<  "        dt     ";
-                data_loga << std::setw(14) <<  "      redshift ";
-                data_loga << std::setw(14) <<  "       a       ";
+                data_loga << std::setw(14) <<  "       dt      ";
+                data_loga << std::setw(14) <<  "         z     ";
+                data_loga << std::setw(14) <<  "      a        ";
 #ifndef NO_HYDRO
                 if (do_hydro == 1)
                 {
-                   data_loga << std::setw(14) <<  "  max temp     ";
-                   data_loga << std::setw(14) <<  "rho-wgted temp ";
-                   data_loga << std::setw(14) <<  " V-wgted temp  ";
-                   data_loga << std::setw(14) <<  " T @ <rho>     ";
+                   data_loga << std::setw(14) <<  "    T_max      ";
+                   data_loga << std::setw(14) <<  "  <T>_rho      ";
+                   data_loga << std::setw(14) <<  "  <T>_V        ";
+                   data_loga << std::setw(14) <<  "T @ <rho>      ";
+                   data_loga << std::setw(14) <<  "T(21cm)        ";
+                   data_loga << std::setw(14) <<  "adiab.         ";
                 }
 #endif
                 data_loga << '\n';
@@ -66,6 +68,8 @@ Nyx::write_info ()
                    data_loga << std::setw(14) <<  std::setprecision(6) << rho_T_avg;
                    data_loga << std::setw(14) <<  std::setprecision(6) << T_avg;
                    data_loga << std::setw(14) <<  std::setprecision(6) << T_meanrho;
+                   data_loga << std::setw(14) <<  std::setprecision(6) << 1.0/Tinv_avg;
+                   data_loga << std::setw(14) <<  std::setprecision(6) << 0.021*(1.0+old_z)*(1.0+old_z);
                 }
 #endif
                 data_loga << '\n';
@@ -85,6 +89,8 @@ Nyx::write_info ()
                    data_loga << std::setw(14) <<  std::setprecision(6) << rho_T_avg;
                    data_loga << std::setw(14) <<  std::setprecision(6) << T_avg;
                    data_loga << std::setw(14) <<  std::setprecision(6) << T_meanrho;
+                   data_loga << std::setw(14) <<  std::setprecision(6) << 1.0/Tinv_avg;
+                   data_loga << std::setw(14) <<  std::setprecision(6) << 0.021*(1.0+new_z)*(1.0+new_z);
                 }
 #endif
                 data_loga << std::endl;
diff --git a/UsersGuide/HeatCool/NyxHeatCool.tex b/UsersGuide/HeatCool/NyxHeatCool.tex
new file mode 100644
index 00000000..90fd7b77
--- /dev/null
+++ b/UsersGuide/HeatCool/NyxHeatCool.tex
@@ -0,0 +1,37 @@
+\label{chap:HeatCool}
+
+\nyx\ provides the capability to compute local heating and cooling effects due to radiation.
+The motivation and algorithm for the heating and cooling components is documented in \cite{lukic15}, and the relevant code is located in the \texttt{Source/HeatCool} subdirectory.
+The code is activated through the \texttt{USE\_HEATCOOL=TRUE} option in the \texttt{GNUmakefile}.
+Mathematically, the heating and cooling can be described by a single ODE in each cell, to be integrated per time step $\Delta t$.
+This ODE exhibits a sensitive relationship to quantities such as temperature and free electron density, and consequently it often requires sophisticated integration techniques to compute correctly.
+
+\nyx\ provides a few different techniques for solving this ODE, which are selected via the \texttt{nyx.heat\_cool\_type} input parameter.
+One method is to use the VODE ODE solver (selected with \texttt{nyx.heat\_cool\_type=3}).
+The source code for VODE is included in the \texttt{Util/VODE} subdirectory and is compiled automatically with the rest of \nyx.
+However, while VODE is sufficient for computing this ODE correctly, it is an old Fortran code which is no longer maintained, and consequently will not easily be adapted to future high-performance computing architectures.
+
+VODE's successor is CVODE, which is a translation of the original VODE solver from Fortran to C.
+CVODE is actively developed and maintained, and is more likely to be adapted to future architectures.
+To use CVODE in \nyx, one may use the \texttt{nyx.heat\_cool\_type=5} input parameter.
+Currently the performance of VODE is slightly better because CVODE evaluates the ODE RHS one more time than VODE per coarse time step integration.
+Users should note that, while the VODE solver is compiled automatically in \nyx, CVODE must be compiled as a separate library; instructions for compiling CVODE are provided in the \amrex\ User Guide.
+To link the external CVODE solver into \nyx, one must set \texttt{USE\_HEATCOOL=TRUE} as well as \texttt{USE\_CVODE=TRUE} in the \texttt{GNUmakefile}.
+
+Finally, a third ODE integration option (which is new and \emph{\textbf{highly experimental}}) consists of using CVODE while treating groups of ODEs in different cells as a single system of coupled ODEs.
+This option can be selected with the \texttt{nyx.heat\_cool\_type=7} option.
+The purpose of this approach is to enable the evaluation of multiple RHSs simultaneously, using SIMD instructions.
+SIMD parallelism comprises a large fraction of compute performance on modern HPC architectures, and consequently, this approach can lead to a significant performance gain in the ODE integration (which is the most expensive computational kernel in \nyx).
+The number of ODEs (cells) which are computed simultaneously is chosen through the input parameter \texttt{nyx.simd\_width}.
+On Intel Xeon Phi, with 512 bit-wide SIMD instructions, an appropriate value for this parameter might be 8 or 16, or perhaps larger; the value which yields the highest performance will vary by architecture.
+However, users are cautioned that this mode remains \emph{\textbf{experimental}} and its results have not been subjected to the same level of verification as the other solver methods.
+In particular, there are three numerical tolerances, available as input parameters, which affect the convergence of the scalar vs SIMD ODE integration:
+
+\begin{itemize}
+  \item \texttt{nyx.eos\_nr\_eps}: this is the convergence criterion for the Newton-Raphson iteration which is used to evaluate the ODE RHS
+  \item \texttt{nyx.vode\_rtol}: this is the relative tolerance required for the ODE integration in VODE or CVODE
+  \item \texttt{nyx.vode\_atol\_scaled}: this is the absolute tolerance required for the ODE integration in VODE or CVODE, scaled by the initial value of the independent variable in the ODE
+\end{itemize}
+
+These variables, in particular \texttt{nyx.vode\_rtol}, have different effects depending on whether one is integrating a single ODE at a time, or a system of ODEs simultaneously.
+One should be mindful of the numerical differences which arise from these, which can be observed with the \texttt{fcompare} tool in \amrex.
diff --git a/UsersGuide/HeatCool/heatcool.bib b/UsersGuide/HeatCool/heatcool.bib
new file mode 100644
index 00000000..6ba96207
--- /dev/null
+++ b/UsersGuide/HeatCool/heatcool.bib
@@ -0,0 +1,12 @@
+@article{lukic15,
+author = {Luki\'c, Zarija and Stark, Casey W. and Nugent, Peter and White, Martin and Meiksin, Avery A. and Almgren, Ann},
+title = "{The Lyman-$\alpha$ forest in optically thin hydrodynamical simulations}",
+journal = {Monthly Notices of the Royal Astronomical Society},
+volume = {446},
+number = {4},
+pages = {3697-3724},
+year = {2015},
+doi = {10.1093/mnras/stu2377},
+URL = {http://dx.doi.org/10.1093/mnras/stu2377},
+eprint = {/oup/backfile/content_public/journal/mnras/446/4/10.1093_mnras_stu2377/2/stu2377.pdf}
+}
diff --git a/UsersGuide/NyxUserGuide.tex b/UsersGuide/NyxUserGuide.tex
index c1be1110..795870b7 100644
--- a/UsersGuide/NyxUserGuide.tex
+++ b/UsersGuide/NyxUserGuide.tex
@@ -193,6 +193,9 @@ \chapter{Gravity}
 \chapter{Dark Matter Particles}
 \input Particles/Particles.tex
 
+\chapter{Radiative Heating/Cooling}
+\input HeatCool/NyxHeatCool.tex
+
 \chapter{Active Galactic Nuclei}
 \input AGN/AGN.tex
 
@@ -205,6 +208,6 @@ \chapter{Post-processing}
 \renewcommand\bibname{References}
 \addcontentsline{toc}{chapter}{References}
 \bibliographystyle{plain}
-\bibliography{Gravity/gr,ComovingHydro/sgs,Forcing/force}
+\bibliography{Gravity/gr,ComovingHydro/sgs,Forcing/force,HeatCool/heatcool}
 
 \end{document}
diff --git a/Util/hpgmg/LICENSE b/Util/hpgmg/LICENSE
new file mode 100644
index 00000000..d8898f24
--- /dev/null
+++ b/Util/hpgmg/LICENSE
@@ -0,0 +1,23 @@
+Copyright (c) 2014, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory and UChicago Argonne, LLC.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Util/hpgmg/README b/Util/hpgmg/README
new file mode 100644
index 00000000..a26627f8
--- /dev/null
+++ b/Util/hpgmg/README
@@ -0,0 +1,2 @@
+From https://bitbucket.org/friesen/hpgmg.git
+branch make_hpgmg_lib
diff --git a/Util/hpgmg/finite-volume/README b/Util/hpgmg/finite-volume/README
new file mode 100644
index 00000000..7b365530
--- /dev/null
+++ b/Util/hpgmg/finite-volume/README
@@ -0,0 +1,31 @@
+*** Copyright Notice ***
+
+HPGMG, Copyright (c) 2014, The Regents of the University of
+California, through Lawrence Berkeley National Laboratory (subject to
+receipt of any required approvals from the U.S. Dept. of Energy).  All
+rights reserved.
+
+If you have questions about your rights to use or distribute this
+software, please contact Berkeley Lab's Technology Transfer Department
+at  TTD@lbl.gov.
+
+NOTICE.  This software is owned by the U.S. Department of Energy.  As
+such, the U.S. Government has been granted for itself and others
+acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide
+license in the Software to reproduce, prepare derivative works, and
+perform publicly and display publicly.  Beginning five (5) years after
+the date permission to assert copyright is obtained from the U.S.
+Department of Energy, and subject to any subsequent five (5) year
+renewals, the U.S. Government is granted for itself and others acting
+on its behalf a paid-up, nonexclusive, irrevocable, worldwide license
+in the Software to reproduce, prepare derivative works, distribute
+copies to the public, perform publicly and display publicly, and to
+permit others to do so.
+****************************
+
+This directory contains the current HPGMG finite-volume benchmark.
+
+Please see ./source/README for details on how to compile, run,
+optimize, and examine the output of the hpgmg finite-volume benchmark.
+
+Example job scripts are in the ./example_jobs directory
diff --git a/Util/hpgmg/finite-volume/local.mk b/Util/hpgmg/finite-volume/local.mk
new file mode 100644
index 00000000..67f20dc0
--- /dev/null
+++ b/Util/hpgmg/finite-volume/local.mk
@@ -0,0 +1 @@
+include $(call incsubdirs,source)
diff --git a/Util/hpgmg/finite-volume/source/Make.package b/Util/hpgmg/finite-volume/source/Make.package
new file mode 100644
index 00000000..d29d499f
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/Make.package
@@ -0,0 +1,5 @@
+cEXE_sources += timers.c
+cEXE_sources += level.c
+cEXE_sources += operators.7pt.c
+cEXE_sources += mg_hpgmg.c
+cEXE_sources += solvers.c
diff --git a/Util/hpgmg/finite-volume/source/Makefile b/Util/hpgmg/finite-volume/source/Makefile
new file mode 100644
index 00000000..260a5556
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/Makefile
@@ -0,0 +1,48 @@
+#CC = cc
+#CFLAGS = -g -O2 -I. -openmp -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1 -DUSE_MPI=1 -DUNLIMIT_FMG_VCYCLES=1
+#LFLAGS = -g -O2 -openmp
+
+#CC = cc
+#CFLAGS = -g -O2 -I. -fopenmp -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1  -DUSE_CHEBY=1 -DUSE_MPI=1 -DUSE_FCYCLES=1 -DUNLIMIT_FMG_VCYCLES=1
+#LFLAGS = -g -O2 -fopenmp
+
+CC = cc
+CFLAGS = -g -O0 -I. -fopenmp -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1 -DUSE_MPI=1 -DUNLIMIT_FMG_VCYCLES=1
+LFLAGS = -g -O0 -fopenmp
+
+#CC = cc
+#CFLAGS = -g -O0 -I. -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1 -DUSE_MPI=1 -DUNLIMIT_FMG_VCYCLES=1
+#LFLAGS = -g -O0
+
+#CC = cc
+#CFLAGS = -g -O2 -I. -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1 -DUSE_MPI=1
+#LFLAGS = -g -O2
+
+#CC = icc
+#CFLAGS = -g -O2 -I. -DUSE_BICGSTAB=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1
+#LFLAGS = -g -O2
+
+#CC = mpiicc
+#CFLAGS = -g -O2 -ip -xHost -I. -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1 -DUSE_MPI=1
+#LFLAGS = -g -O2 -ip -xHost
+
+#CC = mpiicc
+#CFLAGS = -g -O0 -I. -DUSE_BICGSTAB=1 -DUSE_SUBCOMM=1 -DUSE_FCYCLES=1 -DUSE_CHEBY=1 -DUSE_MPI=1 -DUSE_PERIODIC_BC=1 -DUNLIMIT_FMG_VCYCLES=1
+#LFLAGS = -g -O0
+
+OBJ = timers.o level.o operators.7pt.o mg.o solvers.o hpgmg_setup.o
+MAIN = call_hpgmg_setup.o
+
+%.o: %.c $(DEPS)
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+libhpgmg_test.a: $(OBJ)
+	ar rcs $@ $^
+
+clean:
+	$(RM) *.o *.a
+
+lib: libhpgmg_test.a
+
+all: $(OBJ) $(MAIN)
+	$(CC) -o hpgmg $^ $(LFLAGS)
diff --git a/Util/hpgmg/finite-volume/source/README b/Util/hpgmg/finite-volume/source/README
new file mode 100644
index 00000000..405f88a7
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/README
@@ -0,0 +1,250 @@
+*** Copyright Notice ***
+
+HPGMG, Copyright (c) 2014, The Regents of the University of
+California, through Lawrence Berkeley National Laboratory (subject to
+receipt of any required approvals from the U.S. Dept. of Energy).  All
+rights reserved.
+
+If you have questions about your rights to use or distribute this
+software, please contact Berkeley Lab's Technology Transfer Department
+at  TTD@lbl.gov.
+
+NOTICE.  This software is owned by the U.S. Department of Energy.  As
+such, the U.S. Government has been granted for itself and others
+acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide
+license in the Software to reproduce, prepare derivative works, and
+perform publicly and display publicly.  Beginning five (5) years after
+the date permission to assert copyright is obtained from the U.S.
+Department of Energy, and subject to any subsequent five (5) year
+renewals, the U.S. Government is granted for itself and others acting
+on its behalf a paid-up, nonexclusive, irrevocable, worldwide license
+in the Software to reproduce, prepare derivative works, distribute
+copies to the public, perform publicly and display publicly, and to
+permit others to do so.
+****************************
+
+
+About
+=====
+HPGMG is a compact benchmark designed to proxy the geometric MG solves found in
+applications built from AMR MG frameworks like CHOMBO or BoxLib.  At a high 
+level, the benchmark solves Au=f where u and f are cell-centered (finite volume) 
+3D structured grids. The operator A is a fourth order finite volume discretization 
+of the helmholtz operator (a*alpha[]*u[] - b* div beta[] grad u[]), where a and 
+b are scalar constants and alpha[] and beta[] are spatially varying coefficients.
+HPGMG supports both periodic and homogeneous dirichlet boundary conditions.  
+The benchmark generates a u_exact[] for a large cubical 3D grid partitioned into
+subdomains(boxes) which are distributed across the supercomputer.  It then 
+manually differentiates u[] to form f[], then uses a multigrid solver to 
+calculate a u[].  It may then use u_exact[] to test correctness and order.  
+By default, HPGMG solves a poisson equation (a==0) with homogeneous dirichlet 
+boundary conditions.
+
+The basic relaxation operator is Gauss-Seidel Red-Black (GSRB) which is applied 
+twice (red/black/red/black) at every level up and down the v-cycle.  HPGMG
+also includes Jacobi, L1Jacobi, Symmetric Gauss-Seidel, and Chebyshev Polynomial
+smoothers.  HPGMG implements both a truncated v-cycle (u-cycle) in which boxes 
+are restricted locally and a true distributed v-cycle in which restriction and 
+interpolation(prolongation) become true distributed operations.  The latter 
+allows the global problem to be restricted to as little as one cell.  At the 
+bottom of the v-cycle, HPGMG switches to one of a few bottom solvers (BiCGStab 
+is often used in codes like BoxLib).  Once a sufficiently accurate solution
+is obtained on this coarse grid, it is interpolated back up the v-cycle.
+
+HPGMG also includes the option to use Full Multigrid (FMG) in which one executes
+an f-cycle that should hit the discretization error in one pass (instead of 10 
+v-cycles).  This can provide a substantial performance boost, but presents a 
+number of performance optimization challenges.
+
+
+Compilation
+===========
+A simple Makefile is provided alongside this README; compiling by hand is also straightforward.
+
+There are a few basic arguments that should be used.  Most are self-explanatory.  
+
+-DUSE_MPI			// compiles the distributed (MPI) version
+-DUSE_CG			// use CG as a bottom (coarse grid) solver
+-DUSE_BICGSTAB			// use BiCGStab as a bottom (coarse grid) solver
+-DUSE_CABICGSTAB		// use CABiCGStab as a bottom (coarse grid) solver (makes more sense with U-Cycles)
+-DUSE_SUBCOMM			// build a subcommunicator for each level in the MG v-cycle to minimize the scope of MPI_AllReduce()
+
+-DUSE_FCYCLES			// use the Full Multigrid (FMG) solver... HPGMG benchmark should include this option
+				// note, the choice of FMG is orthogonal from U-Cycles and V-Cycles
+-DUSE_VCYCLES			// use true distributed V-Cycles in the multigrid solver... this is the suggested option
+-DUSE_UCYCLES			// use truncated V-Cycles (U-Cycles) in the multigrid solver... a legacy option for understanding the performance implications
+
+-DUSE_CHEBY			// use a Chebyshev Polynomial smoother (degree is specified with CHEBYSHEV_DEGREE)
+-DUSE_GSRB			// use the GSRB smoother (the number of pre/posts smooths is specified by NUM_SMOOTHS)
+-DUSE_JACOBI			// use a weighted Jacobi smoother with a weight of 2/3
+-DUSE_L1JACOBI			// use a L1 Jacobi smoother (each row's weight is the L1 norm of that row)
+
+-DBLOCKCOPY_TILE_I=###		// parallelism for all ghost zone, restriction, interpolation, and (now) operators (and eventually BC's) is organized around the (cache/thread) block concept.  
+-DBLOCKCOPY_TILE_J=###		// That is, boxes are decomposed into tiles of size BLOCKCOPY_TILE_I x BLOCKCOPY_TILE_J x BLOCKCOPY_TILE_K.  Users may tune to find the optimal block size.
+-DBLOCKCOPY_TILE_K=###		// Smaller blocks fit in cache and express more TLP (good for MIC/BGQ/GPUs/...).  However, the unit stride for small blocks is reduced (bad for CPUs which rely on prefetchers)
+				// If these are omitted, the code relies on its defaults.
+
+-DBOX_ALIGN_JSTRIDE=###		// Data allocation is now performed on a level-by-level basis (rather than block-by-block).  
+-DBOX_ALIGN_KSTRIDE=###		// In order to guarantee SIMD alignment, you can pad the unit-stride to a nice round number (e.g. 2, 4, or 8) so that j+/-1 is SIMD-aligned.
+-DBOX_ALIGN_VOLUME=###		// Similarly, you can pad the kStride (or volume) so that k+/-1 (or vector+/-1) is SIMD-aligned
+				// If these are omitted, the code relies on its defaults.
+
+-DMAX_COARSE_DIM=###		// provides a means of constraining the maximum coarse dimension.  By default, the maximum is 11 (i.e. maximum coarse grid is 11^3)
+
+
+Let us consider an example for Edison, the Cray XC30 at NERSC where the MPI compiler uses icc and is invoked as 'cc'.
+cc -Ofast -xAVX -fopenmp level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI  -DUSE_SUBCOMM -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB  -o run.edison
+Conversely, true flat MPI should omit the -fopenmp flag.
+cc -Ofast -xAVX          level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI  -DUSE_SUBCOMM -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB  -o run.edison.flat
+
+
+On Mira (an IBM BlueGene/Q), one can use the following...
+soft add +mpiwrapper-xl
+mpixlc_r -O5 -qsmp=omp:noauto level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI                -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB  -o run.bgq
+-or-
+mpixlc_r -O5 -qsmp=omp:noauto level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI                -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB  -o run.bgq
+In order to compile with IBM's HPM counters...
+mpixlc_r -O5 -qsmp=omp:noauto level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI                -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB  -o run.bgq \
+  -DUSE_HPM -I/bgsys/drivers/ppcfloor/bgpm/include -L/soft/perftools/hpctw/lib -L/bgsys/drivers/ppcfloor/bgpm/lib /bgsys/drivers/ppcfloor/bgpm/lib/libbgpm.a  -lbgpm  -lmpihpm_smp -lmpitrace
+
+On Babbage (Xeon Phi cluster at NERSC), one can use the following for native mode compilation...
+mpiicc -mmic -Ofast -fopenmp level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI  -DUSE_SUBCOMM -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB  -o run.babbage
+-or-
+mpiicc -mmic -Ofast -fopenmp level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI  -DUSE_SUBCOMM -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB  -o run.babbage
+
+
+
+4th order HPGMG-FV
+==================
+Included in this release is the 4th order Finite Volume Full Multigrid Implementation.  Unlike a 2nd order implementation, the 4th order version requires a different operator file (operators.fv4.c instead of operators.7pt.c).  Currently, it is highly recommended one use GSRB over the Chebyshev or Jacobi smoothers.  By default, most smoothers make 6 passes through the data (instead of 4) in order to provide sufficient error and residual.  This has the effect of increasing the time spent in smoothing by 50%.  
+
+Nominally, compared to the 2nd order method, the 4th order smoother performs ...
+- 4x the flops
+- 4x the MPI messages
+- 2x the MPI data movement
+- 1x the DRAM data movement
+- provides 4 more bits of accuracy for every 8x increase in the problem size (instead of 2 bits)
+
+In order to compile the older 2nd order version on Edison, one may use the following command line...
+cc  -Ofast -xAVX -fopenmp level.c operators.fv2.c  mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI  -DUSE_SUBCOMM -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB -o run.edison
+
+
+
+Running the benchmark
+=====================
+The benchmark exploits OpenMP and/or MPI for parallelism.
+You must thus set OMP_NUM_THREADS correctly.  For a machine like Edison at NERSC, this is simply 
+% export OMP_NUM_THREADS=12
+Moreover, on multisocket architectures or when using MPI, you must set affinity correctly.
+
+The benchmark takes 2 arguments.
+./run.hpgmg [log2BoxSize] [Target # of boxes per process]
+- log2BoxSize is the log base 2 of the dimension of each box on the finest grid (e.g. 6 is a good proxy for real applications)
+- the target number of boxes per process is a loose bound on memory per process
+Given these constraints, the benchmark will then calculate the largest cubical domain it can run.
+
+The code supports nested OpenMP parallelism which can be enabled by setting
+OMP_NESTED=true.  At each multigrid level, the code will try and determine
+the best balance between coarse and fine-grained parallelism.
+
+On edison, (the Cray XC30 at nersc), one uses aprun to invoke mpi jobs.  A job script may include the following...
+#PBS -l mppwidth=96
+export OMP_NUM_THREADS=12
+aprun -n 8  -d 12  -N 2  -S 1  -ss  -cc numa_node ./run.hpgmg  6  8
+This will launch 8 mpi processes (-n 8) of 12 threads (-d 12 == OMP_NUM_THREADS)
+with 2 processes per node (-N 2), 1 process per NUMA node (-S 1) with the 
+appropriate NUMA controls (-ss  -cc numa_node).  Moreover, the dimension of each
+box is 2^6 on a side (64^3) and there is a target of 8 boxes per process on the
+finest grid.  The resultant problem is thus 256^3 on the finest grid.
+
+On Mira (the IBM Blue Gene/Q at Argonne), one may use qsub to directly queue
+the benchmark.  For example...
+qsub -t 00:10:00 -n  64 --proccount   64 --mode c1  -A [ALLOCATION] --env BG_SHAREDMEMSIZE=32MB:PAMID_VERBOSE=1:BG_COREDUMPDISABLED=1:BG_SMP_FAST_WAKEUP=YES:BG_THREADLAYOUT=1:OMP_PROC_BIND=TRUE:OMP_NUM_THREADS=64:OMP_WAIT_POLICY=active:OMP_NESTED=true ./run.bgq 6 8
+Will run the benchmark on 64 processes spread over 64 nodes with 1 process per 
+node (c1) and 64 threads per process.  Each process is allocated 8 64^3 boxes. 
+At one point, the additional environment variables listed were found to 
+accelerate performance.
+
+On Babbage (the Xeon Phi Cluster at NERSC)
+mpirun.mic -n 8 -hostfile micfile.$PBS_JOBID -ppn 1 -env OMP_NUM_THREADS 120 -env KMP_AFFINITY balanced  ./run.babbage 7 1
+
+
+Understanding the Results
+=========================
+During execution, the benchmark will output some debug information for understanding convergence and performance.  
+The following is an example and examines a key subset of this information.
++ aprun -n 512 -d 12 -N 2 -S 1 -ss -cc numa_node ./run.hpgmg.edison 7 8
+
+Requested MPI_THREAD_FUNNELED, got MPI_THREAD_FUNNELED
+512 MPI Tasks of 12 threads
+truncating the v-cycle at 2^3 subdomains
+creating domain...       done
+  128 x 128 x 128 (per subdomain)
+  256 x 256 x 256 (per process)
+  2048 x 2048 x 2048 (overall)
+  1-deep ghost zones
+  allocated 1865 MB
+
+This initial output details how MPI and OpenMP were initialized.  
+Moreover, it notes how deep the v-cycle is (down to 2^3 boxes)
+It then shows the progress as it creates the structured grids noting their respective sizes and the total memory explicitly allocated with malloc().
+Thus, the 2K^3 overall problem represents 8 billion degrees of freedom.
+
+
+MGSolve...
+v-cycle= 1, norm=0.00002091903646017090 (2.091904e-05)
+v-cycle= 2, norm=0.00000079708396334668 (7.970840e-07)
+v-cycle= 3, norm=0.00000007951502395414 (7.951502e-08)
+v-cycle= 4, norm=0.00000000581619537788 (5.816195e-09)
+v-cycle= 5, norm=0.00000000048970464287 (4.897046e-10)
+v-cycle= 6, norm=0.00000000003900568126 (3.900568e-11)
+v-cycle= 7, norm=0.00000000000318039461 (3.180395e-12)
+v-cycle= 8, norm=0.00000000000025703104 (2.570310e-13)
+v-cycle= 9, norm=0.00000000000002088201 (2.088201e-14)
+v-cycle=10, norm=0.00000000000000170463 (1.704634e-15)
+v-cycle=11, norm=0.00000000000000014284 (1.428395e-16)
+done
+
+As the multigrid solver progresses, the max (inf) norm of the residual is reported after each v-cycle.
+One expects to reduce the norm by one digit on each v-cycle.  
+Thus to attain a norm less than 1e-15, we required 11 v-cycles.
+
+
+                                  0            1            2            3            4            5            6
+                              128^3         64^3         32^3         16^3          8^3          4^3          2^3        total
+smooth                     2.244879     0.288221     0.020186     0.003279     0.000672     0.000267     0.000000     2.557504
+residual                   0.569046     0.035340     0.001833     0.000328     0.000077     0.000036     0.000030     0.606691
+restriction                0.041538     0.003994     0.000310     0.000072     0.000032     0.000028     0.000000     0.045975
+interpolation              0.076533     0.006586     0.000567     0.000105     0.000038     0.000032     0.000000     0.083860
+applyOp                    0.000000     0.000000     0.000000     0.000000     0.000000     0.000000     0.001715     0.001715
+BLAS1                      0.157396     0.004949     0.000776     0.000184     0.000055     0.000027     0.014614     0.178002
+BLAS3                      0.000000     0.000000     0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
+communication              0.314615     0.069810     0.024858     0.017584     0.009740     0.005763     0.318338     0.760707
+  local exchange           0.047781     0.008262     0.001819     0.000730     0.000368     0.000233     0.001743     0.060936
+  pack MPI buffers         0.047688     0.007722     0.001089     0.000569     0.000294     0.000215     0.001630     0.059207
+  unpack MPI buffers       0.022835     0.004058     0.001226     0.000530     0.000349     0.000231     0.001712     0.030940
+  MPI_Isend                0.002422     0.002161     0.000856     0.000659     0.000779     0.000374     0.002755     0.010005
+  MPI_Irecv                0.000456     0.000402     0.000152     0.000205     0.000119     0.000079     0.000677     0.002089
+  MPI_Waitall              0.169658     0.047091     0.019666     0.014850     0.007801     0.004603     0.022721     0.286390
+  MPI_collectives          0.023637     0.000000     0.000000     0.000000     0.000000     0.000000     0.286850     0.310487
+--------------         ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Total by level             3.386964     0.404774     0.047960     0.021436     0.010595     0.006159     0.334945     4.212834
+
+  Total time in MGBuild       0.081082
+  Total time in MGSolve       4.235200
+              " v-cycles      4.213100
+      number of v-cycles            11
+Bottom solver iterations           397
+
+Finally, we see a timing report.  Vertically are the key operations within the v-cycle (communication is further broken down into its constituent operations).  Horizontally is a breakdown of time (in seconds) by level in the v-cycle.  Thus, one can see the difference in time spent in each operation at each level.  These times are totaled by level and by function.  Finally, the total time required to build the solver (note geometric multigrid solves can be built extremely quickly), the time spent in the solver, the number of v-cycles, and the total number of bottom solver (e.g. BiCGStab) iterations summed across all v-cycles are reported.
+
+We thus observe that this 8 billion DOF problem was solved in 4.23 seconds on 512 processes (6144 cores).  It required 397 BiCGStab iterations on the coarse grid spread over 11 vcycles (approx 36 iterations per v-cycle).  
+
+As the time spent smoothing the fine grid was non-trivial (2.244 seconds), one might be motivated to analyze it.
+Each box has (128+2)^3 cells including ghost zones.
+There are 8 boxes (2 2 2) per process
+Each call to smooth moves 64 bytes of data per cell per stencil sweep.
+There are 4 calls to smooth and 2 stencil sweeps per smooth in the v-cycle.
+There are 11 v-cycles.
+Thus, smooth requires one to move *at least* 8 * 130^3 * 64 * 8 * 11 bytes of data = 98.99e9 bytes.
+Moving this data in 2.244 seconds suggests each process attained an average DRAM bandwidth of 44 GB/s.  This is quite good given this was vanilla OpenMP code without optimization and one could never hope for better than 54GB/s on this machine.
diff --git a/Util/hpgmg/finite-volume/source/TODO b/Util/hpgmg/finite-volume/source/TODO
new file mode 100644
index 00000000..589d1185
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/TODO
@@ -0,0 +1,6 @@
+- cubical problem size -> rectahedral problem size ... init problem, restriction rules, etc...
+- rectahedral problem size -> arbitrary problem shape...
+- more efficient ghost zone exchange (box intersection algebra) when communicating edges and corners
+- overlap BC with exchange
+- add a VECTOR_INTERNAL
+- iterate on F's (faster for 2nd order, slower for 4th order)?
diff --git a/Util/hpgmg/finite-volume/source/call_hpgmg_setup.c b/Util/hpgmg/finite-volume/source/call_hpgmg_setup.c
new file mode 100644
index 00000000..06d9852e
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/call_hpgmg_setup.c
@@ -0,0 +1,60 @@
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+void hpgmg_setup (const int log2_box_dim,
+                  const int target_boxes_per_rank,
+                  const int OMP_Threads,
+                  const int OMP_Nested,
+                  const int requested_threading_model,
+                  const int actual_threading_model);
+
+// Driver: records the OpenMP settings, initializes MPI when built with USE_MPI,
+// and calls hpgmg_setup() with fixed log2_box_dim=6 and target_boxes_per_rank=1.
+int
+main (int argc, char *argv[])
+{
+  const int log2_box_dim = 6;
+  const int target_boxes_per_rank = 1;
+
+  int OMP_Threads = 1;
+  int OMP_Nested = 0;
+  // Declared outside USE_MPI so the hpgmg_setup() call compiles without MPI.
+  int requested_threading_model = -1;
+  int actual_threading_model = -1;
+
+#ifdef _OPENMP
+#pragma omp parallel
+  {
+#pragma omp master
+    {
+      OMP_Threads = omp_get_num_threads ();
+      OMP_Nested = omp_get_nested ();
+    }
+  }
+#endif
+
+#ifdef USE_MPI
+  // OpenMP builds request MPI_THREAD_FUNNELED; all others MPI_THREAD_SINGLE.
+#ifdef _OPENMP
+  requested_threading_model = MPI_THREAD_FUNNELED;
+#else
+  requested_threading_model = MPI_THREAD_SINGLE;
+#endif
+  MPI_Init_thread (&argc, &argv, requested_threading_model,
+                   &actual_threading_model);
+#ifdef USE_HPM                  // IBM HPM counters for BGQ...
+  HPM_Init ();
+#endif
+#endif
+
+  hpgmg_setup (log2_box_dim,
+               target_boxes_per_rank,
+               OMP_Threads,
+               OMP_Nested, requested_threading_model, actual_threading_model);
+
+  return 0;
+}
diff --git a/Util/hpgmg/finite-volume/source/compile b/Util/hpgmg/finite-volume/source/compile
new file mode 100644
index 00000000..ce96c1df
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/compile
@@ -0,0 +1,16 @@
+
+
+#=======================================================================================================================
+# mira
+#=======================================================================================================================
+soft add +mpiwrapper-xl
+qsub -t 00:10:00 -n  64 --proccount   64 --mode c1  -A PEACEndStation --env BG_SHAREDMEMSIZE=32MB:PAMID_VERBOSE=1:BG_COREDUMPDISABLED=1:BG_SMP_FAST_WAKEUP=YES:BG_THREADLAYOUT=2:OMP_PROC_BIND=TRUE:OMP_NUM_THREADS=64:OMP_WAIT_POLICY=active ./run.bgq 7 1
+qsub -t 00:10:00 -n  64 --proccount   64 --mode c1  -A PEACEndStation --env BG_SHAREDMEMSIZE=32MB:PAMID_VERBOSE=1:BG_COREDUMPDISABLED=1:BG_SMP_FAST_WAKEUP=YES:BG_THREADLAYOUT=2:OMP_PROC_BIND=TRUE:OMP_NUM_THREADS=64:OMP_WAIT_POLICY=active:OMP_NESTED=true ./run.bgq 6 8
+
+
+mpixlc_r -O5 -qsmp=omp:noauto level.c operators.fv4.c mg.c solvers.c hpgmg-fv.c timers.c -DUSE_MPI -DUSE_FCYCLES -DUSE_GSRB -DUSE_BICGSTAB -DBLOCKCOPY_TILE_K=1 -DBLOCKCOPY_TILE_J=32 -o run.bgq.1x32 -DUSE_HPM -L/soft/perftools/hpctw/lib -L/soft/perftools/bgpm/lib -lmpihpm_smp -lbgpm 
+
+
+mpirun.mic -n 8 -ppn 8 -hostfile micfile.$PBS_JOBID -env OMP_NUM_THREADS=30 -env KMP_AFFINITY=compact -env I_MPI_FABRICS=shm      -env I_MPI_PIN_DOMAIN=30 ./run.babbage.baseline 7 1
+mpirun.mic -n 8 -ppn 8 -hostfile micfile.$PBS_JOBID -env OMP_NUM_THREADS=30 -env KMP_AFFINITY=compact -env I_MPI_FABRICS=shm:ofa  -env I_MPI_PIN_DOMAIN=30 ./run.babbage.baseline 7 1
+mpirun.mic -n 8 -ppn 8 -hostfile micfile.$PBS_JOBID -env OMP_NUM_THREADS=30 -env KMP_AFFINITY=compact -env I_MPI_FABRICS=shm:dapl -env I_MPI_PIN_DOMAIN=30 ./run.babbage.baseline 7 1
diff --git a/Util/hpgmg/finite-volume/source/defines.h b/Util/hpgmg/finite-volume/source/defines.h
new file mode 100644
index 00000000..0d283514
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/defines.h
@@ -0,0 +1,27 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+// Lu = a*alpha[]*u[] - b*divergence( beta[]*gradient(u[]) )
+//------------------------------------------------------------------------------------------------------------------------------
+#ifndef DEFINES_H
+#define DEFINES_H
+// Integer ids of the reserved grid vectors (0..VECTORS_RESERVED-1); solver routines take these ids to select a vector.
+#define  VECTOR_TEMP         0 // temporary vector (presumably scratch space; its users are not visible in this file)
+#define  VECTOR_E            1 // error used in residual correction FMG
+#define  VECTOR_F_MINUS_AV   2 // cell centered residual (f-Av)
+//------------------------------------------------------------------------------------------------------------------------------
+#define  VECTOR_F            3 // original right-hand side (Au=f), cell centered
+#define  VECTOR_U            4 // numerical solution
+#define  VECTOR_ALPHA        5 // cell centered coefficient
+#define  VECTOR_BETA_I       6 // face centered coefficient (n.b. element 0 is the left face of the ghost zone element)
+#define  VECTOR_BETA_J       7 // face centered coefficient (n.b. element 0 is the back face of the ghost zone element)
+#define  VECTOR_BETA_K       8 // face centered coefficient (n.b. element 0 is the bottom face of the ghost zone element)
+//------------------------------------------------------------------------------------------------------------------
+#define  VECTOR_DINV         9 // cell centered relaxation parameter (e.g. inverse of the diagonal)
+#define  VECTOR_L1INV       10 // cell centered relaxation parameter (e.g. inverse of the L1 norm of each row)
+//------------------------------------------------------------------------------------------------------------------
+#define VECTORS_RESERVED    11 // total number of vectors and the starting location for any auxiliary bottom solver vectors
+//------------------------------------------------------------------------------------------------------------------------------
+#endif
diff --git a/Util/hpgmg/finite-volume/source/hpgmg-fv.c b/Util/hpgmg/finite-volume/source/hpgmg-fv.c
new file mode 100644
index 00000000..ff53f471
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/hpgmg-fv.c
@@ -0,0 +1,382 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Copyright Notice 
+//------------------------------------------------------------------------------------------------------------------------------
+// HPGMG, Copyright (c) 2014, The Regents of the University of
+// California, through Lawrence Berkeley National Laboratory (subject to
+// receipt of any required approvals from the U.S. Dept. of Energy).  All
+// rights reserved.
+// 
+// If you have questions about your rights to use or distribute this
+// software, please contact Berkeley Lab's Technology Transfer Department
+// at  TTD@lbl.gov.
+// 
+// NOTICE.  This software is owned by the U.S. Department of Energy.  As
+// such, the U.S. Government has been granted for itself and others
+// acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide
+// license in the Software to reproduce, prepare derivative works, and
+// perform publicly and display publicly.  Beginning five (5) years after
+// the date permission to assert copyright is obtained from the U.S.
+// Department of Energy, and subject to any subsequent five (5) year
+// renewals, the U.S. Government is granted for itself and others acting
+// on its behalf a paid-up, nonexclusive, irrevocable, worldwide license
+// in the Software to reproduce, prepare derivative works, distribute
+// copies to the public, perform publicly and display publicly, and to
+// permit others to do so.
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+#include "timers.h"
+#include "defines.h"
+#include "level.h"
+#include "mg.h"
+#include "operators.h"
+#include "solvers.h"
+//------------------------------------------------------------------------------------------------------------------------------
+// bench_hpgmg: solve on level onLevel repeatedly -- one warm-up pass, then one timed pass.
+// With USE_MPI, the timed pass is sized so that it runs for about minTime seconds.
+void bench_hpgmg(mg_type *all_grids, int onLevel, double a, double b, double dtol, double rtol){
+     int     doTiming;
+     int    minSolves = 10; // do at least minSolves MGSolves
+  double timePerSolve = 0;
+
+  for(doTiming=0;doTiming<=1;doTiming++){ // first pass warms up, second pass times
+    #ifdef USE_HPM // IBM performance counters for BGQ...
+    if( (doTiming==1) && (onLevel==0) )HPM_Start("FMGSolve()");
+    #endif
+
+    #ifdef USE_MPI
+    double minTime   = 60.0; // minimum time in seconds that the benchmark should run
+    double startTime = MPI_Wtime();
+    if( (doTiming==1) && (timePerSolve>0.0) ){ // a zero warm-up time would make minTime/timePerSolve infinite; keep the default count then
+      if((minTime/timePerSolve)>minSolves)minSolves=(minTime/timePerSolve); // if one needs to do more than minSolves to run for minTime, change minSolves
+    }
+    #endif
+
+    if(all_grids->levels[onLevel]->my_rank==0){
+      if(doTiming==0){fprintf(stdout,"\n\n===== Warming up by running %d solves ==========================================\n",minSolves);}
+                 else{fprintf(stdout,"\n\n===== Running %d solves ========================================================\n",minSolves);}
+      fflush(stdout);
+    }
+
+    int numSolves =  0; // solves completed
+    MGResetTimers(all_grids);
+    while( (numSolves<minSolves) ){
+      zero_vector(all_grids->levels[onLevel],VECTOR_U); // every solve starts again from a zeroed VECTOR_U
+      #ifdef USE_FCYCLES
+      FMGSolve(all_grids,onLevel,VECTOR_U,VECTOR_F,a,b,dtol,rtol);
+      #else
+       MGSolve(all_grids,onLevel,VECTOR_U,VECTOR_F,a,b,dtol,rtol);
+      #endif
+      numSolves++;
+    }
+
+    #ifdef USE_MPI
+    if(doTiming==0){
+      double endTime = MPI_Wtime();
+      timePerSolve = (endTime-startTime)/numSolves;
+      MPI_Bcast(&timePerSolve,1,MPI_DOUBLE,0,MPI_COMM_WORLD); // after warmup, process 0 broadcasts the average time per solve (consensus)
+    }
+    #endif
+    #ifdef USE_HPM // IBM performance counters for BGQ...
+    if( (doTiming==1) && (onLevel==0) )HPM_Stop("FMGSolve()");
+    #endif
+  }
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+int main(int argc, char **argv){
+  int my_rank=0;
+  int num_tasks=1;
+  int OMP_Threads = 1;
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  
+  #ifdef _OPENMP
+  #pragma omp parallel 
+  {
+    #pragma omp master
+    {
+      OMP_Threads = omp_get_num_threads();
+    }
+  }
+  #endif
+    
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  
+  // initialize MPI and HPM
+  #ifdef USE_MPI
+  int    actual_threading_model = -1;
+  int requested_threading_model = -1;
+      requested_threading_model = MPI_THREAD_SINGLE;
+    //requested_threading_model = MPI_THREAD_FUNNELED;
+    //requested_threading_model = MPI_THREAD_SERIALIZED;
+    //requested_threading_model = MPI_THREAD_MULTIPLE;
+    #ifdef _OPENMP
+      requested_threading_model = MPI_THREAD_FUNNELED;
+    //requested_threading_model = MPI_THREAD_SERIALIZED;
+    //requested_threading_model = MPI_THREAD_MULTIPLE;
+    #endif
+  MPI_Init_thread(&argc, &argv, requested_threading_model, &actual_threading_model);
+  MPI_Comm_size(MPI_COMM_WORLD, &num_tasks);
+  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+  #ifdef USE_HPM // IBM HPM counters for BGQ...
+  HPM_Init();
+  #endif
+  #endif // USE_MPI
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  
+  // parse the arguments...
+  int     log2_box_dim           =  6; // 64^3
+  int     target_boxes_per_rank  =  1;
+//int64_t target_memory_per_rank = -1; // not specified
+  int64_t box_dim                = -1;
+  int64_t boxes_in_i             = -1;
+  int64_t target_boxes           = -1;
+
+  if(argc==3){
+             log2_box_dim=atoi(argv[1]);
+    target_boxes_per_rank=atoi(argv[2]);
+
+    if(log2_box_dim>9){
+      // NOTE, in order to use 32b int's for array indexing, box volumes must be less than 2^31 doubles
+      if(my_rank==0){fprintf(stderr,"log2_box_dim must be less than 10\n");}
+      #ifdef USE_MPI
+      MPI_Finalize();
+      #endif
+      exit(0);
+    }
+
+    if(log2_box_dim<4){
+      if(my_rank==0){fprintf(stderr,"log2_box_dim must be at least 4\n");}
+      #ifdef USE_MPI
+      MPI_Finalize();
+      #endif
+      exit(0);
+    }
+
+    if(target_boxes_per_rank<1){
+      if(my_rank==0){fprintf(stderr,"target_boxes_per_rank must be at least 1\n");}
+      #ifdef USE_MPI
+      MPI_Finalize();
+      #endif
+      exit(0);
+    }
+
+    #ifndef MAX_COARSE_DIM
+    #define MAX_COARSE_DIM 11
+    #endif
+    box_dim=1<<log2_box_dim;
+    target_boxes = (int64_t)target_boxes_per_rank*(int64_t)num_tasks;
+    boxes_in_i = -1;
+    int64_t bi;
+    for(bi=1;bi<1000;bi++){ // search all possible problem sizes to find acceptable boxes_in_i
+      int64_t total_boxes = bi*bi*bi;
+      if(total_boxes<=target_boxes){
+        int64_t coarse_grid_dim = box_dim*bi;
+        while( (coarse_grid_dim%2) == 0){coarse_grid_dim=coarse_grid_dim/2;}
+        if(coarse_grid_dim<=MAX_COARSE_DIM){
+          boxes_in_i = bi;
+        }
+      }
+    }
+    if(boxes_in_i<1){
+      if(my_rank==0){fprintf(stderr,"failed to find an acceptable problem size\n");}
+      #ifdef USE_MPI
+      MPI_Finalize();
+      #endif
+      exit(0);
+    }
+  } // argc==3
+
+  #if 0
+  else if(argc==2){ // interpret argv[1] as target_memory_per_rank
+    char *ptr = argv[1];
+    char *tmp;
+    target_memory_per_rank = strtol(ptr,&ptr,10);
+    if(target_memory_per_rank<1){
+      if(my_rank==0){fprintf(stderr,"unrecognized target_memory_per_rank... '%s'\n",argv[1]);}
+      #ifdef USE_MPI
+      MPI_Finalize();
+      #endif
+      exit(0);
+    }
+    tmp=strstr(ptr,"TB");if(tmp){ptr=tmp+2;target_memory_per_rank *= (uint64_t)(1<<30)*(1<<10);}
+    tmp=strstr(ptr,"GB");if(tmp){ptr=tmp+2;target_memory_per_rank *= (uint64_t)(1<<30);}
+    tmp=strstr(ptr,"MB");if(tmp){ptr=tmp+2;target_memory_per_rank *= (uint64_t)(1<<20);}
+    tmp=strstr(ptr,"tb");if(tmp){ptr=tmp+2;target_memory_per_rank *= (uint64_t)(1<<30)*(1<<10);}
+    tmp=strstr(ptr,"gb");if(tmp){ptr=tmp+2;target_memory_per_rank *= (uint64_t)(1<<30);}
+    tmp=strstr(ptr,"mb");if(tmp){ptr=tmp+2;target_memory_per_rank *= (uint64_t)(1<<20);}
+    if( (ptr) && (*ptr != '\0') ){
+      if(my_rank==0){fprintf(stderr,"unrecognized units... '%s'\n",ptr);}
+      #ifdef USE_MPI
+      MPI_Finalize();
+      #endif
+      exit(0);
+    }
+    // FIX, now search for an 'acceptable' box_dim and boxes_in_i constrained by target_memory_per_rank, num_tasks, and MAX_COARSE_DIM
+  } // argc==2
+  #endif
+
+
+  else{
+    if(my_rank==0){fprintf(stderr,"usage: ./hpgmg-fv  [log2_box_dim]  [target_boxes_per_rank]\n");}
+                 //fprintf(stderr,"       ./hpgmg-fv  [target_memory_per_rank[MB,GB,TB]]\n");}
+    #ifdef USE_MPI
+    MPI_Finalize();
+    #endif
+    exit(0);
+  }
+
+
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  if(my_rank==0){
+  fprintf(stdout,"\n\n");
+  fprintf(stdout,"********************************************************************************\n");
+  fprintf(stdout,"***                            HPGMG-FV Benchmark                            ***\n");
+  fprintf(stdout,"********************************************************************************\n");
+  #ifdef USE_MPI
+       if(requested_threading_model == MPI_THREAD_MULTIPLE  )fprintf(stdout,"Requested MPI_THREAD_MULTIPLE, ");
+  else if(requested_threading_model == MPI_THREAD_SINGLE    )fprintf(stdout,"Requested MPI_THREAD_SINGLE, ");
+  else if(requested_threading_model == MPI_THREAD_FUNNELED  )fprintf(stdout,"Requested MPI_THREAD_FUNNELED, ");
+  else if(requested_threading_model == MPI_THREAD_SERIALIZED)fprintf(stdout,"Requested MPI_THREAD_SERIALIZED, ");
+  else if(requested_threading_model == MPI_THREAD_MULTIPLE  )fprintf(stdout,"Requested MPI_THREAD_MULTIPLE, ");
+  else                                                       fprintf(stdout,"Requested Unknown MPI Threading Model (%d), ",requested_threading_model);
+       if(actual_threading_model    == MPI_THREAD_MULTIPLE  )fprintf(stdout,"got MPI_THREAD_MULTIPLE\n");
+  else if(actual_threading_model    == MPI_THREAD_SINGLE    )fprintf(stdout,"got MPI_THREAD_SINGLE\n");
+  else if(actual_threading_model    == MPI_THREAD_FUNNELED  )fprintf(stdout,"got MPI_THREAD_FUNNELED\n");
+  else if(actual_threading_model    == MPI_THREAD_SERIALIZED)fprintf(stdout,"got MPI_THREAD_SERIALIZED\n");
+  else if(actual_threading_model    == MPI_THREAD_MULTIPLE  )fprintf(stdout,"got MPI_THREAD_MULTIPLE\n");
+  else                                                       fprintf(stdout,"got Unknown MPI Threading Model (%d)\n",actual_threading_model);
+  #endif
+  fprintf(stdout,"%d MPI Tasks of %d threads\n",num_tasks,OMP_Threads);
+  fprintf(stdout,"\n\n===== Benchmark setup ==========================================================\n");
+  }
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // create the fine level...
+  #ifdef USE_PERIODIC_BC
+  int bc = BC_PERIODIC;
+  int minCoarseDim = 2; // avoid problems with black box calculation of D^{-1} for poisson with periodic BC's on a 1^3 grid
+  #else
+  int bc = BC_DIRICHLET;
+  int minCoarseDim = 1; // assumes you can drop order on the boundaries
+  #endif
+  level_type level_h;
+  int ghosts=stencil_get_radius();
+  create_level(&level_h,boxes_in_i,box_dim,ghosts,VECTORS_RESERVED,bc,my_rank,num_tasks);
+  #ifdef USE_HELMHOLTZ
+  double a=1.0;double b=1.0; // Helmholtz
+  if(my_rank==0)fprintf(stdout,"  Creating Helmholtz (a=%f, b=%f) test problem\n",a,b);
+  #else
+  double a=0.0;double b=1.0; // Poisson
+  if(my_rank==0)fprintf(stdout,"  Creating Poisson (a=%f, b=%f) test problem\n",a,b);
+  #endif
+  double h=1.0/( (double)boxes_in_i*(double)box_dim );  // [0,1]^3 problem
+  initialize_problem(&level_h,h,a,b);                   // initialize VECTOR_ALPHA, VECTOR_BETA*, and VECTOR_F
+  rebuild_operator(&level_h,NULL,a,b);                  // calculate Dinv and lambda_max
+  if(level_h.boundary_condition.type == BC_PERIODIC){   // remove any constants from the RHS for periodic problems
+    double average_value_of_f = mean(&level_h,VECTOR_F);
+    if(average_value_of_f!=0.0){
+      if(my_rank==0){fprintf(stderr,"  WARNING... Periodic boundary conditions, but f does not sum to zero... mean(f)=%e\n",average_value_of_f);}
+      shift_vector(&level_h,VECTOR_F,VECTOR_F,-average_value_of_f);
+    }
+  }
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // create the MG hierarchy...
+  mg_type MG_h;
+  MGBuild(&MG_h,&level_h,a,b,minCoarseDim);             // build the Multigrid Hierarchy 
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // HPGMG-500 benchmark proper
+  // evaluate performance on problem sizes of h, 2h, and 4h
+  // (i.e. examine dynamic range for problem sizes N, N/8, and N/64)
+//double dtol=1e-15;double rtol=  0.0; // converged if ||D^{-1}(b-Ax)|| < dtol
+  double dtol=  0.0;double rtol=1e-10; // converged if ||b-Ax|| / ||b|| < rtol
+  int l;
+  #ifndef TEST_ERROR
+
+  double AverageSolveTime[3];
+  for(l=0;l<3;l++){
+    if(l>0)restriction(MG_h.levels[l],VECTOR_F,MG_h.levels[l-1],VECTOR_F,RESTRICT_CELL);
+    bench_hpgmg(&MG_h,l,a,b,dtol,rtol);
+    AverageSolveTime[l] = (double)MG_h.timers.MGSolve / (double)MG_h.MGSolves_performed;
+    if(my_rank==0){fprintf(stdout,"\n\n===== Timing Breakdown =========================================================\n");}
+    MGPrintTiming(&MG_h,l);
+  }
+
+  if(my_rank==0){
+    #ifdef CALIBRATE_TIMER
+    double _timeStart=getTime();sleep(1);double _timeEnd=getTime();
+    double SecondsPerCycle = (double)1.0/(double)(_timeEnd-_timeStart);
+    #else
+    double SecondsPerCycle = 1.0;
+    #endif
+    fprintf(stdout,"\n\n===== Performance Summary ======================================================\n");
+    for(l=0;l<3;l++){
+      double DOF = (double)MG_h.levels[l]->dim.i*(double)MG_h.levels[l]->dim.j*(double)MG_h.levels[l]->dim.k;
+      double seconds = SecondsPerCycle*(double)AverageSolveTime[l];
+      double DOFs = DOF / seconds;
+      fprintf(stdout,"  h=%0.15e  DOF=%0.15e  time=%0.6f  DOF/s=%0.3e  MPI=%d  OMP=%d\n",MG_h.levels[l]->h,DOF,seconds,DOFs,num_tasks,OMP_Threads);
+    }
+  }
+  #endif
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  if(my_rank==0){fprintf(stdout,"\n\n===== Richardson error analysis ================================================\n");}
+  // solve A^h u^h = f^h
+  // solve A^2h u^2h = f^2h
+  // solve A^4h u^4h = f^4h
+  // error analysis...
+  MGResetTimers(&MG_h);
+  for(l=0;l<3;l++){
+    if(l>0)restriction(MG_h.levels[l],VECTOR_F,MG_h.levels[l-1],VECTOR_F,RESTRICT_CELL);
+           zero_vector(MG_h.levels[l],VECTOR_U);
+    #ifdef USE_FCYCLES
+    FMGSolve(&MG_h,l,VECTOR_U,VECTOR_F,a,b,dtol,rtol);
+    #else
+     MGSolve(&MG_h,l,VECTOR_U,VECTOR_F,a,b,dtol,rtol);
+    #endif
+  }
+  richardson_error(&MG_h,0,VECTOR_U);
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  if(my_rank==0){fprintf(stdout,"\n\n===== Deallocating memory ======================================================\n");}
+  MGDestroy(&MG_h);
+  destroy_level(&level_h);
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  if(my_rank==0){fprintf(stdout,"\n\n===== Done =====================================================================\n");}
+
+  #ifdef USE_MPI
+  #ifdef USE_HPM // IBM performance counters for BGQ...
+  HPM_Print();
+  #endif
+  MPI_Finalize();
+  #endif
+  return(0);
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+}
diff --git a/Util/hpgmg/finite-volume/source/hpgmg_setup.c b/Util/hpgmg/finite-volume/source/hpgmg_setup.c
new file mode 100644
index 00000000..f5882cf3
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/hpgmg_setup.c
@@ -0,0 +1,235 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Copyright Notice
+//------------------------------------------------------------------------------------------------------------------------------
+// HPGMG, Copyright (c) 2014, The Regents of the University of
+// California, through Lawrence Berkeley National Laboratory (subject to
+// receipt of any required approvals from the U.S. Dept. of Energy).  All
+// rights reserved.
+//
+// If you have questions about your rights to use or distribute this
+// software, please contact Berkeley Lab's Technology Transfer Department
+// at  TTD@lbl.gov.
+//
+// NOTICE.  This software is owned by the U.S. Department of Energy.  As
+// such, the U.S. Government has been granted for itself and others
+// acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide
+// license in the Software to reproduce, prepare derivative works, and
+// perform publicly and display publicly.  Beginning five (5) years after
+// the date permission to assert copyright is obtained from the U.S.
+// Department of Energy, and subject to any subsequent five (5) year
+// renewals, the U.S. Government is granted for itself and others acting
+// on its behalf a paid-up, nonexclusive, irrevocable, worldwide license
+// in the Software to reproduce, prepare derivative works, distribute
+// copies to the public, perform publicly and display publicly, and to
+// permit others to do so.
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+#include "defines.h"
+#include "level.h"
+#include "mg.h"
+#include "operators.h"
+#include "solvers.h"
+//------------------------------------------------------------------------------------------------------------------------------
+// Benchmark driver: validates its inputs, picks a problem size, builds the fine level and the multigrid hierarchy,
+// runs a warm-up pass and then a timed pass of (F)MG solves, prints timing and error, and finalizes MPI when USE_MPI is defined.
+void hpgmg_setup(const int log2_box_dim,              // box edge is 1<<log2_box_dim cells; must be at least 4
+                 const int target_boxes_per_rank,     // multiplied by the number of MPI tasks to get the box budget; must be at least 1
+                 const int OMP_Threads,               // only echoed in the banner
+                 const int OMP_Nested,                // only selects which banner line is printed
+                 const int requested_threading_model, // only printed (USE_MPI builds)
+                 const int actual_threading_model) {  // only printed (USE_MPI builds)
+  int my_rank=0;
+  int num_tasks=1;
+
+  #ifdef USE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &num_tasks);
+  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+//if(actual_threading_model>requested_threading_model)actual_threading_model=requested_threading_model;
+  if(my_rank==0){
+       if(requested_threading_model == MPI_THREAD_MULTIPLE  )printf("Requested MPI_THREAD_MULTIPLE, ");
+  else if(requested_threading_model == MPI_THREAD_SINGLE    )printf("Requested MPI_THREAD_SINGLE, ");
+  else if(requested_threading_model == MPI_THREAD_FUNNELED  )printf("Requested MPI_THREAD_FUNNELED, ");
+  else if(requested_threading_model == MPI_THREAD_SERIALIZED)printf("Requested MPI_THREAD_SERIALIZED, ");
+  else                                                       printf("Requested Unknown MPI Threading Model (%d), ",requested_threading_model);
+       if(actual_threading_model    == MPI_THREAD_MULTIPLE  )printf("got MPI_THREAD_MULTIPLE\n");
+  else if(actual_threading_model    == MPI_THREAD_SINGLE    )printf("got MPI_THREAD_SINGLE\n");
+  else if(actual_threading_model    == MPI_THREAD_FUNNELED  )printf("got MPI_THREAD_FUNNELED\n");
+  else if(actual_threading_model    == MPI_THREAD_SERIALIZED)printf("got MPI_THREAD_SERIALIZED\n");
+  else                                                       printf("got Unknown MPI Threading Model (%d)\n",actual_threading_model);
+  }
+  #endif
+
+
+  if(log2_box_dim<4){
+    if(my_rank==0){printf("log2_box_dim must be at least 4\n");}
+    #ifdef USE_MPI
+    MPI_Finalize();
+    #endif
+    exit(0);
+  }
+
+  if(target_boxes_per_rank<1){
+    if(my_rank==0){printf("target_boxes_per_rank must be at least 1\n");}
+    #ifdef USE_MPI
+    MPI_Finalize();
+    #endif
+    exit(0);
+  }
+
+  if(my_rank==0){
+    if(OMP_Nested)fprintf(stdout,"%d MPI Tasks of %d threads (OMP_NESTED=TRUE)\n\n" ,num_tasks,OMP_Threads);
+             else fprintf(stdout,"%d MPI Tasks of %d threads (OMP_NESTED=FALSE)\n\n",num_tasks,OMP_Threads);
+  }
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // calculate the problem size...
+  #ifndef MAX_COARSE_DIM
+  #define MAX_COARSE_DIM 11
+  #endif
+  int64_t box_dim=1<<log2_box_dim;
+  int64_t target_boxes = (int64_t)target_boxes_per_rank*(int64_t)num_tasks;
+  int64_t boxes_in_i = -1;
+  int64_t bi;
+  for(bi=1;bi<1000;bi++){ // all possible problem sizes; the largest acceptable bi wins because later hits overwrite earlier ones
+    int64_t total_boxes = bi*bi*bi;
+    if(total_boxes<=target_boxes){
+      int64_t coarse_grid_dim = box_dim*bi;
+      while( (coarse_grid_dim%2) == 0){coarse_grid_dim=coarse_grid_dim/2;} // strip every factor of 2
+      if(coarse_grid_dim<=MAX_COARSE_DIM){
+        boxes_in_i = bi;
+      }
+    }
+  }
+  if(boxes_in_i<1){
+    if(my_rank==0){printf("failed to find an acceptable problem size\n");}
+    #ifdef USE_MPI
+    MPI_Finalize();
+    #endif
+    exit(0);
+  }
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // create the fine level...
+  #ifdef USE_PERIODIC_BC
+  int bc = BC_PERIODIC;
+  #else
+  int bc = BC_DIRICHLET;
+  #endif
+  level_type fine_grid;
+  int ghosts=stencil_get_radius();
+  create_level(&fine_grid,boxes_in_i,box_dim,ghosts,VECTORS_RESERVED,bc,my_rank,num_tasks);
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  #ifdef USE_HELMHOLTZ
+  double a=1.0;double b=1.0; // Helmholtz
+  if(my_rank==0)fprintf(stdout,"  Creating Helmholtz (a=%f, b=%f) test problem\n",a,b);
+  #else
+  double a=0.0;double b=1.0; // Poisson
+  if(my_rank==0)fprintf(stdout,"  Creating Poisson (a=%f, b=%f) test problem\n",a,b);
+  #endif
+  double h0=1.0/( (double)boxes_in_i*(double)box_dim );
+  initialize_problem(&fine_grid,h0,a,b); // calculate VECTOR_ALPHA, VECTOR_BETA, and VECTOR_UTRUE
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  if( ((a==0.0)||(dot(&fine_grid,VECTOR_ALPHA,VECTOR_ALPHA)==0.0)) && (fine_grid.boundary_condition.type == BC_PERIODIC) ){
+    // Poisson w/ periodic BC's...
+    // nominally, u shifted by any constant is still a valid solution.
+    // However, by convention, we assume u sums to zero.
+    double average_value_of_u = mean(&fine_grid,VECTOR_UTRUE);
+    if(my_rank==0){fprintf(stdout,"  average value of u_true = %20.12e... shifting u_true to ensure it sums to zero...\n",average_value_of_u);}
+    shift_vector(&fine_grid,VECTOR_UTRUE,VECTOR_UTRUE,-average_value_of_u);
+  }
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  //apply_op(&fine_grid,VECTOR_F,VECTOR_UTRUE,a,b); // by construction, f = A(u_true)
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  if(fine_grid.boundary_condition.type == BC_PERIODIC){
+    double average_value_of_f = mean(&fine_grid,VECTOR_F);
+    if(average_value_of_f!=0.0){ // only warns; the corrective shift below is commented out
+      if(my_rank==0){fprintf(stderr,"  WARNING... Periodic boundary conditions, but f does not sum to zero... mean(f)=%e\n",average_value_of_f);}
+      //shift_vector(&fine_grid,VECTOR_F,VECTOR_F,-average_value_of_f);
+    }
+  }
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  mg_type all_grids;
+  int minCoarseDim = 1;
+  rebuild_operator(&fine_grid,NULL,a,b); // i.e. calculate Dinv and lambda_max
+  MGBuild(&all_grids,&fine_grid,a,b,minCoarseDim); // build the Multigrid Hierarchy
+  double dtol=  0.0;double rtol=1e-10; // converged if ||b-Ax|| / ||b|| < rtol
+//double dtol=1e-15;double rtol=  0.0; // converged if ||D^{-1}(b-Ax)|| < dtol
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+     int     doTiming;
+     int    minSolves = 10; // do at least minSolves MGSolves
+  double timePerSolve = 0;
+  for(doTiming=0;doTiming<=1;doTiming++){ // first pass warms up, second pass times
+
+    #ifdef USE_HPM // IBM performance counters for BGQ...
+    if(doTiming)HPM_Start("FMGSolve()");
+    #endif
+
+    #ifdef USE_MPI
+    double minTime   = 30.0; // minimum time in seconds that the benchmark should run
+    double startTime = MPI_Wtime();
+    if(doTiming==1){
+      if( (timePerSolve>0.0) && ((minTime/timePerSolve)>minSolves) )minSolves=(minTime/timePerSolve); // raise minSolves so the timed pass runs for at least minTime; skipped if the warm-up measured no elapsed time (the quotient would not fit in an int)
+    }
+    #endif
+
+    if(my_rank==0){
+      if(doTiming==0){fprintf(stdout,"\n\n===== warming up by running %d solves ===============================\n",minSolves);}
+                 else{fprintf(stdout,"\n\n===== running %d solves =============================================\n",minSolves);}
+      fflush(stdout);
+    }
+
+    int numSolves =  0; // solves completed
+    MGResetTimers(&all_grids);
+    while( (numSolves<minSolves) ){
+      zero_vector(all_grids.levels[0],VECTOR_U);
+      #ifdef USE_FCYCLES
+      FMGSolve(&all_grids,0,VECTOR_U,VECTOR_F,a,b,dtol,rtol);
+      #else
+       MGSolve(&all_grids,0,VECTOR_U,VECTOR_F,a,b,dtol,rtol);
+      #endif
+      numSolves++;
+    }
+
+    #ifdef USE_MPI
+    if(doTiming==0){
+      double endTime = MPI_Wtime();
+      timePerSolve = (endTime-startTime)/numSolves;
+      MPI_Bcast(&timePerSolve,1,MPI_DOUBLE,0,MPI_COMM_WORLD); // after warmup, process 0 broadcasts the average time per solve (consensus)
+    }
+    #endif
+
+    #ifdef USE_HPM // IBM performance counters for BGQ...
+    if(doTiming)HPM_Stop("FMGSolve()");
+    #endif
+  }
+  MGPrintTiming(&all_grids); // don't include the error check in the timing results. NOTE(review): other callers pass a second (level) argument to MGPrintTiming - confirm against the prototype in mg.h
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  if(my_rank==0){fprintf(stdout,"calculating error...  ");}
+  double fine_error = error(&fine_grid,VECTOR_U,VECTOR_UTRUE);
+  if(my_rank==0){fprintf(stdout,"h = %22.15e  ||error|| = %22.15e\n\n",h0,fine_error);fflush(stdout);}
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // MGDestroy() is not called here: all_grids and fine_grid are left allocated when this function returns
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  #ifdef USE_MPI
+  #ifdef USE_HPM // IBM performance counters for BGQ...
+  HPM_Print();
+  #endif
+  MPI_Finalize();
+  #endif
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  return;
+}
diff --git a/Util/hpgmg/finite-volume/source/level.c b/Util/hpgmg/finite-volume/source/level.c
new file mode 100644
index 00000000..0bf8fa61
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/level.c
@@ -0,0 +1,1396 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+#include "timers.h"
+#include "defines.h"
+#include "level.h"
+#include "operators.h"
+//------------------------------------------------------------------------------------------------------------------------------
+void print_communicator(int printSendRecv, int rank, int level, communicator_type *comm){ // debug dump of comm to stdout; printSendRecv is a bit mask: 0x1 = send info and blocks[0], 0x2 = blocks[1], 0x4 = recv info and blocks[2]
+  int i;
+  printf("rank=%2d level=%d ",rank,level); // rank and level are only used to label the output
+  if(printSendRecv & 0x1){
+    printf("num_sends=%2d ",comm->num_sends);
+    printf("send_ranks=[ ");for(i=0;i<comm->num_sends;i++)printf("%2d ",comm->send_ranks[i]);printf("] ");
+    printf("send_sizes=[ ");for(i=0;i<comm->num_sends;i++)printf("%2d ",comm->send_sizes[i]);printf("] ");
+    printf("send_buffers=[ ");for(i=0;i<comm->num_sends;i++)printf("%08llx ",(unsigned long long)(uintptr_t)comm->send_buffers[i]);printf("] "); // %llx matches unsigned long long on every platform (%lx only matches uint64_t where that is unsigned long)
+    for(i=0;i<comm->num_blocks[0];i++)printf("[ %dx%dx%d from %d %d %d %d %d to %d %d %d %d %d ] ",comm->blocks[0][i].dim.i,comm->blocks[0][i].dim.j,comm->blocks[0][i].dim.k,comm->blocks[0][i].read.i,comm->blocks[0][i].read.j,comm->blocks[0][i].read.k,comm->blocks[0][i].read.jStride,comm->blocks[0][i].read.kStride,comm->blocks[0][i].write.i,comm->blocks[0][i].write.j,comm->blocks[0][i].write.k,comm->blocks[0][i].write.jStride,comm->blocks[0][i].write.kStride);
+    printf("\n");
+  }
+  if(printSendRecv & 0x2){
+    for(i=0;i<comm->num_blocks[1];i++)printf("[ %dx%dx%d from %d %d %d %d %d to %d %d %d %d %d ] ",comm->blocks[1][i].dim.i,comm->blocks[1][i].dim.j,comm->blocks[1][i].dim.k,comm->blocks[1][i].read.i,comm->blocks[1][i].read.j,comm->blocks[1][i].read.k,comm->blocks[1][i].read.jStride,comm->blocks[1][i].read.kStride,comm->blocks[1][i].write.i,comm->blocks[1][i].write.j,comm->blocks[1][i].write.k,comm->blocks[1][i].write.jStride,comm->blocks[1][i].write.kStride);
+    printf("\n");
+  }
+  if(printSendRecv & 0x4){
+    printf("num_recvs=%2d ",comm->num_recvs);
+    printf("recv_ranks=[ ");for(i=0;i<comm->num_recvs;i++)printf("%2d ",comm->recv_ranks[i]);printf("] ");
+    printf("recv_sizes=[ ");for(i=0;i<comm->num_recvs;i++)printf("%2d ",comm->recv_sizes[i]);printf("] ");
+    printf("recv_buffers=[ ");for(i=0;i<comm->num_recvs;i++)printf("%08llx ",(unsigned long long)(uintptr_t)comm->recv_buffers[i]);printf("] "); // same portable format as for the send buffers
+    for(i=0;i<comm->num_blocks[2];i++)printf("[ %dx%dx%d from %d %d %d %d %d to %d %d %d %d %d ] ",comm->blocks[2][i].dim.i,comm->blocks[2][i].dim.j,comm->blocks[2][i].dim.k,comm->blocks[2][i].read.i,comm->blocks[2][i].read.j,comm->blocks[2][i].read.k,comm->blocks[2][i].read.jStride,comm->blocks[2][i].read.kStride,comm->blocks[2][i].write.i,comm->blocks[2][i].write.j,comm->blocks[2][i].write.k,comm->blocks[2][i].write.jStride,comm->blocks[2][i].write.kStride);
+    printf("\n");
+  }
+  fflush(stdout); // flush right away so the dump is not left sitting in the stdio buffer
+}
+//------------------------------------------------------------------------------------------------------------------------------
+typedef struct { // record sorted by qsortGZ; NOTE(review): presumably one ghost-zone transfer from a sending box to a receiving box - confirm against the code that fills it
+  int sendRank;  // primary sort key in qsortGZ
+  int sendBoxID; // secondary sort key in qsortGZ
+  int sendBox;   // not consulted by qsortGZ
+  int sendDir;   // final sort key in qsortGZ (the direction sent)
+  int recvRank;  // not consulted by qsortGZ
+  int recvBoxID; // not consulted by qsortGZ
+  int recvBox;   // not consulted by qsortGZ
+} GZ_type;
+
+
+int qsortGZ(const void *a, const void*b){
+  // qsort()-style comparator for GZ_type records; returns -1, 0 or 1.
+  const GZ_type *lhs = (const GZ_type*)a;
+  const GZ_type *rhs = (const GZ_type*)b;
+
+  // by convention, MPI buffers are ordered by the sending rank first...
+  if(lhs->sendRank  != rhs->sendRank )return( (lhs->sendRank  < rhs->sendRank ) ? -1 : 1 );
+  // ...ties are broken by the ID of the sending box...
+  if(lhs->sendBoxID != rhs->sendBoxID)return( (lhs->sendBoxID < rhs->sendBoxID) ? -1 : 1 );
+  // ...and remaining ties by the direction sent
+  if(lhs->sendDir   != rhs->sendDir  )return( (lhs->sendDir   < rhs->sendDir  ) ? -1 : 1 );
+
+  return(0); // all three keys match
+}
+
+
+int qsortInt(const void *a, const void *b){
+  // qsort()-style comparator that orders plain ints ascending; returns -1, 0 or 1
+  const int lhs = *(const int*)a;
+  const int rhs = *(const int*)b;
+  if(lhs == rhs)return( 0);
+  return( (lhs < rhs) ? -1 : 1 );
+}
+
+int qsortBlock(const void *a, const void *b){
+  // qsort()-style comparator for blockCopy_type records; returns -1, 0 or 1.
+  // The keys come from the write side when a's write.box is non-negative,
+  // otherwise from the read side when a's read.box is non-negative.
+  // If neither holds, the two records are reported as equal.
+  const blockCopy_type *lhs = (const blockCopy_type*)a;
+  const blockCopy_type *rhs = (const blockCopy_type*)b;
+
+  if(lhs->write.box >= 0){
+    // order by the box written...
+    if(lhs->write.box != rhs->write.box)return( (lhs->write.box < rhs->write.box) ? -1 : 1 );
+    // ...then by k...
+    if(lhs->write.k   != rhs->write.k  )return( (lhs->write.k   < rhs->write.k  ) ? -1 : 1 );
+    // ...then by j...
+    if(lhs->write.j   != rhs->write.j  )return( (lhs->write.j   < rhs->write.j  ) ? -1 : 1 );
+    // ...and finally by i
+    if(lhs->write.i   != rhs->write.i  )return( (lhs->write.i   < rhs->write.i  ) ? -1 : 1 );
+    return( 0);
+  }
+
+  if(lhs->read.box >= 0){
+    // order by the box read...
+    if(lhs->read.box  != rhs->read.box )return( (lhs->read.box  < rhs->read.box ) ? -1 : 1 );
+    // ...then by k...
+    if(lhs->read.k    != rhs->read.k   )return( (lhs->read.k    < rhs->read.k   ) ? -1 : 1 );
+    // ...then by j...
+    if(lhs->read.j    != rhs->read.j   )return( (lhs->read.j    < rhs->read.j   ) ? -1 : 1 );
+    // ...and finally by i
+    if(lhs->read.i    != rhs->read.i   )return( (lhs->read.i    < rhs->read.i   ) ? -1 : 1 );
+  }
+
+  return( 0);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void decompose_level_lex(int *rank_of_box, int idim, int jdim, int kdim, int ranks){
+  // Lexicographical decomposition of the idim x jdim x kdim array of boxes (i fastest, then j, then k).
+  // Box b of `boxes` is given to rank ranks*b/boxes, so load balancing is easily realized.
+  // Unfortunately, each process will likely receive one or two long pencils of boxes,
+  // so the resulting surface:volume ratio will likely be poor.
+  const int boxes = idim*jdim*kdim;
+  int b;
+  if( (idim<1)||(jdim<1)||(kdim<1) )return; // nothing is assigned unless every dimension is positive
+  for(b=0;b<boxes;b++){ // b == k*jdim*idim + j*idim + i, visited in i-j-k order
+    // multiply in 64 bits: ranks*b can approach ranks^2, which can overflow an int
+    const uint64_t scaled = (uint64_t)ranks*(uint64_t)b;
+    rank_of_box[b] = scaled/(uint64_t)boxes;
+  }
+}
+
+
+//---------------------------------------------------------------------------------------------------------------------------------------------------
+void decompose_level_bisection_special(int *rank_of_box, int jStride, int kStride, int ilo, int jlo, int klo, int idim, int jdim, int kdim, int rank_lo, int ranks){
+  // Recursively assigns the boxes of the idim x jdim x kdim subdomain starting at (ilo,jlo,klo) to `ranks` ranks starting at rank_lo.
+  // Box (i,j,k) lives at rank_of_box[i + j*jStride + k*kStride].
+  // If a listed prime divides both the rank count and a longest dimension, the subdomain is cut into that many equal
+  // slabs (e.g. a 9^3 array is partitioned into 3 equal pieces instead of 5x9^2 and 4x9^2); if not, a longest dimension is bisected.
+  // Each process should receive a compact rectangular collection of boxes; however, load imbalance can occur.
+  // The primes are tried in the order listed (smallest first); reorder the table to try the largest first.
+
+  #define numPrimes 13
+  int primes[numPrimes] = {2,3,5,7,11,13,17,19,23,29,31,37,41};
+  int lo[3], dim[3], longest[3]; // index 0/1/2 <-> i/j/k
+  int axis,p,ff;
+
+  // base case, no further recursion: a single rank or a single box takes everything that is left
+  if( (ranks==1) || ((idim==1)&&(jdim==1)&&(kdim==1)) ){
+    int i,j,k;
+    for(k=klo;k<klo+kdim;k++){
+    for(j=jlo;j<jlo+jdim;j++){
+    for(i=ilo;i<ilo+idim;i++){
+      rank_of_box[i + j*jStride + k*kStride] = rank_lo;
+    }}}
+    return;
+  }
+
+  // gather the per-axis data, and note which axes are (possibly tied) longest
+  lo[0]=ilo;   lo[1]=jlo;   lo[2]=klo;
+  dim[0]=idim; dim[1]=jdim; dim[2]=kdim;
+  longest[0] = (idim>=jdim)&&(idim>=kdim);
+  longest[1] = (jdim>=idim)&&(jdim>=kdim);
+  longest[2] = (kdim>=idim)&&(kdim>=jdim);
+
+  // special cases for problem sizes perfectly matched to the number of processes (but not powers of 2)...
+  for(p=0;p<numPrimes;p++){
+    const int f = primes[p];
+    if( (ranks%f) != 0 )continue; // f must divide the number of ranks
+    for(axis=2;axis>=0;axis--){ // k is tried first, then j, then i
+      if( longest[axis] && ((dim[axis]%f)==0) ){
+        const int full_lo  = lo[axis];
+        const int full_dim = dim[axis];
+        for(ff=0;ff<f;ff++){ // slab ff of f gets ranks/f consecutive ranks
+          lo[axis]  = full_lo + ff*full_dim/f;
+          dim[axis] = full_dim/f;
+          decompose_level_bisection_special(rank_of_box,jStride,kStride,lo[0],lo[1],lo[2],dim[0],dim[1],dim[2],rank_lo+ff*ranks/f,ranks/f);
+        }
+        return;
+      }
+    }
+  }
+
+  // no prime fits: bisect the first longest dimension (i, then j, then k) and split the ranks in proportion
+  for(axis=0;axis<3;axis++){
+    if(longest[axis]){
+      const int whole = dim[axis];
+      const int dim0  = (int)(0.5*(double)whole + 0.50);
+      const int dim1  = whole-dim0;
+      const int r0    = (int)( 0.5 + (double)ranks*(double)dim0/(double)whole );
+      const int r1    = ranks-r0;
+      dim[axis] = dim0; // lo
+      decompose_level_bisection_special(rank_of_box,jStride,kStride,lo[0],lo[1],lo[2],dim[0],dim[1],dim[2],rank_lo   ,r0);
+      lo[axis] += dim0; dim[axis] = dim1; // hi
+      decompose_level_bisection_special(rank_of_box,jStride,kStride,lo[0],lo[1],lo[2],dim[0],dim[1],dim[2],rank_lo+r0,r1);
+      return;
+    }
+  }
+  fprintf(stderr,"decompose_level_bisection_special failed !!!\n");exit(0);
+}
+
+
+//---------------------------------------------------------------------------------------------------------------------------------------------------
+void decompose_level_bisection(int *rank_of_box, int jStride, int kStride, int ilo, int jlo, int klo, int idim, int jdim, int kdim, int ranks, int sfc_offset, int sfc_max_length){
+  // Recursively bisects the idim x jdim x kdim subdomain starting at (ilo,jlo,klo) until single boxes remain.
+  // sfc_offset is advanced by the number of boxes in every lo half that is skipped, so each box ends up with
+  // its own offset; the box is then given to rank ranks*sfc_offset/sfc_max_length.
+  // Box (i,j,k) lives at rank_of_box[i + j*jStride + k*kStride].
+  int lo[3], dim[3]; // index 0/1/2 <-> i/j/k
+  int axis;
+
+  // base case: a single box
+  if( (idim==1) && (jdim==1) && (kdim==1) ){
+    const uint64_t scaled = (uint64_t)ranks*(uint64_t)sfc_offset; // 64-bit product; sfc_max_length is the precomputed maximum length
+    rank_of_box[ilo + jlo*jStride + klo*kStride] = scaled/(uint64_t)sfc_max_length;
+    return;
+  }
+
+  lo[0]=ilo;   lo[1]=jlo;   lo[2]=klo;
+  dim[0]=idim; dim[1]=jdim; dim[2]=kdim;
+
+  // bisect the first dimension that is at least as long as the other two (i is tried first, then j, then k)
+  for(axis=0;axis<3;axis++){
+    const int len    = dim[axis];
+    const int other1 = dim[(axis+1)%3];
+    const int other2 = dim[(axis+2)%3];
+    if( (len>=other1)&&(len>=other2) ){
+      const int dim0 = (int)(0.5*(double)len + 0.50);
+      const int dim1 = len-dim0;
+      int sfc_delta;
+      // lo half
+      dim[axis] = dim0;
+      sfc_delta = dim[0]*dim[1]*dim[2]; // number of boxes in the lo half
+      decompose_level_bisection(rank_of_box,jStride,kStride,lo[0],lo[1],lo[2],dim[0],dim[1],dim[2],ranks,sfc_offset          ,sfc_max_length);
+      // hi half
+      lo[axis] += dim0;
+      dim[axis] = dim1;
+      decompose_level_bisection(rank_of_box,jStride,kStride,lo[0],lo[1],lo[2],dim[0],dim[1],dim[2],ranks,sfc_offset+sfc_delta,sfc_max_length);
+      return;
+    }
+  }
+
+  // failure...
+  fprintf(stderr,"decompose_level_bisection failed !!!\n");exit(0);
+}
+
+
+//---------------------------------------------------------------------------------------------------------------------------------------------------
+// Assign boxes to ranks along a Z-Morton Space Filling Curve (SFC) that is traced through the bounding box (ilo,jlo,klo) + (idim,jdim,kdim).
+// Only boxes that lie within the (boxes_in_i,boxes_in_j,boxes_in_k) valid region of the domain are assigned; the rest of the bounding box is skipped.
+//  rank_of_box is the output array, indexed by i + j*boxes_in_i + k*boxes_in_i*boxes_in_j
+//  ranks is the number of ranks; the box found at offset s along the curve is given rank (ranks*s)/sfc_max_length
+//  sfc_offset is the current offset within the space filling curve (starts with 0)
+//  the return value is the new offset, i.e. sfc_offset plus the number of valid boxes found within (ilo,jlo,klo) + (idim,jdim,kdim)
+//  sfc_max_length is the maximum length of the SFC.  Note, if this length exceeds boxes_in_i*boxes_in_j*boxes_in_k, then some processes will receive no work
+int decompose_level_zmort(int *rank_of_box, int boxes_in_i, int boxes_in_j, int boxes_in_k, int ilo, int jlo, int klo, int idim, int jdim, int kdim, int ranks, int sfc_offset, int sfc_max_length){
+
+  // invalid cases (empty extent or negative origin)... nothing is assigned and the offset is unchanged
+  if( (idim<1) || (jdim<1) || (kdim<1) )return(sfc_offset);
+  if( (ilo <0) || (jlo <0) || (klo <0) )return(sfc_offset);
+
+  // base case... a single box
+  if( (idim==1) && (jdim==1) && (kdim==1) ){
+    // region outside valid domain;  sfc_offset is unchanged
+    if( (ilo>=boxes_in_i) || (jlo>=boxes_in_j) || (klo>=boxes_in_k) )return(sfc_offset);
+    // deemed a valid box (could be augmented for irregular domains)
+    int slot = ilo + jlo*boxes_in_i + klo*boxes_in_i*boxes_in_j;
+    rank_of_box[slot] = ((uint64_t)ranks*(uint64_t)(sfc_offset))/(uint64_t)sfc_max_length; // sfc_max_length is the precomputed maximum length
+    return(sfc_offset+1);
+  }
+
+  // bisect in 3D and visit the 8 octants in Z order (i varies fastest, then j, then k).
+  // bits 0/1/2 of 'octant' choose the lo (0) or hi (1) half in i/j/k.  the lo half is the smaller one when an extent is odd.
+  int half_i = idim/2;
+  int half_j = jdim/2;
+  int half_k = kdim/2;
+  int octant;
+  for(octant=0;octant<8;octant++){
+    int hi_i = (octant   ) & 1;
+    int hi_j = (octant>>1) & 1;
+    int hi_k = (octant>>2) & 1;
+    sfc_offset = decompose_level_zmort(rank_of_box,boxes_in_i,boxes_in_j,boxes_in_k,
+                                       hi_i ? ilo+half_i  : ilo   , hi_j ? jlo+half_j  : jlo   , hi_k ? klo+half_k  : klo   ,
+                                       hi_i ? idim-half_i : half_i, hi_j ? jdim-half_j : half_j, hi_k ? kdim-half_k : half_k,
+                                       ranks,sfc_offset,sfc_max_length);
+  }
+  return(sfc_offset);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+//int decompose_level_hilbert(int *rank_of_box, int boxes_in_i, int boxes_in_j, int boxes_in_k, int ilo, int jlo, int klo, int idim, int jdim, int kdim, int ranks, int sfc_offset, int sfc_max_length){
+// implements a 3D hilbert curve on the non-power of two domain using a power of two bounding box
+//}
+
+
+//---------------------------------------------------------------------------------------------------------------------------------------------------
+void print_decomposition(level_type *level){
+  // Rank 0 prints the rank owning each box, one k-plane at a time from the top plane down.  Within a plane, rows run from the highest j down to j=0 and row j is indented by j spaces.
+  if(level->my_rank != 0)return;
+  const int rowStride   = level->boxes_in.i;
+  const int planeStride = level->boxes_in.i*level->boxes_in.j;
+  int col,row,plane;
+  printf("\n");
+  for(plane=level->boxes_in.k-1;plane>=0;plane--){ // (i,j,k)=(0,0,0) is bottom left corner
+    for(row=level->boxes_in.j-1;row>=0;row--){ // (i,j)=(0,0) is bottom left corner
+      for(col=0;col<row;col++)printf(" "); // indent
+      for(col=0;col<level->boxes_in.i;col++)printf("%4d ",level->rank_of_box[col + row*rowStride + plane*planeStride]);
+      printf("\n");
+    }
+    printf("\n\n");
+  }
+  fflush(stdout);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// Append the specified block (logical region of dim_i x dim_j x dim_k elements) to the list *blocks.
+// The block may be tiled into pieces of at most blockcopy_tile_i x blockcopy_tile_j x blockcopy_tile_k to...
+//  - create more parallelism across the list of blocks
+//  - limit parallelism within a block / limit the memory requirements for each block
+// The list grows on demand (*allocated_blocks = capacity, *num_blocks = length); running out of memory is fatal (exit status 1).
+#ifndef BLOCK_LIST_MIN_SIZE
+#define BLOCK_LIST_MIN_SIZE 1000
+#endif
+void append_block_to_list(blockCopy_type ** blocks, int *allocated_blocks, int *num_blocks,
+                          int dim_i, int dim_j, int dim_k,
+                          int  read_box, double*  read_ptr, int  read_i, int  read_j, int  read_k, int  read_jStride, int  read_kStride, int  read_scale,
+                          int write_box, double* write_ptr, int write_i, int write_j, int write_k, int write_jStride, int write_kStride, int write_scale,
+                          int blockcopy_tile_i, int blockcopy_tile_j, int blockcopy_tile_k, 
+                          int subtype
+                         ){
+  // Take the dim_i x dim_j x dim_k iteration space and tile it into smaller blocks of at most blockcopy_tile_i x blockcopy_tile_j x blockcopy_tile_k
+  // This increases the number of blockCopies in the list and thereby increases the thread-level parallelism
+  #if 0
+  // use recursive (z-mort) ordering of tiles in order to improve locality on deep memory hierarchies...
+  int doRecursion=0;
+  if(dim_i > blockcopy_tile_i)doRecursion=1;
+  if(dim_j > blockcopy_tile_j)doRecursion=1;
+  if(dim_k > blockcopy_tile_k)doRecursion=1;
+  if( read_scale != 1)doRecursion=0; // disable recursion for restriction
+  if(write_scale != 1)doRecursion=0; // disable recursion for interpolation
+  if(doRecursion){
+    int mid_i = (dim_i + 1)/2;
+    int mid_j = (dim_j + 1)/2;
+    int mid_k = (dim_k + 1)/2;
+        mid_i = blockcopy_tile_i*( (mid_i+blockcopy_tile_i-1)/blockcopy_tile_i);
+        mid_j = blockcopy_tile_j*( (mid_j+blockcopy_tile_j-1)/blockcopy_tile_j);
+        mid_k = blockcopy_tile_k*( (mid_k+blockcopy_tile_k-1)/blockcopy_tile_k);
+     if(mid_i>dim_i)mid_i=dim_i;
+     if(mid_j>dim_j)mid_j=dim_j;
+     if(mid_k>dim_k)mid_k=dim_k;
+    append_block_to_list(blocks,allocated_blocks,num_blocks,      mid_i,      mid_j,      mid_k,
+                          read_box, read_ptr, read_i      , read_j      , read_k      , read_jStride, read_kStride, read_scale,
+                         write_box,write_ptr,write_i      ,write_j      ,write_k      ,write_jStride,write_kStride,write_scale,
+                         blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype);
+    append_block_to_list(blocks,allocated_blocks,num_blocks,dim_i-mid_i,      mid_j,      mid_k,
+                          read_box, read_ptr, read_i+mid_i, read_j      , read_k      , read_jStride, read_kStride, read_scale,
+                         write_box,write_ptr,write_i+mid_i,write_j      ,write_k      ,write_jStride,write_kStride,write_scale,
+                         blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype);
+    append_block_to_list(blocks,allocated_blocks,num_blocks,      mid_i,dim_j-mid_j,      mid_k,
+                          read_box, read_ptr, read_i      , read_j+mid_j, read_k      , read_jStride, read_kStride, read_scale,
+                         write_box,write_ptr,write_i      ,write_j+mid_j,write_k      ,write_jStride,write_kStride,write_scale,
+                         blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype);
+    append_block_to_list(blocks,allocated_blocks,num_blocks,dim_i-mid_i,dim_j-mid_j,      mid_k,
+                          read_box, read_ptr, read_i+mid_i, read_j+mid_j, read_k      , read_jStride, read_kStride, read_scale,
+                         write_box,write_ptr,write_i+mid_i,write_j+mid_j,write_k      ,write_jStride,write_kStride,write_scale,
+                         blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype);
+    append_block_to_list(blocks,allocated_blocks,num_blocks,      mid_i,      mid_j,dim_k-mid_k,
+                          read_box, read_ptr, read_i      , read_j      , read_k+mid_k, read_jStride, read_kStride, read_scale,
+                         write_box,write_ptr,write_i      ,write_j      ,write_k+mid_k,write_jStride,write_kStride,write_scale,
+                         blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype);
+    append_block_to_list(blocks,allocated_blocks,num_blocks,dim_i-mid_i,      mid_j,dim_k-mid_k,
+                          read_box, read_ptr, read_i+mid_i, read_j      , read_k+mid_k, read_jStride, read_kStride, read_scale,
+                         write_box,write_ptr,write_i+mid_i,write_j      ,write_k+mid_k,write_jStride,write_kStride,write_scale,
+                         blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype);
+    append_block_to_list(blocks,allocated_blocks,num_blocks,      mid_i,dim_j-mid_j,dim_k-mid_k,
+                          read_box, read_ptr, read_i      , read_j+mid_j, read_k+mid_k, read_jStride, read_kStride, read_scale,
+                         write_box,write_ptr,write_i      ,write_j+mid_j,write_k+mid_k,write_jStride,write_kStride,write_scale,
+                         blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype);
+    append_block_to_list(blocks,allocated_blocks,num_blocks,dim_i-mid_i,dim_j-mid_j,dim_k-mid_k,
+                          read_box, read_ptr, read_i+mid_i, read_j+mid_j, read_k+mid_k, read_jStride, read_kStride, read_scale,
+                         write_box,write_ptr,write_i+mid_i,write_j+mid_j,write_k+mid_k,write_jStride,write_kStride,write_scale,
+                         blockcopy_tile_i,blockcopy_tile_j,blockcopy_tile_k,subtype);
+    return;
+  }
+  #endif
+  // read_/write_scale are used to stride appropriately when read and write loop iteration spaces are different
+  //   ghostZone: read_scale=1, write_scale=1   interpolation: read_scale=1, write_scale=2   restriction: read_scale=2, write_scale=1
+  // FIX... dim_i,j,k -> read_dim_i,j,k, write_dim_i,j,k
+  int tile_i = (blockcopy_tile_i > 0) ? blockcopy_tile_i : dim_i; // a tile size <= 0 would never advance the loops below...
+  int tile_j = (blockcopy_tile_j > 0) ? blockcopy_tile_j : dim_j; // so it is treated as 'do not tile this dimension'
+  int tile_k = (blockcopy_tile_k > 0) ? blockcopy_tile_k : dim_k;
+  int ii,jj,kk;
+  for(kk=0;kk<dim_k;kk+=tile_k){
+  for(jj=0;jj<dim_j;jj+=tile_j){
+  for(ii=0;ii<dim_i;ii+=tile_i){
+    int dim_k_mod = dim_k-kk;if(dim_k_mod>tile_k)dim_k_mod=tile_k; // the last tile in each dimension may be partial
+    int dim_j_mod = dim_j-jj;if(dim_j_mod>tile_j)dim_j_mod=tile_j;
+    int dim_i_mod = dim_i-ii;if(dim_i_mod>tile_i)dim_i_mod=tile_i;
+    if(*num_blocks >= *allocated_blocks){ // the list is full (or was never allocated)... grow it
+      int oldSize = *allocated_blocks;
+      if(oldSize == 0){*allocated_blocks=BLOCK_LIST_MIN_SIZE;*blocks=(blockCopy_type*) malloc(                 (*allocated_blocks)*sizeof(blockCopy_type));}
+                  else{*allocated_blocks*=2;                 *blocks=(blockCopy_type*)realloc((void*)(*blocks),(*allocated_blocks)*sizeof(blockCopy_type));}
+      if(*blocks == NULL){fprintf(stderr,"realloc failed - append_block_to_list (%d -> %d)\n",oldSize,*allocated_blocks);exit(1);} // fatal... exit with a failure status
+    }
+    blockCopy_type *entry = &(*blocks)[(*num_blocks)++]; // claim the next free entry
+    entry->subtype       = subtype;
+    entry->dim.i         = dim_i_mod;
+    entry->dim.j         = dim_j_mod;
+    entry->dim.k         = dim_k_mod;
+    entry->read.box      = read_box;
+    entry->read.ptr      = read_ptr;
+    entry->read.i        = read_i + read_scale*ii;
+    entry->read.j        = read_j + read_scale*jj;
+    entry->read.k        = read_k + read_scale*kk;
+    entry->read.jStride  = read_jStride;
+    entry->read.kStride  = read_kStride;
+    entry->write.box     = write_box;
+    entry->write.ptr     = write_ptr;
+    entry->write.i       = write_i + write_scale*ii;
+    entry->write.j       = write_j + write_scale*jj;
+    entry->write.k       = write_k + write_scale*kk;
+    entry->write.jStride = write_jStride;
+    entry->write.kStride = write_kStride;
+  }}}
+}
+
+
+//----------------------------------------------------------------------------------------------------------------------------------------------------
+// Create a mini program (a list of blocks) that traverses the domain boundary intersecting with this process's boxes.
+// Each of the 26 ghost zone regions (faces, edges, and corners) of each of my boxes that lies outside of the domain is
+// appended (tiled) to level->boundary_condition.blocks[shape], with the normal to the domain recorded as the block's subtype.
+//   level : boundary_condition.{blocks,num_blocks,allocated_blocks}[shape] are reset (a previous list is not freed here) and then filled
+//   shape : stencil shape the list is built for... STENCIL_SHAPE_STAR skips edges and corners, STENCIL_SHAPE_NO_CORNERS skips corners
+// Directions and normals share one encoding... 13 + di + 3*dj + 9*dk with di,dj,dk in {-1,0,+1}, so 13 is the box itself.
+// A region whose neighboring box lies inside the domain produces no block.
+// For periodic boundary conditions the list is left empty.
+void build_boundary_conditions(level_type *level, int shape){
+  // start with an empty list... this is also the result for periodic (i.e. no BC's)
+  level->boundary_condition.blocks[shape]           = NULL;
+  level->boundary_condition.num_blocks[shape]       = 0;
+  level->boundary_condition.allocated_blocks[shape] = 0;
+  if(level->boundary_condition.type == BC_PERIODIC)return;
+
+  // which of the 27 directions are edges / corners of a box
+  int   isEdge[27] = {0,1,0,1,0,1,0,1,0,  1,0,1,0,0,0,1,0,1,  0,1,0,1,0,1,0,1,0};
+  int isCorner[27] = {1,0,1,0,0,0,1,0,1,  0,0,0,0,0,0,0,0,0,  1,0,1,0,0,0,1,0,1};
+
+  // visit every (box,direction) pair... dir ascends exactly as nested dk/dj/di loops (di fastest) would enumerate it
+  int box,dir;
+  for(box=0;box<level->num_my_boxes;box++){ // traverse my list of boxes...
+    // coordinates of this box, counted in boxes
+    const int myBox_i = level->my_boxes[box].low.i / level->box_dim;
+    const int myBox_j = level->my_boxes[box].low.j / level->box_dim;
+    const int myBox_k = level->my_boxes[box].low.k / level->box_dim;
+
+    for(dir=0;dir<27;dir++){ // for each box, examine its 26 neighbors... dir is the face/edge/corner of *THIS* box (not the domain)
+      // decode dir into di/dj/dk
+      const int di = ((dir % 3)  )-1;
+      const int dj = ((dir % 9)/3)-1;
+      const int dk = ((dir / 9)  )-1;
+
+      // determine if this region (box's di,dj,dk ghost zone) is outside of the domain
+      const int neighborBox_i = myBox_i + di;
+      const int neighborBox_j = myBox_j + dj;
+      const int neighborBox_k = myBox_k + dk;
+      const int below_i = ( neighborBox_i < 0 ), above_i = ( neighborBox_i >= level->boxes_in.i );
+      const int below_j = ( neighborBox_j < 0 ), above_j = ( neighborBox_j >= level->boxes_in.j );
+      const int below_k = ( neighborBox_k < 0 ), above_k = ( neighborBox_k >= level->boxes_in.k );
+      int regionIsOutside = ( below_i || below_j || below_k || above_i || above_j || above_k );
+
+      // normal effectively defines the normal vector to the *DOMAIN* for this region (it stays 13 when nothing is outside)...
+      // this addition is necessary for linearly interpolated BC's as a box's corner is not necessarily a domain's corner
+      const int normal = 13 + (above_i-below_i) + 3*(above_j-below_j) + 9*(above_k-below_k);
+
+      // calculate ghost zone region size and coordinates relative to the first non-ghost zone element (0,0,0)
+      const int   dim_i = (di==0) ? level->box_dim : level->box_ghosts;
+      const int   dim_j = (dj==0) ? level->box_dim : level->box_ghosts;
+      const int   dim_k = (dk==0) ? level->box_dim : level->box_ghosts;
+      const int block_i = (di<0) ? 0-level->box_ghosts : ( (di>0) ? 0+level->box_dim : 0 );
+      const int block_j = (dj<0) ? 0-level->box_ghosts : ( (dj>0) ? 0+level->box_dim : 0 );
+      const int block_k = (dk<0) ? 0-level->box_ghosts : ( (dk>0) ? 0+level->box_dim : 0 );
+
+      // use regionIsOutside to short circuit logic and cull unnecessary regions...
+      if(      shape==STENCIL_SHAPE_STAR       ){if(isEdge[dir]||isCorner[dir])regionIsOutside=0;} // star-shaped stencils don't need BC's enforced on corners or edges
+      else if( shape==STENCIL_SHAPE_NO_CORNERS ){if(             isCorner[dir])regionIsOutside=0;} // these stencils don't need BC's enforced on corners
+
+      // default tile sizes...
+      // NOTE, BC's may never tile smaller than the ghost zone depth
+      int blockcopy_i = (BLOCKCOPY_TILE_I < level->box_ghosts) ? level->box_ghosts : BLOCKCOPY_TILE_I;
+      int blockcopy_j = (BLOCKCOPY_TILE_J < level->box_ghosts) ? level->box_ghosts : BLOCKCOPY_TILE_J;
+      int blockcopy_k = (BLOCKCOPY_TILE_K < level->box_ghosts) ? level->box_ghosts : BLOCKCOPY_TILE_K;
+
+      // (disabled) alternative per-direction tile sizes...
+      #if 0
+      // 2D tiling of faces
+      // 1D tiling of edges
+      // corners use defaults
+      switch(dir){
+        case  1:blockcopy_i=    8;blockcopy_j=10000;blockcopy_k=10000;break; //  i edge
+        case  3:blockcopy_i=10000;blockcopy_j=    8;blockcopy_k=10000;break; //  j edge
+        case  4:blockcopy_i=    8;blockcopy_j=    8;blockcopy_k=10000;break; // ij face
+        case  5:blockcopy_i=10000;blockcopy_j=    8;blockcopy_k=10000;break; //  j edge
+        case  7:blockcopy_i=    8;blockcopy_j=10000;blockcopy_k=10000;break; //  i edge
+
+        case  9:blockcopy_i=10000;blockcopy_j=10000;blockcopy_k=    8;break; //  k edge
+        case 10:blockcopy_i=    8;blockcopy_j=10000;blockcopy_k=    8;break; // ik face
+        case 11:blockcopy_i=10000;blockcopy_j=10000;blockcopy_k=    8;break; //  k edge
+        case 12:blockcopy_i=10000;blockcopy_j=    8;blockcopy_k=    8;break; // jk face
+
+        case 14:blockcopy_i=10000;blockcopy_j=    8;blockcopy_k=    8;break; // jk face
+        case 15:blockcopy_i=10000;blockcopy_j=10000;blockcopy_k=    8;break; //  k edge
+        case 16:blockcopy_i=    8;blockcopy_j=10000;blockcopy_k=    8;break; // ik face
+        case 17:blockcopy_i=10000;blockcopy_j=10000;blockcopy_k=    8;break; //  k edge
+
+        case 19:blockcopy_i=    8;blockcopy_j=10000;blockcopy_k=10000;break; //  i edge
+        case 21:blockcopy_i=10000;blockcopy_j=    8;blockcopy_k=10000;break; //  j edge
+        case 22:blockcopy_i=    8;blockcopy_j=    8;blockcopy_k=10000;break; // ij face
+        case 23:blockcopy_i=10000;blockcopy_j=    8;blockcopy_k=10000;break; //  j edge
+        case 25:blockcopy_i=    8;blockcopy_j=10000;blockcopy_k=10000;break; //  i edge
+      }
+      #endif
+
+      if(!regionIsOutside)continue; // nothing to enforce for this region
+      // read and write describe the same region of the same box
+      append_block_to_list(&(level->boundary_condition.blocks[shape]),&(level->boundary_condition.allocated_blocks[shape]),&(level->boundary_condition.num_blocks[shape]),
+        /* dim.i         = */ dim_i,
+        /* dim.j         = */ dim_j,
+        /* dim.k         = */ dim_k,
+        /* read.box      = */ box,
+        /* read.ptr      = */ NULL,
+        /* read.i        = */ block_i,
+        /* read.j        = */ block_j,
+        /* read.k        = */ block_k,
+        /* read.jStride  = */ level->my_boxes[box].jStride,
+        /* read.kStride  = */ level->my_boxes[box].kStride,
+        /* read.scale    = */ 1,
+        /* write.box     = */ box,
+        /* write.ptr     = */ NULL,
+        /* write.i       = */ block_i,
+        /* write.j       = */ block_j,
+        /* write.k       = */ block_k,
+        /* write.jStride = */ level->my_boxes[box].jStride,
+        /* write.kStride = */ level->my_boxes[box].kStride,
+        /* write.scale   = */ 1,
+        /* blockcopy_i   = */ blockcopy_i,
+        /* blockcopy_j   = */ blockcopy_j,
+        /* blockcopy_k   = */ blockcopy_k,
+        /* subtype       = */ normal
+      );
+    }
+  }
+
+  #ifdef BLOCK_SPATIAL_SORT
+  // sort all the resultant blocks by box,k,j,i (good locality)
+  qsort(level->boundary_condition.blocks[shape],level->boundary_condition.num_blocks[shape],sizeof(blockCopy_type),qsortBlock);
+  #endif
+}
+
+//----------------------------------------------------------------------------------------------------------------------------------------------------
+// create a mini program that packs data into MPI send buffers, exchanges local data, and unpacks the MPI recv buffers
+//   broadly speaking... 
+//   1. traverse my list of Boxes and create a list of ghosts that must be sent
+//   2. create a list of neighbors to send to
+//   3. allocate and populate the pack list and allocate the send buffers
+//   4. allocate and populate the local exchange list
+//   5. traverse my list of Boxes and create a list of ghosts that must be received
+//   6. create a list of neighbors to receive from
+//   7. allocate and populate the unpack list and allocate the recv buffers
+//
+//   thus a ghost zone exchange is
+//   1. prepost a Irecv for each MPI recv buffer (1 per neighbor)
+//   2. traverse the pack list
+//   3. post the Isends for each MPI send buffer (1 per neighbor)
+//   4. traverse the local copy list
+//   5. waitall
+//   6. traverse the unpack list
+//
+//     / 24 25 26 /
+//    / 21 22 23 /	(k+1)
+//   / 18 19 20 /
+//
+//     / 15 16 17 /
+//    / 12 13 14 /	(k)
+//   /  9 10 11 /
+//
+//     /  6  7  8 /
+//    /  3  4  5 /	(k-1)
+//   /  0  1  2 /
+//
+void build_exchange_ghosts(level_type *level, int shape){
+  int    faces[27] = {0,0,0,0,1,0,0,0,0,  0,1,0,1,0,1,0,1,0,  0,0,0,0,1,0,0,0,0};
+  int    edges[27] = {0,1,0,1,0,1,0,1,0,  1,0,1,0,0,0,1,0,1,  0,1,0,1,0,1,0,1,0};
+  int  corners[27] = {1,0,1,0,0,0,1,0,1,  0,0,0,0,0,0,0,0,0,  1,0,1,0,0,0,1,0,1};
+
+  // initialize to defaults...
+  level->exchange_ghosts[shape].num_recvs           = 0;
+  level->exchange_ghosts[shape].num_sends           = 0;
+  level->exchange_ghosts[shape].recv_ranks          = NULL;
+  level->exchange_ghosts[shape].send_ranks          = NULL;
+  level->exchange_ghosts[shape].recv_sizes          = NULL;
+  level->exchange_ghosts[shape].send_sizes          = NULL;
+  level->exchange_ghosts[shape].recv_buffers        = NULL;
+  level->exchange_ghosts[shape].send_buffers        = NULL;
+  level->exchange_ghosts[shape].blocks[0]           = NULL;
+  level->exchange_ghosts[shape].blocks[1]           = NULL;
+  level->exchange_ghosts[shape].blocks[2]           = NULL;
+  level->exchange_ghosts[shape].num_blocks[0]       = 0;
+  level->exchange_ghosts[shape].num_blocks[1]       = 0;
+  level->exchange_ghosts[shape].num_blocks[2]       = 0;
+  level->exchange_ghosts[shape].allocated_blocks[0] = 0;
+  level->exchange_ghosts[shape].allocated_blocks[1] = 0;
+  level->exchange_ghosts[shape].allocated_blocks[2] = 0;
+  #ifdef USE_MPI
+  level->exchange_ghosts[shape].requests            = NULL;
+  level->exchange_ghosts[shape].status              = NULL;
+  #endif
+
+  int    n,CommunicateThisDir[27];for(n=0;n<27;n++)CommunicateThisDir[n] = faces[n] + edges[n] + corners[n];// to be safe, communicate everything
+  switch(shape){
+    case STENCIL_SHAPE_BOX:       for(n=0;n<27;n++)CommunicateThisDir[n] = faces[n] + edges[n] + corners[n];break;
+    case STENCIL_SHAPE_STAR:      for(n=0;n<27;n++)CommunicateThisDir[n] = faces[n]                        ;break;
+    case STENCIL_SHAPE_NO_CORNERS:for(n=0;n<27;n++)CommunicateThisDir[n] = faces[n] + edges[n]             ;break;
+  }
+
+  int sendBox,recvBox;
+  int stage;
+  int _rank;
+  int ghost,numGhosts,numGhostsRemote;
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // traverse my list of boxes and create a lists of neighboring boxes and neighboring ranks
+  GZ_type *ghostsToSend = (GZ_type*)malloc(26*level->num_my_boxes*sizeof(GZ_type)); // There are at most 26 neighbors per box.
+         int *sendRanks = (    int*)malloc(26*level->num_my_boxes*sizeof(    int)); // There are at most 26 neighbors per box.
+  if(level->num_my_boxes>0){
+  if(ghostsToSend == NULL){fprintf(stderr,"malloc failed - build_exchange_ghosts/ghostsToSend\n");exit(0);}
+  if(sendRanks    == NULL){fprintf(stderr,"malloc failed - build_exchange_ghosts/sendRanks   \n");exit(0);}
+  }
+  numGhosts       = 0;
+  numGhostsRemote = 0;
+  for(sendBox=0;sendBox<level->num_my_boxes;sendBox++){
+    int di,dj,dk;
+    for(dk=-1;dk<=1;dk++){
+    for(dj=-1;dj<=1;dj++){
+    for(di=-1;di<=1;di++){
+      int dir = 13+di+3*dj+9*dk;if(CommunicateThisDir[dir]){
+      int       myBoxID = level->my_boxes[sendBox].global_box_id;
+      int       myBox_i = level->my_boxes[sendBox].low.i / level->box_dim;
+      int       myBox_j = level->my_boxes[sendBox].low.j / level->box_dim;
+      int       myBox_k = level->my_boxes[sendBox].low.k / level->box_dim;
+      int neighborBoxID = -1;
+      if(level->boundary_condition.type == BC_PERIODIC){
+        int neighborBox_i = (  myBox_i + di + level->boxes_in.i) % level->boxes_in.i;
+        int neighborBox_j = (  myBox_j + dj + level->boxes_in.j) % level->boxes_in.j;
+        int neighborBox_k = (  myBox_k + dk + level->boxes_in.k) % level->boxes_in.k;
+            neighborBoxID =  neighborBox_i + neighborBox_j*level->boxes_in.i + neighborBox_k*level->boxes_in.i*level->boxes_in.j;
+      }else{
+        int neighborBox_i = (  myBox_i + di );
+        int neighborBox_j = (  myBox_j + dj );
+        int neighborBox_k = (  myBox_k + dk );
+        if( (neighborBox_i>=0) && (neighborBox_i<level->boxes_in.i) && 
+            (neighborBox_j>=0) && (neighborBox_j<level->boxes_in.j) && 
+            (neighborBox_k>=0) && (neighborBox_k<level->boxes_in.k) ){  // i.e. the neighbor is a valid box
+            neighborBoxID =  neighborBox_i + neighborBox_j*level->boxes_in.i + neighborBox_k*level->boxes_in.i*level->boxes_in.j;
+        }
+      }
+      if(neighborBoxID>=0){
+      if( level->rank_of_box[neighborBoxID] != -1 ){
+        ghostsToSend[numGhosts].sendRank  = level->my_rank;
+        ghostsToSend[numGhosts].sendBoxID = myBoxID;
+        ghostsToSend[numGhosts].sendBox   = sendBox;
+        ghostsToSend[numGhosts].sendDir   = dir;
+        ghostsToSend[numGhosts].recvRank  = level->rank_of_box[neighborBoxID];
+        ghostsToSend[numGhosts].recvBoxID = neighborBoxID;
+        ghostsToSend[numGhosts].recvBox   = -1;
+        if( level->rank_of_box[neighborBoxID] != level->my_rank ){
+          sendRanks[numGhostsRemote++] = level->rank_of_box[neighborBoxID];
+        }else{
+          int recvBox=0;while(level->my_boxes[recvBox].global_box_id!=neighborBoxID)recvBox++; // search my list of boxes for the appropriate recvBox index
+          ghostsToSend[numGhosts].recvBox   = recvBox;
+        }
+        numGhosts++;
+      }}
+    }}}}
+  }
+  // sort boxes by sendRank(==my rank) then by sendBoxID... ensures the sends and receive buffers are always sorted by sendBoxID...
+  qsort(ghostsToSend,numGhosts      ,sizeof(GZ_type),qsortGZ );
+  // sort the lists of neighboring ranks and remove duplicates...
+  qsort(sendRanks   ,numGhostsRemote,sizeof(    int),qsortInt);
+  int numSendRanks=0;_rank=-1;for(ghost=0;ghost<numGhostsRemote;ghost++)if(sendRanks[ghost] != _rank){_rank=sendRanks[ghost];sendRanks[numSendRanks++]=sendRanks[ghost];}
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // in a two-stage process, traverse the list of ghosts and allocate the pack/local lists as well as the MPI buffers, and then populate the pack/local lists
+  level->exchange_ghosts[shape].num_sends     =                  numSendRanks;
+  level->exchange_ghosts[shape].send_ranks    =     (int*)malloc(numSendRanks*sizeof(int));
+  level->exchange_ghosts[shape].send_sizes    =     (int*)malloc(numSendRanks*sizeof(int));
+  level->exchange_ghosts[shape].send_buffers  = (double**)malloc(numSendRanks*sizeof(double*));
+  if(numSendRanks>0){
+  if(level->exchange_ghosts[shape].send_ranks  ==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].send_ranks\n",shape);exit(0);}
+  if(level->exchange_ghosts[shape].send_sizes  ==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].send_sizes\n",shape);exit(0);}
+  if(level->exchange_ghosts[shape].send_buffers==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].send_buffers\n",shape);exit(0);}
+  }
+  level->exchange_ghosts[shape].blocks[0] = NULL;
+  level->exchange_ghosts[shape].blocks[1] = NULL;
+  level->exchange_ghosts[shape].num_blocks[0] = 0;
+  level->exchange_ghosts[shape].num_blocks[1] = 0;
+  level->exchange_ghosts[shape].allocated_blocks[0] = 0;
+  level->exchange_ghosts[shape].allocated_blocks[1] = 0;
+  for(stage=0;stage<=1;stage++){
+    // stage=0... traverse the list and calculate the buffer sizes
+    // stage=1... allocate MPI send buffers, traverse the list, and populate the unpack/local lists...
+    int neighbor;
+    for(neighbor=0;neighbor<numSendRanks;neighbor++){
+      if(stage==1){
+             level->exchange_ghosts[shape].send_buffers[neighbor] = (double*)malloc(level->exchange_ghosts[shape].send_sizes[neighbor]*sizeof(double));
+          if(level->exchange_ghosts[shape].send_sizes[neighbor]>0)
+          if(level->exchange_ghosts[shape].send_buffers[neighbor]==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].send_buffers[neighbor]\n",shape);exit(0);}
+      memset(level->exchange_ghosts[shape].send_buffers[neighbor],                0,level->exchange_ghosts[shape].send_sizes[neighbor]*sizeof(double));
+      }
+      level->exchange_ghosts[shape].send_ranks[neighbor]=sendRanks[neighbor];
+      level->exchange_ghosts[shape].send_sizes[neighbor]=0;
+    }
+    for(ghost=0;ghost<numGhosts;ghost++){
+      int  dim_i=-1, dim_j=-1, dim_k=-1;
+      int send_i=-1,send_j=-1,send_k=-1;
+      int recv_i=-1,recv_j=-1,recv_k=-1;
+  
+      // decode ghostsToSend[ghost].sendDir (direction sent) into di/dj/dk 
+      int di = ((ghostsToSend[ghost].sendDir % 3)  )-1;
+      int dj = ((ghostsToSend[ghost].sendDir % 9)/3)-1;
+      int dk = ((ghostsToSend[ghost].sendDir / 9)  )-1;
+      switch(di){ // direction relative to sender
+        case -1:send_i=0;                               dim_i=level->box_ghosts;recv_i=  level->box_dim;   break;
+        case  0:send_i=0;                               dim_i=level->box_dim;   recv_i=0;                  break;
+        case  1:send_i=level->box_dim-level->box_ghosts;dim_i=level->box_ghosts;recv_i=0-level->box_ghosts;break;
+      }
+      switch(dj){ // direction relative to sender
+        case -1:send_j=0;                               dim_j=level->box_ghosts;recv_j=  level->box_dim;   break;
+        case  0:send_j=0;                               dim_j=level->box_dim;   recv_j=0;                  break;
+        case  1:send_j=level->box_dim-level->box_ghosts;dim_j=level->box_ghosts;recv_j=0-level->box_ghosts;break;
+      }
+      switch(dk){ // direction relative to sender
+        case -1:send_k=0;                               dim_k=level->box_ghosts;recv_k=  level->box_dim;   break;
+        case  0:send_k=0;                               dim_k=level->box_dim;   recv_k=0;                  break;
+        case  1:send_k=level->box_dim-level->box_ghosts;dim_k=level->box_ghosts;recv_k=0-level->box_ghosts;break;
+      }
+ 
+      // determine if this ghost requires a pack or local exchange 
+      int LocalExchange; // 0 = pack list, 1 = local exchange list
+      if(ghostsToSend[ghost].recvRank != level->my_rank){
+        LocalExchange=0; // pack
+        neighbor=0;while(level->exchange_ghosts[shape].send_ranks[neighbor] != ghostsToSend[ghost].recvRank)neighbor++;
+      }else{
+        LocalExchange=1; // local
+        neighbor=-1;
+      }
+   
+      if(stage==1){ 
+      if(LocalExchange) // append to the local exchange list...
+      append_block_to_list(&(level->exchange_ghosts[shape].blocks[1]),&(level->exchange_ghosts[shape].allocated_blocks[1]),&(level->exchange_ghosts[shape].num_blocks[1]),
+        /* dim.i         = */ dim_i,
+        /* dim.j         = */ dim_j,
+        /* dim.k         = */ dim_k,
+        /* read.box      = */ ghostsToSend[ghost].sendBox,
+        /* read.ptr      = */ NULL,
+        /* read.i        = */ send_i,
+        /* read.j        = */ send_j,
+        /* read.k        = */ send_k,
+        /* read.jStride  = */ level->my_boxes[ghostsToSend[ghost].sendBox].jStride,
+        /* read.kStride  = */ level->my_boxes[ghostsToSend[ghost].sendBox].kStride,
+        /* read.scale    = */ 1,
+        /* write.box     = */ ghostsToSend[ghost].recvBox,
+        /* write.ptr     = */ NULL,
+        /* write.i       = */ recv_i,
+        /* write.j       = */ recv_j,
+        /* write.k       = */ recv_k,
+        /* write.jStride = */ level->my_boxes[ghostsToSend[ghost].recvBox].jStride,
+        /* write.kStride = */ level->my_boxes[ghostsToSend[ghost].recvBox].kStride,
+        /* write.scale   = */ 1,
+        /* blockcopy_i   = */ BLOCKCOPY_TILE_I, // default
+        /* blockcopy_j   = */ BLOCKCOPY_TILE_J, // default
+        /* blockcopy_k   = */ BLOCKCOPY_TILE_K, // default
+        /* subtype       = */ 0  
+      );
+      else // append to the MPI pack list...
+      append_block_to_list(&(level->exchange_ghosts[shape].blocks[0]),&(level->exchange_ghosts[shape].allocated_blocks[0]),&(level->exchange_ghosts[shape].num_blocks[0]),
+        /* dim.i         = */ dim_i,
+        /* dim.j         = */ dim_j,
+        /* dim.k         = */ dim_k,
+        /* read.box      = */ ghostsToSend[ghost].sendBox,
+        /* read.ptr      = */ NULL,
+        /* read.i        = */ send_i,
+        /* read.j        = */ send_j,
+        /* read.k        = */ send_k,
+        /* read.jStride  = */ level->my_boxes[ghostsToSend[ghost].sendBox].jStride,
+        /* read.kStride  = */ level->my_boxes[ghostsToSend[ghost].sendBox].kStride,
+        /* read.scale    = */ 1,
+        /* write.box     = */ -1,
+        /* write.ptr     = */ level->exchange_ghosts[shape].send_buffers[neighbor], // NOTE, 1. count _sizes, 2. allocate _buffers, 3. populate blocks
+        /* write.i       = */ level->exchange_ghosts[shape].send_sizes[neighbor], // current offset in the MPI send buffer
+        /* write.j       = */ 0,
+        /* write.k       = */ 0,
+        /* write.jStride = */ dim_i,       // contiguous block
+        /* write.kStride = */ dim_i*dim_j, // contiguous block
+        /* write.scale   = */ 1,
+        /* blockcopy_i   = */ BLOCKCOPY_TILE_I, // default
+        /* blockcopy_j   = */ BLOCKCOPY_TILE_J, // default
+        /* blockcopy_k   = */ BLOCKCOPY_TILE_K, // default
+        /* subtype       = */ 0  
+      );}
+      if(neighbor>=0)level->exchange_ghosts[shape].send_sizes[neighbor]+=dim_i*dim_j*dim_k;
+    } // ghost for-loop
+  } // stage for-loop
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // free temporary storage...
+  free(ghostsToSend);
+  free(sendRanks);
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // traverse my list of boxes and create a lists of neighboring boxes and neighboring ranks
+  GZ_type *ghostsToRecv = (GZ_type*)malloc(26*level->num_my_boxes*sizeof(GZ_type)); // There are at most 26 neighbors per box.
+         int *recvRanks = (    int*)malloc(26*level->num_my_boxes*sizeof(    int)); // There are at most 26 neighbors per box.
+  if(level->num_my_boxes>0){
+  if(ghostsToRecv == NULL){fprintf(stderr,"malloc failed - build_exchange_ghosts/ghostsToRecv\n");exit(0);}
+  if(recvRanks    == NULL){fprintf(stderr,"malloc failed - build_exchange_ghosts/recvRanks   \n");exit(0);}
+  }
+  numGhosts       = 0;
+  numGhostsRemote = 0;
+  for(recvBox=0;recvBox<level->num_my_boxes;recvBox++){
+    int di,dj,dk;
+    for(dk=-1;dk<=1;dk++){
+    for(dj=-1;dj<=1;dj++){
+    for(di=-1;di<=1;di++){
+      int dir = 13+di+3*dj+9*dk;if(CommunicateThisDir[dir]){
+      int       myBoxID = level->my_boxes[recvBox].global_box_id;
+      int       myBox_i = level->my_boxes[recvBox].low.i / level->box_dim;
+      int       myBox_j = level->my_boxes[recvBox].low.j / level->box_dim;
+      int       myBox_k = level->my_boxes[recvBox].low.k / level->box_dim;
+      int neighborBoxID = -1;
+      if(level->boundary_condition.type == BC_PERIODIC){
+        int neighborBox_i = (  myBox_i + di + level->boxes_in.i) % level->boxes_in.i;
+        int neighborBox_j = (  myBox_j + dj + level->boxes_in.j) % level->boxes_in.j;
+        int neighborBox_k = (  myBox_k + dk + level->boxes_in.k) % level->boxes_in.k;
+            neighborBoxID =  neighborBox_i + neighborBox_j*level->boxes_in.i + neighborBox_k*level->boxes_in.i*level->boxes_in.j;
+      }else{
+        int neighborBox_i = (  myBox_i + di );
+        int neighborBox_j = (  myBox_j + dj );
+        int neighborBox_k = (  myBox_k + dk );
+        if( (neighborBox_i>=0) && (neighborBox_i<level->boxes_in.i) && 
+            (neighborBox_j>=0) && (neighborBox_j<level->boxes_in.j) && 
+            (neighborBox_k>=0) && (neighborBox_k<level->boxes_in.k) ){  // i.e. the neighbor is a valid box
+            neighborBoxID =  neighborBox_i + neighborBox_j*level->boxes_in.i + neighborBox_k*level->boxes_in.i*level->boxes_in.j;
+        }
+      }
+      if(neighborBoxID>=0){
+      if( (level->rank_of_box[neighborBoxID] != -1) && (level->rank_of_box[neighborBoxID] != level->my_rank)  ){
+        ghostsToRecv[numGhosts].sendRank  = level->rank_of_box[neighborBoxID];
+        ghostsToRecv[numGhosts].sendBoxID = neighborBoxID;
+        ghostsToRecv[numGhosts].sendBox   = -1;
+        ghostsToRecv[numGhosts].sendDir   = 26-dir;
+        ghostsToRecv[numGhosts].recvRank  = level->my_rank;
+        ghostsToRecv[numGhosts].recvBoxID = myBoxID;
+        ghostsToRecv[numGhosts].recvBox   = recvBox;
+                     numGhosts++;
+        recvRanks[numGhostsRemote++] = level->rank_of_box[neighborBoxID];
+      }}
+    }}}}
+  }
+  // sort boxes by sendRank then by sendBoxID... ensures the recvs and receive buffers are always sorted by sendBoxID...
+  qsort(ghostsToRecv,numGhosts      ,sizeof(GZ_type),qsortGZ );
+  // sort the lists of neighboring ranks and remove duplicates...
+  qsort(recvRanks   ,numGhostsRemote,sizeof(    int),qsortInt);
+  int numRecvRanks=0;_rank=-1;for(ghost=0;ghost<numGhostsRemote;ghost++)if(recvRanks[ghost] != _rank){_rank=recvRanks[ghost];recvRanks[numRecvRanks++]=recvRanks[ghost];}
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // in a two-stage process, traverse the list of ghosts and allocate the unpack lists as well as the MPI buffers, and then populate the unpack list
+  level->exchange_ghosts[shape].num_recvs     =                  numRecvRanks;
+  level->exchange_ghosts[shape].recv_ranks    =     (int*)malloc(numRecvRanks*sizeof(int));
+  level->exchange_ghosts[shape].recv_sizes    =     (int*)malloc(numRecvRanks*sizeof(int));
+  level->exchange_ghosts[shape].recv_buffers  = (double**)malloc(numRecvRanks*sizeof(double*));
+  if(numRecvRanks>0){
+  if(level->exchange_ghosts[shape].recv_ranks  ==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].recv_ranks\n",shape);exit(0);}
+  if(level->exchange_ghosts[shape].recv_sizes  ==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].recv_sizes\n",shape);exit(0);}
+  if(level->exchange_ghosts[shape].recv_buffers==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].recv_buffers\n",shape);exit(0);}
+  }
+  level->exchange_ghosts[shape].blocks[2] = NULL;
+  level->exchange_ghosts[shape].num_blocks[2] = 0;
+  level->exchange_ghosts[shape].allocated_blocks[2] = 0;
+  for(stage=0;stage<=1;stage++){
+    // stage=0... traverse the list and calculate the buffer sizes
+    // stage=1... allocate MPI recv buffers, traverse the list, and populate the unpack/local lists...
+    int neighbor;
+    for(neighbor=0;neighbor<numRecvRanks;neighbor++){
+      if(stage==1){
+             level->exchange_ghosts[shape].recv_buffers[neighbor] = (double*)malloc(level->exchange_ghosts[shape].recv_sizes[neighbor]*sizeof(double));
+          if(level->exchange_ghosts[shape].recv_sizes[neighbor]>0)
+          if(level->exchange_ghosts[shape].recv_buffers[neighbor]==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].recv_buffers[neighbor]\n",shape);exit(0);}
+      memset(level->exchange_ghosts[shape].recv_buffers[neighbor],                0,level->exchange_ghosts[shape].recv_sizes[neighbor]*sizeof(double));
+      }
+      level->exchange_ghosts[shape].recv_ranks[neighbor]=recvRanks[neighbor];
+      level->exchange_ghosts[shape].recv_sizes[neighbor]=0;
+    }
+    for(ghost=0;ghost<numGhosts;ghost++){
+      int  dim_i=-1, dim_j=-1, dim_k=-1;
+    //int send_i=-1,send_j=-1,send_k=-1;
+      int recv_i=-1,recv_j=-1,recv_k=-1;
+  
+      // decode ghostsToRecv[ghost].sendDir (direction sent) into di/dj/dk 
+      int di = ((ghostsToRecv[ghost].sendDir % 3)  )-1;
+      int dj = ((ghostsToRecv[ghost].sendDir % 9)/3)-1;
+      int dk = ((ghostsToRecv[ghost].sendDir / 9)  )-1;
+      switch(di){ // direction relative to sender
+        case -1:dim_i=level->box_ghosts;recv_i=  level->box_dim;   break;
+        case  0:dim_i=level->box_dim;   recv_i=0;                  break;
+        case  1:dim_i=level->box_ghosts;recv_i=0-level->box_ghosts;break;
+      }
+      switch(dj){ // direction relative to sender
+        case -1:dim_j=level->box_ghosts;recv_j=  level->box_dim;   break;
+        case  0:dim_j=level->box_dim;   recv_j=0;                  break;
+        case  1:dim_j=level->box_ghosts;recv_j=0-level->box_ghosts;break;
+      }
+      switch(dk){ // direction relative to sender
+        case -1:dim_k=level->box_ghosts;recv_k=  level->box_dim;   break;
+        case  0:dim_k=level->box_dim;   recv_k=0;                  break;
+        case  1:dim_k=level->box_ghosts;recv_k=0-level->box_ghosts;break;
+      }
+ 
+      // determine if this ghost requires a pack or local exchange 
+      neighbor=0;while(level->exchange_ghosts[shape].recv_ranks[neighbor] != ghostsToRecv[ghost].sendRank)neighbor++;
+      if(stage==1)append_block_to_list(&(level->exchange_ghosts[shape].blocks[2]),&(level->exchange_ghosts[shape].allocated_blocks[2]),&(level->exchange_ghosts[shape].num_blocks[2]),
+      /*dim.i         = */ dim_i,
+      /*dim.j         = */ dim_j,
+      /*dim.k         = */ dim_k,
+      /*read.box      = */ -1,
+      /*read.ptr      = */ level->exchange_ghosts[shape].recv_buffers[neighbor], // NOTE, 1. count _sizes, 2. allocate _buffers, 3. populate blocks
+      /*read.i        = */ level->exchange_ghosts[shape].recv_sizes[neighbor], // current offset in the MPI recv buffer
+      /*read.j        = */ 0,
+      /*read.k        = */ 0,
+      /*read.jStride  = */ dim_i,       // contiguous block
+      /*read.kStride  = */ dim_i*dim_j, // contiguous block
+      /*read.scale    = */ 1,
+      /*write.box     = */ ghostsToRecv[ghost].recvBox,
+      /*write.ptr     = */ NULL,
+      /*write.i       = */ recv_i,
+      /*write.j       = */ recv_j,
+      /*write.k       = */ recv_k,
+      /*write.jStride = */ level->my_boxes[ghostsToRecv[ghost].recvBox].jStride,
+      /*write.kStride = */ level->my_boxes[ghostsToRecv[ghost].recvBox].kStride,
+      /*write.scale   = */ 1,
+      /* blockcopy_i  = */ BLOCKCOPY_TILE_I, // default
+      /* blockcopy_j  = */ BLOCKCOPY_TILE_J, // default
+      /* blockcopy_k  = */ BLOCKCOPY_TILE_K, // default
+      /* subtype      = */ 0  
+      );
+      if(neighbor>=0)level->exchange_ghosts[shape].recv_sizes[neighbor]+=dim_i*dim_j*dim_k;
+    } // ghost for-loop
+  } // stage for-loop
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // free temporary storage...
+  free(ghostsToRecv);
+  free(recvRanks);
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // malloc MPI requests/status arrays
+  #ifdef USE_MPI
+  level->exchange_ghosts[shape].requests = (MPI_Request*)malloc((level->exchange_ghosts[shape].num_sends+level->exchange_ghosts[shape].num_recvs)*sizeof(MPI_Request));
+  level->exchange_ghosts[shape].status   = (MPI_Status *)malloc((level->exchange_ghosts[shape].num_sends+level->exchange_ghosts[shape].num_recvs)*sizeof(MPI_Status ));
+  if((level->exchange_ghosts[shape].num_sends+level->exchange_ghosts[shape].num_recvs)>0){
+  if(level->exchange_ghosts[shape].requests==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].requests\n",shape);exit(0);}
+  if(level->exchange_ghosts[shape].status  ==NULL){fprintf(stderr,"malloc failed - exchange_ghosts[%d].status\n",shape);exit(0);}
+  }
+  #endif
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  #ifdef BLOCK_SPATIAL_SORT
+  // sort all the resultant blocks by box,k,j,i (good locality)
+  qsort(level->exchange_ghosts[shape].blocks[0],level->exchange_ghosts[shape].num_blocks[0],sizeof(blockCopy_type),qsortBlock);
+  qsort(level->exchange_ghosts[shape].blocks[1],level->exchange_ghosts[shape].num_blocks[1],sizeof(blockCopy_type),qsortBlock);
+  qsort(level->exchange_ghosts[shape].blocks[2],level->exchange_ghosts[shape].num_blocks[2],sizeof(blockCopy_type),qsortBlock);
+  #endif
+}
+
+
+
+//---------------------------------------------------------------------------------------------------------------------------------------------------
+// Grow the level's vector storage to numVectors vectors: (re)allocate the contiguous FP data and point level->vectors[] at each vector (one pointer per vector is useful for bulk copies to/from accelerators),
+// then point each owned box's vectors[] at its segment of every vector (useful for box-relative operators) and (re)fill the per-box metadata (dim, ghosts, strides, volume, low, global_box_id).
+// If level->numVectors > 0, the old level->numVectors vectors are copied into the new storage and the old storage is freed.  Returns immediately when numVectors <= level->numVectors.
+void create_vectors(level_type *level, int numVectors){
+  if(numVectors <= level->numVectors)return; // already have enough space (nothing below runs in that case, including the box metadata setup)
+  double          * old_vectors_base = level->vectors_base; // save a pointer to the originally allocated data for subsequent free()
+  double               * old_vector0 = NULL; // stays NULL when no vectors have been allocated yet
+  if(level->numVectors>0)old_vector0 = level->vectors[0];   // save a pointer to old FP data to copy
+
+
+  // calculate the padded size of each box: (box_dim+2*box_ghosts) cells per dimension, with each stride rounded up to a multiple of its BOX_ALIGN_* constant...
+  level->box_jStride =                    (level->box_dim+2*level->box_ghosts);while(level->box_jStride % BOX_ALIGN_JSTRIDE)level->box_jStride++; // pencil
+  level->box_kStride = level->box_jStride*(level->box_dim+2*level->box_ghosts);while(level->box_kStride % BOX_ALIGN_KSTRIDE)level->box_kStride++; // plane
+  level->box_volume  = level->box_kStride*(level->box_dim+2*level->box_ghosts);while(level->box_volume  % BOX_ALIGN_VOLUME )level->box_volume++;  // volume
+
+
+  #define VECTOR_MALLOC_BULK
+  #ifdef  VECTOR_MALLOC_BULK
+    // allocate one aligned, double-precision array and divide it among vectors (this path is always taken because of the #define directly above)...
+    uint64_t malloc_size = (uint64_t)numVectors*level->num_my_boxes*level->box_volume*sizeof(double) + 4096; // +4096 bytes: presumably slack for the alignment shift of tmpbuf below
+    level->vectors_base = (double*)malloc(malloc_size);
+    if((numVectors>0)&&(level->vectors_base==NULL)){fprintf(stderr,"malloc failed - level->vectors_base\n");exit(0);}
+    double * tmpbuf = level->vectors_base; // tmpbuf = aligned start of the FP data; vectors_base is kept unshifted for free()
+    while( (uint64_t)(tmpbuf+level->box_ghosts*(1+level->box_jStride+level->box_kStride)) & 0xff ){tmpbuf++;} // align first *non-ghost* zone element of first component to a 256-Byte boundary
+    uint64_t ofs;
+    #ifdef _OPENMP
+    #pragma omp parallel for
+    #endif
+    for(ofs=0;ofs<(uint64_t)numVectors*level->num_my_boxes*level->box_volume;ofs++){tmpbuf[ofs]=0.0;} // zero all vectors.  Faster in MPI+OpenMP environments, but not NUMA-aware
+    // if there is existing FP data... copy it, then free old data and pointer array
+    if(level->numVectors>0){
+      memcpy(tmpbuf,old_vector0,(uint64_t)level->numVectors*level->num_my_boxes*level->box_volume*sizeof(double)); // FIX... omp thread ???
+      if(old_vectors_base)free(old_vectors_base); // free old data...
+    }
+    // allocate an array of pointers which point to the union of boxes for each vector
+    // NOTE, this requires just one copyin per vector to an accelerator rather than requiring one copyin per box per vector
+    if(level->numVectors>0)free(level->vectors); // free any previously allocated vector array
+    level->vectors = (double **)malloc(numVectors*sizeof(double*));
+    if((numVectors>0)&&(level->vectors==NULL)){fprintf(stderr,"malloc failed - level->vectors\n");exit(0);}
+    uint64_t c;for(c=0;c<numVectors;c++){level->vectors[c] = tmpbuf + (uint64_t)c*level->num_my_boxes*level->box_volume;} // vector c starts c*(num_my_boxes*box_volume) doubles into the aligned buffer
+  #else
+    // allocate vectors individually (simple, but may cause conflict misses).  Not compiled while VECTOR_MALLOC_BULK is defined above.
+    double ** old_vectors = level->vectors;
+    level->vectors = (double **)malloc(numVectors*sizeof(double*));
+    uint64_t c;
+    for(c=                0;c<level->numVectors;c++){level->vectors[c] = old_vectors[c];}
+    for(c=level->numVectors;c<       numVectors;c++){
+      level->vectors[c] = (double*)malloc((uint64_t)level->num_my_boxes*level->box_volume*sizeof(double));
+      uint64_t ofs;
+      #ifdef _OPENMP
+      #pragma omp parallel for
+      #endif
+      for(ofs=0;ofs<(uint64_t)level->num_my_boxes*level->box_volume;ofs++){level->vectors[c][ofs]=0.0;} // Faster in MPI+OpenMP environments, but not NUMA-aware
+    }
+    free(old_vectors);
+  #endif
+
+
+  // build the list of boxes: walk every box of the level (i fastest, then j, then k) and fill in my_boxes[] for the ones owned by this rank...
+  int box=0;
+  int i,j,k;
+  for(k=0;k<level->boxes_in.k;k++){
+  for(j=0;j<level->boxes_in.j;j++){
+  for(i=0;i<level->boxes_in.i;i++){
+    int jStride = level->boxes_in.i;
+    int kStride = level->boxes_in.i*level->boxes_in.j;
+    int b=i + j*jStride + k*kStride; // global box id (index into rank_of_box)
+    if(level->rank_of_box[b]==level->my_rank){
+      if(level->numVectors>0)free(level->my_boxes[box].vectors); // free previously allocated vector array
+      level->my_boxes[box].vectors = (double **)malloc(numVectors*sizeof(double*));
+      if((numVectors>0)&&(level->my_boxes[box].vectors==NULL)){fprintf(stderr,"malloc failed - level->my_boxes[box].vectors\n");exit(0);}
+      uint64_t c;for(c=0;c<numVectors;c++){level->my_boxes[box].vectors[c] = level->vectors[c] + (uint64_t)box*level->box_volume;} // this box's segment of vector c
+      level->my_boxes[box].numVectors = numVectors;
+      level->my_boxes[box].dim        = level->box_dim;
+      level->my_boxes[box].ghosts     = level->box_ghosts;
+      level->my_boxes[box].jStride    = level->box_jStride;
+      level->my_boxes[box].kStride    = level->box_kStride;
+      level->my_boxes[box].volume     = level->box_volume;
+      level->my_boxes[box].low.i      = i*level->box_dim;
+      level->my_boxes[box].low.j      = j*level->box_dim;
+      level->my_boxes[box].low.k      = k*level->box_dim;
+      level->my_boxes[box].global_box_id = b;
+      box++;
+  }}}}
+
+  // level now has created/initialized vector FP data
+  level->numVectors = numVectors;
+}
+
+
+//---------------------------------------------------------------------------------------------------------------------------------------------------
+// Create a cubic level of boxes_in_i^3 boxes (each box_dim^3 cells plus box_ghosts ghost cells per side): populate level_type, assign every box to a rank, allocate vector storage, and build the
+// auxiliary structures (block list, red/black mask, ghost-exchange and boundary-condition programs).  box_ghosts must be >= stencil_get_radius(); otherwise (and on malloc failure) the process exits.
+// numVectors is an estimate of the number of vectors needed in this level; more can be added later via create_vectors().  comm becomes the level's MPI_COMM_ALLREDUCE communicator under USE_MPI.
+void create_level(level_type *level, int boxes_in_i, int box_dim, int box_ghosts, int numVectors, int domain_boundary_condition, int my_rank, int num_ranks, const MPI_Comm comm){
+  int box;
+  int TotalBoxes = boxes_in_i*boxes_in_i*boxes_in_i;
+
+  if(my_rank==0){
+  //if(domain_boundary_condition==BC_DIRICHLET)fprintf(stdout,"\nattempting to create a %d^3 level (with Dirichlet BC) using a %d^3 grid of %d^3 boxes and %d tasks...\n",box_dim*boxes_in_i,boxes_in_i,box_dim,num_ranks);
+  //if(domain_boundary_condition==BC_PERIODIC )fprintf(stdout,"\nattempting to create a %d^3 level (with Periodic BC) using a %d^3 grid of %d^3 boxes and %d tasks...\n", box_dim*boxes_in_i,boxes_in_i,box_dim,num_ranks);
+                                               fprintf(stdout,"\nattempting to create a %d^3 level from %d x %d^3 boxes distributed among %d tasks...\n", box_dim*boxes_in_i,TotalBoxes,box_dim,num_ranks);
+    if(domain_boundary_condition==BC_DIRICHLET)fprintf(stdout,"  boundary condition = BC_DIRICHLET\n");
+    if(domain_boundary_condition==BC_PERIODIC )fprintf(stdout,"  boundary condition = BC_PERIODIC\n");
+
+  }
+
+  int omp_threads = 1;
+
+  #ifdef _OPENMP
+  #pragma omp parallel 
+  {
+    #pragma omp master
+    {
+      omp_threads = omp_get_num_threads();
+    }
+  }
+  #endif
+
+  if(box_ghosts < stencil_get_radius() ){
+    if(my_rank==0)fprintf(stderr,"ghosts(%d) must be >= stencil_get_radius(%d)\n",box_ghosts,stencil_get_radius());
+    exit(0);
+  }
+
+  level->box_dim        = box_dim;
+  level->box_ghosts     = box_ghosts;
+  level->numVectors     = 0; // no vectors have been allocated yet
+  level->vectors_base   = NULL; // pointer returned by bulk malloc
+  level->vectors        = NULL; // pointers to individual vectors
+  level->boxes_in.i     = boxes_in_i;
+  level->boxes_in.j     = boxes_in_i;
+  level->boxes_in.k     = boxes_in_i;
+  level->dim.i          = box_dim*level->boxes_in.i;
+  level->dim.j          = box_dim*level->boxes_in.j;
+  level->dim.k          = box_dim*level->boxes_in.k;
+  level->active         = 1;
+  level->my_rank        = my_rank;
+  level->num_ranks      = num_ranks;
+  level->boundary_condition.type = domain_boundary_condition;
+  level->must_subtract_mean = -1;
+  level->num_threads      = omp_threads;
+  level->my_blocks        = NULL;
+  level->num_my_blocks    = 0;
+  level->allocated_blocks = 0;
+  level->tag              = log2(level->dim.i);
+  level->fluxes           = NULL;
+
+
+  // allocate 3D array of integers to hold the MPI rank of the corresponding box and initialize to -1 (unassigned)
+     level->rank_of_box = (int*)malloc(level->boxes_in.i*level->boxes_in.j*level->boxes_in.k*sizeof(int));
+  if(level->rank_of_box==NULL){fprintf(stderr,"malloc of level->rank_of_box failed\n");exit(0);}
+  for(box=0;box<level->boxes_in.i*level->boxes_in.j*level->boxes_in.k;box++){level->rank_of_box[box]=-1;}  // -1 denotes that there is no actual box assigned to this region
+
+
+  // parallelize the level (i.e. assign a process rank to each box)...
+  #ifdef DECOMPOSE_LEX
+  // lexicographical ordering... good load balance, potentially high bisection bandwidth requirements, bad surface:volume ratio when #boxes/proc is large
+  if(my_rank==0){fprintf(stdout,"  Decomposing level via lexicographical ordering... ");fflush(stdout);}
+  decompose_level_lex(level->rank_of_box,level->boxes_in.i,level->boxes_in.j,level->boxes_in.k,num_ranks);
+  #elif DECOMPOSE_BISECTION_SPECIAL
+  // recursive partitioning by primes
+  if(my_rank==0){fprintf(stdout,"  Decomposing level via partitioning by primes... ");fflush(stdout);}
+  decompose_level_bisection_special(level->rank_of_box,level->boxes_in.i,level->boxes_in.i*level->boxes_in.j,0,0,0,level->boxes_in.i,level->boxes_in.j,level->boxes_in.k,0,num_ranks);
+  #elif DECOMPOSE_BISECTION
+  // recursive bisection
+  if(my_rank==0){fprintf(stdout,"  Decomposing level via recursive bisection... ");fflush(stdout);}
+  decompose_level_bisection(level->rank_of_box,level->boxes_in.i,level->boxes_in.i*level->boxes_in.j,0,0,0,level->boxes_in.i,level->boxes_in.j,level->boxes_in.k,num_ranks,0,level->boxes_in.i*level->boxes_in.j*level->boxes_in.k);
+  #else//#elif DECOMPOSE_ZMORT
+  if(my_rank==0){fprintf(stdout,"  Decomposing level via Z-mort ordering... ");fflush(stdout);}
+  #if 0 // Z-Mort over a power of two bounding box skipping boxes outside the domain
+  int idim_padded=1;while(idim_padded<level->boxes_in.i)idim_padded*=2;
+  int jdim_padded=1;while(jdim_padded<level->boxes_in.j)jdim_padded*=2;
+  int kdim_padded=1;while(kdim_padded<level->boxes_in.k)kdim_padded*=2;
+  #else // Z-Mort over the valid domain with odd-sized base cases (i.e. zmort on 3x3)
+  int idim_padded=level->boxes_in.i;
+  int jdim_padded=level->boxes_in.j;
+  int kdim_padded=level->boxes_in.k;
+  #endif
+  decompose_level_zmort(level->rank_of_box,level->boxes_in.i,level->boxes_in.j,level->boxes_in.k,0,0,0,idim_padded,jdim_padded,kdim_padded,num_ranks,0,level->boxes_in.i*level->boxes_in.j*level->boxes_in.k);
+  #endif
+  if(my_rank==0){fprintf(stdout,"done\n");fflush(stdout);}
+//print_decomposition(level);// for debug purposes only
+
+
+  // calculate how many boxes I own...
+  level->num_my_boxes=0;
+  for(box=0;box<level->boxes_in.i*level->boxes_in.j*level->boxes_in.k;box++){if(level->rank_of_box[box]==level->my_rank)level->num_my_boxes++;} 
+  level->my_boxes = (box_type*)malloc(level->num_my_boxes*sizeof(box_type));
+  if((level->num_my_boxes>0)&&(level->my_boxes==NULL)){fprintf(stderr,"malloc failed - create_level/level->my_boxes\n");exit(0);}
+
+
+  // allocate flattened vector FP data and create pointers (create_vectors also fills in the per-box metadata read below)...
+  if(my_rank==0){fprintf(stdout,"  Allocating vectors... ");fflush(stdout);}
+  create_vectors(level,numVectors);
+  if(my_rank==0){fprintf(stdout,"done\n");fflush(stdout);}
+
+
+  // Build an auxiliary data structure that flattens boxes into blocks (each box's interior is appended with the default tile sizes)...
+  for(box=0;box<level->num_my_boxes;box++){
+    int blockcopy_i = BLOCKCOPY_TILE_I;
+    int blockcopy_j = BLOCKCOPY_TILE_J;
+    int blockcopy_k = BLOCKCOPY_TILE_K;
+
+    append_block_to_list(&(level->my_blocks),&(level->allocated_blocks),&(level->num_my_blocks),
+      /* dim.i         = */ level->my_boxes[box].dim,
+      /* dim.j         = */ level->my_boxes[box].dim,
+      /* dim.k         = */ level->my_boxes[box].dim,
+      /* read.box      = */ box,
+      /* read.ptr      = */ NULL,
+      /* read.i        = */ 0,
+      /* read.j        = */ 0,
+      /* read.k        = */ 0,
+      /* read.jStride  = */ level->my_boxes[box].jStride,
+      /* read.kStride  = */ level->my_boxes[box].kStride,
+      /* read.scale    = */ 1,
+      /* write.box     = */ box,
+      /* write.ptr     = */ NULL,
+      /* write.i       = */ 0,
+      /* write.j       = */ 0,
+      /* write.k       = */ 0,
+      /* write.jStride = */ level->my_boxes[box].jStride,
+      /* write.kStride = */ level->my_boxes[box].kStride,
+      /* write.scale   = */ 1,
+      /* blockcopy_i   = */ blockcopy_i,
+      /* blockcopy_j   = */ blockcopy_j,
+      /* blockcopy_k   = */ blockcopy_k,
+      /* subtype       = */ 0  
+    );
+  }
+
+
+  // build an assist structure for Gauss Seidel Red Black that would facilitate unrolling and SIMDization (two planes of kStride doubles: one 0/1 mask per color)...
+  level->RedBlack_base = NULL; level->RedBlack_FP = NULL; // both stay NULL when this rank owns no boxes
+  if(level->num_my_boxes){
+    int i,j;
+    int kStride = level->my_boxes[0].kStride;
+    int jStride = level->my_boxes[0].jStride;
+    level->RedBlack_base = (double*)malloc(2*kStride*sizeof(double)+256); // used for free()
+    if(level->RedBlack_base==NULL){fprintf(stderr,"malloc failed - create_level/level->RedBlack_base\n");exit(0);} // the buffer is written immediately below
+    level->RedBlack_FP   = level->RedBlack_base; // aligned version
+    // align first *non-ghost* zone element to a 64-Byte boundary...
+    while( (uint64_t)(level->RedBlack_FP + level->box_ghosts*(1+level->box_jStride)) & 0x3f ){level->RedBlack_FP++;}
+    // initialize RedBlack array (plane 0 is 1.0 where i+j is even, plane 1 is its complement)...
+    for(j=0-level->box_ghosts;j<level->box_dim+level->box_ghosts;j++){
+    for(i=0-level->box_ghosts;i<level->box_dim+level->box_ghosts;i++){
+      int ij = (i+level->box_ghosts) + (j+level->box_ghosts)*jStride;
+      if((i^j^1)&0x1){
+        level->RedBlack_FP[ij        ]=1.0;
+        level->RedBlack_FP[ij+kStride]=0.0;
+      }else{
+        level->RedBlack_FP[ij        ]=0.0;
+        level->RedBlack_FP[ij+kStride]=1.0;
+      }
+      // Never update ghost zones (disabled)
+      //if( (i<0) || (i>=level->box_dim) || (j<0) || (j>=level->box_dim) ){
+      //  level->RedBlack_FP[ij        ]=0.0;
+      //  level->RedBlack_FP[ij+kStride]=0.0;
+      //}
+    }}
+  }
+
+
+  int shape;
+  // create mini program for each stencil shape to perform a ghost zone exchange...
+  for(shape=0;shape<STENCIL_MAX_SHAPES;shape++)build_exchange_ghosts(    level,shape);
+  // create mini program for each stencil shape to perform a boundary condition...
+  for(shape=0;shape<STENCIL_MAX_SHAPES;shape++)build_boundary_conditions(level,shape);
+
+
+  // use the caller's communicator for this level's collectives (the MPI_Comm_dup variant below is commented out)
+  #ifdef USE_MPI
+  level->MPI_COMM_ALLREDUCE = comm;
+  /*
+  if(my_rank==0){fprintf(stdout,"  Duplicating MPI_COMM_WORLD... ");fflush(stdout);}
+  double time_start = MPI_Wtime();
+  MPI_Comm_dup(comm,&level->MPI_COMM_ALLREDUCE);
+  double time_end = MPI_Wtime();
+  double time_in_comm_dup = 0;
+  double time_in_comm_dup_send = time_end-time_start;
+  MPI_Allreduce(&time_in_comm_dup_send,&time_in_comm_dup,1,MPI_DOUBLE,MPI_MAX,level->MPI_COMM_ALLREDUCE);
+  if(my_rank==0){fprintf(stdout,"done (%0.6f seconds)\n",time_in_comm_dup);fflush(stdout);}
+  */
+  #endif
+
+  // report on potential load imbalance (max boxes owned by any rank vs. the ideal average)
+  int BoxesPerProcess = level->num_my_boxes;
+  #ifdef USE_MPI
+  int BoxesPerProcessSend = level->num_my_boxes;
+  MPI_Allreduce(&BoxesPerProcessSend,&BoxesPerProcess,1,MPI_INT,MPI_MAX,level->MPI_COMM_ALLREDUCE);
+  #endif
+  if(my_rank==0){fprintf(stdout,"  Calculating boxes per process... target=%0.3f, max=%d\n",(double)TotalBoxes/(double)num_ranks,BoxesPerProcess);}
+}
+
+
+
+//---------------------------------------------------------------------------------------------------------------------------------------------------
+// Zero every timer and solver-event counter stored in this level.
+// Useful if one wishes to separate setup (build) timing from solve timing.
+void reset_level_timers(level_type *level){
+  // solver event counters...
+  level->vcycles_from_this_level = 0;
+  level->CAKrylov_formations_of_G = 0;
+  level->Krylov_iterations = 0;
+  // cycle counters: overall, operators, boundary conditions, then the pack/local/unpack/recv/send/wait phases of each exchange...
+  level->timers.Total = 0;
+  level->timers.collectives = 0;
+  level->timers.smooth = 0;
+  level->timers.apply_op = 0;
+  level->timers.residual = 0;
+  level->timers.blas1 = 0;
+  level->timers.blas3 = 0;
+  level->timers.boundary_conditions = 0;
+  level->timers.ghostZone_total = 0;
+  level->timers.ghostZone_pack = 0;
+  level->timers.ghostZone_local = 0;
+  level->timers.ghostZone_unpack = 0;
+  level->timers.ghostZone_recv = 0;
+  level->timers.ghostZone_send = 0;
+  level->timers.ghostZone_wait = 0;
+  level->timers.restriction_total = 0;
+  level->timers.restriction_pack = 0;
+  level->timers.restriction_local = 0;
+  level->timers.restriction_unpack = 0;
+  level->timers.restriction_recv = 0;
+  level->timers.restriction_send = 0;
+  level->timers.restriction_wait = 0;
+  level->timers.interpolation_total = 0;
+  level->timers.interpolation_pack = 0;
+  level->timers.interpolation_local = 0;
+  level->timers.interpolation_unpack = 0;
+  level->timers.interpolation_recv = 0;
+  level->timers.interpolation_send = 0;
+  level->timers.interpolation_wait = 0;
+}
+
+//---------------------------------------------------------------------------------------------------------------------------------------------------
+// free the memory allocated by this level: per-box vector tables, rank/box/block lists, FP vector data, and the boundary condition / ghost zone exchange mini programs
+// n.b. in some cases a malloc was used as the basis for an array of pointers.  As such free(x[0])
+void destroy_level(level_type *level){
+  int i,j;
+  if(level->my_rank==0){fprintf(stdout,"attempting to free the %5d^3 level... ",level->dim.i);fflush(stdout);}
+
+  // box ... (each box's array of per-vector pointers; must happen before my_boxes itself is freed below)
+  for(i=0;i<level->num_my_boxes;i++)if(level->my_boxes[i].vectors)free(level->my_boxes[i].vectors);
+
+  // misc ...
+  if(level->rank_of_box )free(level->rank_of_box);
+  if(level->my_boxes    )free(level->my_boxes);
+  if(level->my_blocks   )free(level->my_blocks);
+  if(level->RedBlack_base)free(level->RedBlack_base);
+
+  // FP vector data... (vectors_base plus the pointer table under VECTOR_MALLOC_BULK, otherwise each vectors[i] plus the pointer table)
+  #ifdef VECTOR_MALLOC_BULK
+  if(level->vectors_base)free(level->vectors_base);
+  if(level->vectors     )free(level->vectors);
+  #else
+  for(i=0;i<level->numVectors;i++)if(level->vectors[i])free(level->vectors[i]); // NOTE(review): level->vectors is dereferenced here before the NULL check on the next line
+  if(level->vectors     )free(level->vectors);
+  #endif
+
+  // boundary condition mini program... (one block list per stencil shape)
+  for(i=0;i<STENCIL_MAX_SHAPES;i++){
+    if(level->boundary_condition.blocks[i])free(level->boundary_condition.blocks[i]);
+  }
+
+  // ghost zone exchange mini programs... (one communicator per stencil shape)
+  for(i=0;i<STENCIL_MAX_SHAPES;i++){
+    if(level->exchange_ghosts[i].num_recvs>0){
+    for(j=0;j<level->exchange_ghosts[i].num_recvs;j++)if(level->exchange_ghosts[i].recv_buffers[j])free(level->exchange_ghosts[i].recv_buffers[j]);
+    if(level->exchange_ghosts[i].recv_buffers)free(level->exchange_ghosts[i].recv_buffers);
+    if(level->exchange_ghosts[i].recv_ranks  )free(level->exchange_ghosts[i].recv_ranks  );
+    if(level->exchange_ghosts[i].recv_sizes  )free(level->exchange_ghosts[i].recv_sizes  );
+    }
+    if(level->exchange_ghosts[i].num_sends>0){
+    for(j=0;j<level->exchange_ghosts[i].num_sends;j++)if(level->exchange_ghosts[i].send_buffers[j])free(level->exchange_ghosts[i].send_buffers[j]);
+    if(level->exchange_ghosts[i].send_buffers)free(level->exchange_ghosts[i].send_buffers);
+    if(level->exchange_ghosts[i].send_ranks  )free(level->exchange_ghosts[i].send_ranks  );
+    if(level->exchange_ghosts[i].send_sizes  )free(level->exchange_ghosts[i].send_sizes  );
+    }
+    if(level->exchange_ghosts[i].blocks[0]   )free(level->exchange_ghosts[i].blocks[0]   );
+    if(level->exchange_ghosts[i].blocks[1]   )free(level->exchange_ghosts[i].blocks[1]   );
+    if(level->exchange_ghosts[i].blocks[2]   )free(level->exchange_ghosts[i].blocks[2]   );
+    #ifdef USE_MPI
+    if(level->exchange_ghosts[i].requests    )free(level->exchange_ghosts[i].requests    );
+    if(level->exchange_ghosts[i].status      )free(level->exchange_ghosts[i].status      );
+    #endif
+  }
+
+  /*
+  // MPI subcommunicator
+  #ifdef USE_MPI
+  #ifdef USE_SUBCOMM
+  MPI_Comm_free(&level->MPI_COMM_ALLREDUCE);
+  #endif
+  #endif
+  */
+
+  if(level->my_rank==0){fprintf(stdout,"done\n");} // n.b. freed pointers are not reset to NULL, and level->restriction[], level->interpolation and level->fluxes are not released by this routine
+}
diff --git a/Util/hpgmg/finite-volume/source/level.h b/Util/hpgmg/finite-volume/source/level.h
new file mode 100644
index 00000000..6a632c3c
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/level.h
@@ -0,0 +1,199 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#ifndef LEVEL_H
+#define LEVEL_H
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+// supported boundary conditions
+#define BC_PERIODIC  0
+#define BC_DIRICHLET 1
+//------------------------------------------------------------------------------------------------------------------------------
+// regiment communication by defining a series of stencil shapes...
+#define STENCIL_SHAPE_BOX         0	// faces, edges, and corners
+#define STENCIL_SHAPE_STAR        1	// just faces
+#define STENCIL_SHAPE_NO_CORNERS  2	// faces and edges, but no corners
+#define STENCIL_MAX_SHAPES        3
+//------------------------------------------------------------------------------------------------------------------------------
+// regiment threading around the 'block' or 'tile' concepts.  Define default tilings...
+#ifndef BLOCKCOPY_TILE_I
+#define BLOCKCOPY_TILE_I 10000
+#else
+#warning By overriding BLOCKCOPY_TILE_I, you are tiling in the unit stride.  I hope you know what you are doing.
+#endif
+#ifndef BLOCKCOPY_TILE_J
+#define BLOCKCOPY_TILE_J 8
+#endif
+#ifndef BLOCKCOPY_TILE_K
+#define BLOCKCOPY_TILE_K 8
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+// FP data for a vector within a box is padded to ensure alignment
+#ifndef BOX_ALIGN_JSTRIDE
+#define BOX_ALIGN_JSTRIDE   4  // j-stride(unit stride dimension including ghosts and padding) is a multiple of BOX_ALIGN_JSTRIDE... useful for SIMD in j+/-1
+#endif
+#ifndef BOX_ALIGN_KSTRIDE
+#define BOX_ALIGN_KSTRIDE   8  // k-stride is a multiple of BOX_ALIGN_KSTRIDE ... useful for SIMD in k+/-1
+#endif
+#ifndef BOX_ALIGN_VOLUME
+#define BOX_ALIGN_VOLUME    8  // box volumes are a multiple of BOX_ALIGN_VOLUME ... useful for SIMD on different vectors
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+typedef struct { // describes one block copy: a dim.i x dim.j x dim.k region moved from 'read' to 'write'
+  int subtype;			// e.g. used to calculate normal to domain for BC's
+  struct {int i, j, k;}dim;	// dimensions of the block to copy
+  struct {int box, i, j, k, jStride, kStride;double * __restrict__ ptr;}read,write;
+  // read  = coordinates in the read grid from which data is extracted
+  // write = coordinates in the write grid into which data is inserted
+  // if read/write.box<0, then use write/read.ptr, otherwise use boxes[box].vectors[id]
+  // Thus, you can do grid->grid, grid->buf, buf->grid, or buf->buf
+} __attribute__((aligned(64))) blockCopy_type;
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+typedef struct {
+    int                           num_recvs;	//   number of neighbors this rank receives from (entries in recv_ranks/recv_sizes/recv_buffers)
+    int                           num_sends;	//   number of neighbors this rank sends to      (entries in send_ranks/send_sizes/send_buffers)
+    int     * __restrict__       recv_ranks;	//   MPI rank of each neighbor...          recv_ranks[neighbor]
+    int     * __restrict__       send_ranks;	//   MPI rank of each neighbor...          send_ranks[neighbor]
+    int     * __restrict__       recv_sizes;	//   size of each MPI recv buffer...       recv_sizes[neighbor]
+    int     * __restrict__       send_sizes;	//   size of each MPI send buffer...       send_sizes[neighbor]
+    double ** __restrict__     recv_buffers;	//   MPI recv buffer for each neighbor...  recv_buffers[neighbor][ recv_sizes[neighbor] ]
+    double ** __restrict__     send_buffers;	//   MPI send buffer for each neighbor...  send_buffers[neighbor][ send_sizes[neighbor] ]
+    int                 allocated_blocks[3];	//   number of blocks allocated (not necessarily used) each list...
+    int                       num_blocks[3];	//   number of blocks in each list...        num_blocks[pack,local,unpack]
+    blockCopy_type *              blocks[3];	//   list of block copies...                     blocks[pack,local,unpack]
+    #ifdef USE_MPI
+    MPI_Request * __restrict__     requests;	//   array of MPI requests (length not visible in this header)
+    MPI_Status  * __restrict__       status;	//   array of MPI statuses (length not visible in this header)
+    #endif
+} communicator_type;
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+typedef struct {
+  int                         global_box_id;	// used to index into level->rank_of_box
+  struct {int i, j, k;}low;			// global coordinates of the first (non-ghost) element of subdomain
+  int                                   dim;	// dimension of this box's core (owned)
+  int                                ghosts;	// ghost zone depth
+  int                jStride,kStride,volume;	// useful for offsets
+  int                            numVectors;	// presumably the number of entries in vectors[] - TODO confirm
+  double   ** __restrict__          vectors;	// vectors[c] = pointer to 3D array for vector c for one box
+} box_type;
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+typedef struct {
+  double h;					// grid spacing at this level
+  int active;					// I am an active process (I have work to do on this or subsequent levels)
+  int num_ranks;				// total number of MPI ranks
+  int my_rank;					// my MPI rank
+  int box_dim;					// dimension of each cubical box (not counting ghost zones)
+  int box_ghosts;				// ghost zone depth for each box
+  int box_jStride,box_kStride,box_volume;	// useful for offsets
+  int numVectors;				// number of vectors stored in each box
+  int tag;					// tag each level uniquely... FIX... replace with sub communicator
+  struct {int i, j, k;}boxes_in;		// total number of boxes in i,j,k across this level
+  struct {int i, j, k;}dim;			// global dimensions at this level (NOTE: dim.i == boxes_in.i * box_dim)
+
+  int * rank_of_box;				// 3D array containing rank of each box.  i-major ordering
+  int    num_my_boxes;				//           number of boxes owned by this rank
+  box_type * my_boxes;				// pointer to array of boxes owned by this rank
+
+  // create flattened FP data... useful for CUDA/OpenMP4/OpenACC when you want to copy an entire vector to/from an accelerator
+  double   ** __restrict__          vectors;	// vectors[v][box][k][j][i] = pointer to 5D array for vector v encompassing all boxes on this process...
+  double    * __restrict__     vectors_base;    // pointer used for malloc/free.  vectors[v] are shifted from this for alignment
+
+  int       allocated_blocks;			//       number of blocks allocated by this rank (note, this represents a flattening of the box/cell hierarchy to facilitate threading)
+  int          num_my_blocks;			//       number of blocks     owned by this rank (note, this represents a flattening of the box/cell hierarchy to facilitate threading)
+  blockCopy_type * my_blocks;			// pointer to array of blocks owned by this rank (note, this represents a flattening of the box/cell hierarchy to facilitate threading)
+
+  struct {
+    int                type;			// BC_PERIODIC or BC_DIRICHLET
+    int    allocated_blocks[STENCIL_MAX_SHAPES];// number of blocks allocated (not necessarily used) for boundary conditions on this level for [shape]
+    int          num_blocks[STENCIL_MAX_SHAPES];// number of blocks used for boundary conditions on this level for [shape]
+    blockCopy_type * blocks[STENCIL_MAX_SHAPES];// pointer to array of blocks used for boundary conditions on this level for [shape]
+  } boundary_condition;				// boundary conditions on this level
+
+  communicator_type exchange_ghosts[STENCIL_MAX_SHAPES];// mini program that performs a neighbor ghost zone exchange for [shape]
+  communicator_type restriction[4];			// mini program that performs restriction and agglomeration for [0=cell centered, 1=i-face, 2=j-face, 3=k-face]
+  communicator_type interpolation;			// mini program that performs interpolation and dissemination...
+  #ifdef USE_MPI
+  MPI_Comm MPI_COMM_ALLREDUCE;			// MPI sub communicator for just the ranks that have boxes on this level or any subsequent level... 
+  #endif
+  double dominant_eigenvalue_of_DinvA;		// estimate of the dominant eigenvalue of D^{-1}A
+  int must_subtract_mean;			// e.g. Poisson with Periodic BC's
+  double    * __restrict__ RedBlack_base;       // allocated pointer... will be aligned for the first non ghost zone element
+  double    * __restrict__ RedBlack_FP;	        // Red/Black Mask (i.e. 0.0 or 1.0) for even/odd planes (2*kStride).  
+
+  int num_threads;
+  double    * __restrict__ fluxes;		// temporary array used to hold the flux values used by FV operators
+
+  // statistics information... (all of these are zeroed by reset_level_timers)
+  struct {
+    double              smooth;
+    double            apply_op;
+    double            residual;
+    double               blas1;
+    double               blas3;
+    double boundary_conditions;
+    // Distributed Restriction
+    double   restriction_total;
+    double   restriction_pack;
+    double   restriction_local;
+    double   restriction_unpack;
+    double   restriction_recv;
+    double   restriction_send;
+    double   restriction_wait;
+    // Distributed interpolation
+    double interpolation_total;
+    double interpolation_pack;
+    double interpolation_local;
+    double interpolation_unpack;
+    double interpolation_recv;
+    double interpolation_send;
+    double interpolation_wait;
+    // Ghost Zone Exchanges...
+    double     ghostZone_total;
+    double     ghostZone_pack;
+    double     ghostZone_local;
+    double     ghostZone_unpack;
+    double     ghostZone_recv;
+    double     ghostZone_send;
+    double     ghostZone_wait;
+    // Collectives...
+    double   collectives;
+    double         Total;
+  }timers;
+  int Krylov_iterations;        // total number of bottom solver iterations
+  int CAKrylov_formations_of_G; // i.e. [G,g] = [P,R]^T[P,R,rt]
+  int vcycles_from_this_level;  // number of vcycles performed that were initiated from this level
+} level_type;
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void create_level(level_type *level, int boxes_in_i, int box_dim, int box_ghosts, int numVectors, int domain_boundary_condition, int my_rank, int num_ranks, const MPI_Comm comm); // NOTE(review): MPI_Comm is used unconditionally, but this header only includes <mpi.h> under USE_MPI - confirm a fallback definition exists for non-MPI builds
+void destroy_level(level_type *level); // frees the memory owned by the level
+void create_vectors(level_type *level, int numVectors);
+void reset_level_timers(level_type *level); // zeros level->timers and the level's solver event counters
+int qsortInt(const void *a, const void *b);
+void append_block_to_list(blockCopy_type ** blocks, int *allocated_blocks, int *num_blocks,
+                          int dim_i, int dim_j, int dim_k,
+                          int  read_box, double*  read_ptr, int  read_i, int  read_j, int  read_k, int  read_jStride, int  read_kStride, int  read_scale,
+                          int write_box, double* write_ptr, int write_i, int write_j, int write_k, int write_jStride, int write_kStride, int write_scale,
+                          int my_blockcopy_tile_i, int my_blockcopy_tile_j, int my_blockcopy_tile_k,
+                          int subtype
+                         );
+//------------------------------------------------------------------------------------------------------------------------------
+#endif
diff --git a/Util/hpgmg/finite-volume/source/local.mk b/Util/hpgmg/finite-volume/source/local.mk
new file mode 100644
index 00000000..7fef990b
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/local.mk
@@ -0,0 +1,8 @@
+hpgmg-fv-y.c += $(call thisdir, \
+	timers.c \
+	level.c \
+	operators.fv4.c \
+	mg.c \
+	solvers.c \
+	hpgmg-fv.c \
+	)
diff --git a/Util/hpgmg/finite-volume/source/mg.h b/Util/hpgmg/finite-volume/source/mg.h
new file mode 100644
index 00000000..74c2eec4
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/mg.h
@@ -0,0 +1,46 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#ifndef MG_H
+#define MG_H
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#ifndef MG_AGGLOMERATION_START
+#define MG_AGGLOMERATION_START  8 // i.e. start the distributed v-cycle when boxes are smaller than 8^3
+#endif
+#ifndef MG_DEFAULT_BOTTOM_NORM
+#define MG_DEFAULT_BOTTOM_NORM  1e-3
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+typedef struct {
+  int num_ranks;	// total number of MPI ranks for MPI_COMM_WORLD
+  int my_rank;		// my MPI rank for MPI_COMM_WORLD
+  int       num_levels;	// depth of the v-cycle
+  level_type ** levels;	// array of pointers to levels (indexed 0..num_levels-1)
+
+  struct {
+    double MGBuild; // total time spent building the coefficients...
+    double MGSolve; // total time spent in MGSolve
+  }timers;
+  int MGSolves_performed; // zeroed by MGResetTimers(); MGPrintTiming() divides by it to report per-solve averages (presumably incremented once per solve - not visible here)
+} mg_type;
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void          MGBuild(mg_type *all_grids, level_type *fine_grid, double a, double b, int minCoarseGridDim, const MPI_Comm comm); // NOTE(review): relies on the includer to have declared MPI_Comm and level_type (this header includes neither <mpi.h> nor level.h) - confirm include order
+void          MGSolve(mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double dtol, double rtol);
+void         FMGSolve(mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double dtol, double rtol);
+void            MGPCG(mg_type *all_grids, int onLevel, int x_id, int F_id, double a, double b, double dtol, double rtol);
+void        MGDestroy(mg_type *all_grids);
+void    MGPrintTiming(mg_type *all_grids, int fromLevel); // rank 0 prints timings for levels fromLevel..num_levels-1, averaged over MGSolves_performed
+void    MGResetTimers(mg_type *all_grids);                // zeros every level's timers plus timers.MGSolve and MGSolves_performed (timers.MGBuild is left as-is)
+void richardson_error(mg_type *all_grids, int levelh, int u_id);
+//------------------------------------------------------------------------------------------------------------------------------
+#endif
diff --git a/Util/hpgmg/finite-volume/source/mg_hpgmg.c b/Util/hpgmg/finite-volume/source/mg_hpgmg.c
new file mode 100644
index 00000000..1a95ec93
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/mg_hpgmg.c
@@ -0,0 +1,1498 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <unistd.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+#include "timers.h"
+#include "defines.h"
+#include "level.h"
+#include "operators.h"
+#include "solvers.h"
+#include "mg.h"
+//------------------------------------------------------------------------------------------------------------------------------
+// structs/routines used to construct the restriction and prolongation lists and ensure a convention on how data is ordered within an MPI buffer
+typedef struct {
+  int sendRank;  // primary sort key used by qsortRP()
+  int sendBoxID; // secondary sort key used by qsortRP()
+  int sendBox;
+  int recvRank;
+  int recvBoxID;
+  int recvBox;
+  int i,j,k; // offsets used to index into the coarse box
+} RP_type;
+
+
+int qsortRP(const void *a, const void*b){
+  // qsort()-style comparator: orders RP_type records by ascending sendRank, breaking ties by ascending sendBoxID
+  const RP_type *lhs = (const RP_type*)a;
+  const RP_type *rhs = (const RP_type*)b;
+  // primary key...
+  if(lhs->sendRank  != rhs->sendRank )return( (lhs->sendRank  < rhs->sendRank ) ? -1 : 1 );
+  // secondary key...
+  if(lhs->sendBoxID != rhs->sendBoxID)return( (lhs->sendBoxID < rhs->sendBoxID) ? -1 : 1 );
+  // identical (sendRank,sendBoxID) pairs compare equal
+  return(0);
+}
+
+
+//----------------------------------------------------------------------------------------------------------------------------------------------------
+// print out average time per solve and then decompose by function and level (only rank 0 prints; levels fromLevel..num_levels-1 are reported)
+// note, in FMG, some levels are accessed more frequently.  This routine only prints time per solve in that level
+void MGPrintTiming(mg_type *all_grids, int fromLevel){
+  if(all_grids->my_rank!=0)return;
+  int level,num_levels = all_grids->num_levels;
+  #ifdef CALIBRATE_TIMER
+  double _timeStart=getTime();sleep(1);double _timeEnd=getTime();
+  double SecondsPerCycle = (double)1.0/(double)(_timeEnd-_timeStart);
+  #else
+  double SecondsPerCycle = 1.0;
+  #endif
+  int solves = (all_grids->MGSolves_performed>0) ? all_grids->MGSolves_performed : 1; // guard: avoids dividing by zero when no solve has been recorded (e.g. right after MGResetTimers)
+  double scale = SecondsPerCycle/(double)solves; // prints average performance per MGSolve
+  double time,total;
+          printf("\n\n");
+          printf("level                     ");for(level=fromLevel;level<(num_levels  );level++){printf("%12d ",level-fromLevel);}printf("\n");
+          printf("level dimension           ");for(level=fromLevel;level<(num_levels  );level++){printf("%10d^3 ",all_grids->levels[level]->dim.i  );}printf("\n");
+          printf("box dimension             ");for(level=fromLevel;level<(num_levels  );level++){printf("%10d^3 ",all_grids->levels[level]->box_dim);}printf("       total\n");
+  total=0;printf("------------------        ");for(level=fromLevel;level<(num_levels+1);level++){printf("------------ ");}printf("\n");
+  total=0;printf("smooth                    ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.smooth;               total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("residual                  ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.residual;             total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("applyOp                   ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.apply_op;             total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("BLAS1                     ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.blas1;                total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("BLAS3                     ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.blas3;                total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("Boundary Conditions       ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.boundary_conditions;  total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("Restriction               ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_total;    total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  local restriction       ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_local;    total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  #ifdef USE_MPI
+  total=0;printf("  pack MPI buffers        ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_pack;     total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  unpack MPI buffers      ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_unpack;   total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  MPI_Isend               ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_send;     total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  MPI_Irecv               ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_recv;     total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  MPI_Waitall             ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.restriction_wait;     total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  #endif
+  total=0;printf("Interpolation             ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_total;  total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  local interpolation     ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_local;  total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  #ifdef USE_MPI
+  total=0;printf("  pack MPI buffers        ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_pack;   total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  unpack MPI buffers      ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_unpack; total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  MPI_Isend               ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_send;   total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  MPI_Irecv               ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_recv;   total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  MPI_Waitall             ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.interpolation_wait;   total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  #endif
+  total=0;printf("Ghost Zone Exchange       ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_total;      total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  local exchange          ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_local;      total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  #ifdef USE_MPI
+  total=0;printf("  pack MPI buffers        ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_pack;       total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  unpack MPI buffers      ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_unpack;     total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  MPI_Isend               ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_send;       total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  MPI_Irecv               ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_recv;       total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  total=0;printf("  MPI_Waitall             ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.ghostZone_wait;       total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  #endif
+  #ifdef USE_MPI
+  total=0;printf("MPI_collectives           ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.collectives;          total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+  #endif
+  total=0;printf("------------------        ");for(level=fromLevel;level<(num_levels+1);level++){printf("------------ ");}printf("\n");
+  total=0;printf("Total by level            ");for(level=fromLevel;level<(num_levels  );level++){time=scale*(double)all_grids->levels[level]->timers.Total;                total+=time;printf("%12.6f ",time);}printf("%12.6f\n",total);
+
+  printf("\n");
+  printf( "   Total time in MGBuild  %12.6f seconds\n",SecondsPerCycle*(double)all_grids->timers.MGBuild);
+  printf( "   Total time in MGSolve  %12.6f seconds\n",scale*(double)all_grids->timers.MGSolve);
+  printf( "      number of v-cycles  %12d\n"  ,all_grids->levels[fromLevel]->vcycles_from_this_level/solves);
+  printf( "Bottom solver iterations  %12d\n"  ,all_grids->levels[num_levels-1]->Krylov_iterations/solves);
+  #if defined(USE_CABICGSTAB) || defined(USE_CACG)
+  printf( "     formations of G[][]  %12d\n"  ,all_grids->levels[num_levels-1]->CAKrylov_formations_of_G/solves);
+  #endif
+  printf("\n\n");fflush(stdout);
+}
+
+
+//----------------------------------------------------------------------------------------------------------------------------------------------------
+// zeros the solve timer and solve counter of this MG hierarchy, and the timers of every level in it
+void MGResetTimers(mg_type *all_grids){
+  // n.b. timers.MGBuild is not zeroed by this routine
+  all_grids->MGSolves_performed = 0;
+  all_grids->timers.MGSolve     = 0;
+  int lev = all_grids->num_levels;
+  while(lev-- > 0)reset_level_timers(all_grids->levels[lev]); // each level is reset independently of the others
+}
+
+
+//----------------------------------------------------------------------------------------------------------------------------------------------------
+// build a list of operations and MPI buffers to effect distributed interpolation
+// the three lists constitute
+//   - buffer packing (i.e. interpolate a local box (or region of a box) and place the result in an MPI buffer)
+//   - local operations (i.e. interpolate a local box (or region of a box) and place the result in another local box)
+//   - buffer unpacking (i.e. take interpolated data received from another process and use it to increment a local box)
+void build_interpolation(mg_type *all_grids){
+  int level;
+  for(level=0;level<all_grids->num_levels;level++){
+
+  // initialize to defaults...
+  all_grids->levels[level]->interpolation.num_recvs           = 0;
+  all_grids->levels[level]->interpolation.num_sends           = 0;
+  all_grids->levels[level]->interpolation.recv_ranks          = NULL;
+  all_grids->levels[level]->interpolation.send_ranks          = NULL;
+  all_grids->levels[level]->interpolation.recv_sizes          = NULL;
+  all_grids->levels[level]->interpolation.send_sizes          = NULL;
+  all_grids->levels[level]->interpolation.recv_buffers        = NULL;
+  all_grids->levels[level]->interpolation.send_buffers        = NULL;
+  all_grids->levels[level]->interpolation.blocks[0]           = NULL;
+  all_grids->levels[level]->interpolation.blocks[1]           = NULL;
+  all_grids->levels[level]->interpolation.blocks[2]           = NULL;
+  all_grids->levels[level]->interpolation.num_blocks[0]       = 0;
+  all_grids->levels[level]->interpolation.num_blocks[1]       = 0;
+  all_grids->levels[level]->interpolation.num_blocks[2]       = 0;
+  all_grids->levels[level]->interpolation.allocated_blocks[0] = 0;
+  all_grids->levels[level]->interpolation.allocated_blocks[1] = 0;
+  all_grids->levels[level]->interpolation.allocated_blocks[2] = 0;
+  #ifdef USE_MPI
+  all_grids->levels[level]->interpolation.requests            = NULL;
+  all_grids->levels[level]->interpolation.status              = NULL;
+  #endif
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // construct pack, send(to level-1), and local...
+  if( (level>0) && (all_grids->levels[level]->num_my_boxes>0) ){ // not top  *and*  I have boxes to send
+    // construct a list of fine boxes to be coarsened and sent to me...
+    int numFineBoxes = (all_grids->levels[level-1]->boxes_in.i/all_grids->levels[level]->boxes_in.i)*
+                       (all_grids->levels[level-1]->boxes_in.j/all_grids->levels[level]->boxes_in.j)*
+                       (all_grids->levels[level-1]->boxes_in.k/all_grids->levels[level]->boxes_in.k)*
+                                                               all_grids->levels[level]->num_my_boxes;
+        int *fineRanks = (    int*)malloc(numFineBoxes*sizeof(    int)); // high water mark (assumes every neighboring box is a different process)
+    RP_type *fineBoxes = (RP_type*)malloc(numFineBoxes*sizeof(RP_type)); 
+        numFineBoxes       = 0;
+    int numFineBoxesLocal  = 0;
+    int numFineBoxesRemote = 0;
+    int coarseBox;
+    for(coarseBox=0;coarseBox<all_grids->levels[level]->num_my_boxes;coarseBox++){
+      int bi,bj,bk;
+      int   coarseBoxID = all_grids->levels[level]->my_boxes[coarseBox].global_box_id;
+      int   coarseBox_i = all_grids->levels[level]->my_boxes[coarseBox].low.i / all_grids->levels[level]->box_dim;
+      int   coarseBox_j = all_grids->levels[level]->my_boxes[coarseBox].low.j / all_grids->levels[level]->box_dim;
+      int   coarseBox_k = all_grids->levels[level]->my_boxes[coarseBox].low.k / all_grids->levels[level]->box_dim;
+      for(bk=0;bk<all_grids->levels[level-1]->boxes_in.k/all_grids->levels[level]->boxes_in.k;bk++){
+      for(bj=0;bj<all_grids->levels[level-1]->boxes_in.j/all_grids->levels[level]->boxes_in.j;bj++){
+      for(bi=0;bi<all_grids->levels[level-1]->boxes_in.i/all_grids->levels[level]->boxes_in.i;bi++){
+        int fineBox_i = (all_grids->levels[level-1]->boxes_in.i/all_grids->levels[level]->boxes_in.i)*coarseBox_i + bi;
+        int fineBox_j = (all_grids->levels[level-1]->boxes_in.j/all_grids->levels[level]->boxes_in.j)*coarseBox_j + bj;
+        int fineBox_k = (all_grids->levels[level-1]->boxes_in.k/all_grids->levels[level]->boxes_in.k)*coarseBox_k + bk;
+        int fineBoxID =  fineBox_i + fineBox_j*all_grids->levels[level-1]->boxes_in.i + fineBox_k*all_grids->levels[level-1]->boxes_in.i*all_grids->levels[level-1]->boxes_in.j;
+        int fineBox   = -1;int f;for(f=0;f<all_grids->levels[level-1]->num_my_boxes;f++)if( all_grids->levels[level-1]->my_boxes[f].global_box_id == fineBoxID )fineBox=f; // try and find the index of a fineBox global_box_id == fineBoxID
+        fineBoxes[numFineBoxes].sendRank  = all_grids->levels[level  ]->rank_of_box[coarseBoxID];
+        fineBoxes[numFineBoxes].sendBoxID = coarseBoxID;
+        fineBoxes[numFineBoxes].sendBox   = coarseBox;
+        fineBoxes[numFineBoxes].recvRank  = all_grids->levels[level-1]->rank_of_box[  fineBoxID];
+        fineBoxes[numFineBoxes].recvBoxID = fineBoxID;
+        fineBoxes[numFineBoxes].recvBox   = fineBox;
+        fineBoxes[numFineBoxes].i         = bi*all_grids->levels[level-1]->box_dim/2;
+        fineBoxes[numFineBoxes].j         = bj*all_grids->levels[level-1]->box_dim/2;
+        fineBoxes[numFineBoxes].k         = bk*all_grids->levels[level-1]->box_dim/2;
+                  numFineBoxes++;
+        if(all_grids->levels[level-1]->rank_of_box[fineBoxID] != all_grids->levels[level]->my_rank){
+          fineRanks[numFineBoxesRemote++] = all_grids->levels[level-1]->rank_of_box[fineBoxID];
+        }else{numFineBoxesLocal++;}
+      }}}
+    } // my (coarse) boxes
+    // sort boxes by sendRank(==my rank) then by sendBoxID... ensures the sends and receive buffers are always sorted by sendBoxID...
+    qsort(fineBoxes,numFineBoxes      ,sizeof(RP_type),qsortRP );
+    // sort the lists of neighboring ranks and remove duplicates...
+    qsort(fineRanks,numFineBoxesRemote,sizeof(    int),qsortInt);
+    int numFineRanks=0;
+    int _rank=-1;int neighbor=0;
+    for(neighbor=0;neighbor<numFineBoxesRemote;neighbor++)if(fineRanks[neighbor] != _rank){_rank=fineRanks[neighbor];fineRanks[numFineRanks++]=fineRanks[neighbor];}
+
+    // allocate structures...
+    all_grids->levels[level]->interpolation.num_sends     =                         numFineRanks;
+    all_grids->levels[level]->interpolation.send_ranks    =            (int*)malloc(numFineRanks*sizeof(int));
+    all_grids->levels[level]->interpolation.send_sizes    =            (int*)malloc(numFineRanks*sizeof(int));
+    all_grids->levels[level]->interpolation.send_buffers  =        (double**)malloc(numFineRanks*sizeof(double*));
+    if(numFineRanks>0){
+    if(all_grids->levels[level]->interpolation.send_ranks  ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->interpolation.send_ranks\n",level);exit(0);}
+    if(all_grids->levels[level]->interpolation.send_sizes  ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->interpolation.send_sizes\n",level);exit(0);}
+    if(all_grids->levels[level]->interpolation.send_buffers==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->interpolation.send_buffers\n",level);exit(0);}
+    }
+
+    int elementSize = all_grids->levels[level-1]->box_dim*all_grids->levels[level-1]->box_dim*all_grids->levels[level-1]->box_dim;
+    double * all_send_buffers = (double*)malloc(numFineBoxesRemote*elementSize*sizeof(double));
+          if(numFineBoxesRemote*elementSize>0)
+          if(all_send_buffers==NULL){fprintf(stderr,"malloc failed - interpolation/all_send_buffers\n");exit(0);}
+                      memset(all_send_buffers,0,numFineBoxesRemote*elementSize*sizeof(double)); // DO NOT DELETE... you must initialize to 0 to avoid getting something like 0.0*NaN and corrupting the solve
+    //printf("level=%d, rank=%2d, send_buffers=%6d\n",level,all_grids->my_rank,numFineBoxesRemote*elementSize*sizeof(double));
+
+    // for each neighbor, construct the pack list and allocate the MPI send buffer... 
+    for(neighbor=0;neighbor<numFineRanks;neighbor++){
+      int fineBox;
+      int offset = 0;
+      all_grids->levels[level]->interpolation.send_buffers[neighbor] = all_send_buffers;
+      for(fineBox=0;fineBox<numFineBoxes;fineBox++)if(fineBoxes[fineBox].recvRank==fineRanks[neighbor]){
+        // pack the MPI send buffer...
+        append_block_to_list(&(all_grids->levels[level]->interpolation.blocks[0]),&(all_grids->levels[level]->interpolation.allocated_blocks[0]),&(all_grids->levels[level]->interpolation.num_blocks[0]),
+          /* dim.i         = */ all_grids->levels[level-1]->box_dim/2,
+          /* dim.j         = */ all_grids->levels[level-1]->box_dim/2,
+          /* dim.k         = */ all_grids->levels[level-1]->box_dim/2,
+          /* read.box      = */ fineBoxes[fineBox].sendBox,
+          /* read.ptr      = */ NULL,
+          /* read.i        = */ fineBoxes[fineBox].i,
+          /* read.j        = */ fineBoxes[fineBox].j,
+          /* read.k        = */ fineBoxes[fineBox].k,
+          /* read.jStride  = */ all_grids->levels[level]->my_boxes[fineBoxes[fineBox].sendBox].jStride,
+          /* read.kStride  = */ all_grids->levels[level]->my_boxes[fineBoxes[fineBox].sendBox].kStride,
+          /* read.scale    = */ 1,
+          /* write.box     = */ -1,
+          /* write.ptr     = */ all_grids->levels[level]->interpolation.send_buffers[neighbor],
+          /* write.i       = */ offset,
+          /* write.j       = */ 0,
+          /* write.k       = */ 0,
+          /* write.jStride = */ all_grids->levels[level-1]->box_dim,
+          /* write.kStride = */ all_grids->levels[level-1]->box_dim*all_grids->levels[level-1]->box_dim,
+          /* write.scale   = */ 2,
+          /* blockcopy_i   = */ BLOCKCOPY_TILE_I, // default
+          /* blockcopy_j   = */ BLOCKCOPY_TILE_J, // default
+          /* blockcopy_k   = */ BLOCKCOPY_TILE_K, // default
+          /* subtype       = */ 0
+        );
+        offset+=elementSize;
+      }
+      all_grids->levels[level]->interpolation.send_ranks[neighbor] = fineRanks[neighbor];
+      all_grids->levels[level]->interpolation.send_sizes[neighbor] = offset;
+      all_send_buffers+=offset;
+    } // neighbor
+    {
+      int fineBox;
+      for(fineBox=0;fineBox<numFineBoxes;fineBox++)if(fineBoxes[fineBox].recvRank==all_grids->my_rank){
+        // local interpolations...
+        append_block_to_list(&(all_grids->levels[level]->interpolation.blocks[1]),&(all_grids->levels[level]->interpolation.allocated_blocks[1]),&(all_grids->levels[level]->interpolation.num_blocks[1]),
+          /* dim.i         = */ all_grids->levels[level-1]->box_dim/2,
+          /* dim.j         = */ all_grids->levels[level-1]->box_dim/2,
+          /* dim.k         = */ all_grids->levels[level-1]->box_dim/2,
+          /* read.box      = */ fineBoxes[fineBox].sendBox,
+          /* read.ptr      = */ NULL,
+          /* read.i        = */ fineBoxes[fineBox].i,
+          /* read.j        = */ fineBoxes[fineBox].j,
+          /* read.k        = */ fineBoxes[fineBox].k,
+          /* read.jStride  = */ all_grids->levels[level]->my_boxes[fineBoxes[fineBox].sendBox].jStride,
+          /* read.kStride  = */ all_grids->levels[level]->my_boxes[fineBoxes[fineBox].sendBox].kStride,
+          /* read.scale    = */ 1,
+          /* write.box     = */ fineBoxes[fineBox].recvBox,
+          /* write.ptr     = */ NULL,
+          /* write.i       = */ 0,
+          /* write.j       = */ 0,
+          /* write.k       = */ 0,
+          /* write.jStride = */ all_grids->levels[level-1]->my_boxes[fineBoxes[fineBox].recvBox].jStride,
+          /* write.kStride = */ all_grids->levels[level-1]->my_boxes[fineBoxes[fineBox].recvBox].kStride,
+          /* write.scale   = */ 2,
+          /* blockcopy_i   = */ BLOCKCOPY_TILE_I, // default
+          /* blockcopy_j   = */ BLOCKCOPY_TILE_J, // default
+          /* blockcopy_k   = */ BLOCKCOPY_TILE_K, // default
+          /* subtype       = */ 0
+        );
+      }
+    } // local to local interpolation
+
+    // free temporary storage...
+    free(fineBoxes);
+    free(fineRanks);
+  } // pack/send/local
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // construct recv(from level+1) and unpack...
+  if( (level<all_grids->num_levels-1) && (all_grids->levels[level]->num_my_boxes>0) ){ // not bottom  *and*  I have boxes to receive
+
+    // construct the list of coarsened boxes and neighboring ranks that will be interpolated and sent to me...
+    int numCoarseBoxes = all_grids->levels[level]->num_my_boxes; // I may receive a block for each of my boxes
+        int *coarseRanks = (    int*)malloc(numCoarseBoxes*sizeof(    int)); // high water mark (assumes every neighboring box is a different process)
+    RP_type *coarseBoxes = (RP_type*)malloc(numCoarseBoxes*sizeof(RP_type)); 
+        numCoarseBoxes       = 0;
+    int fineBox;
+    for(fineBox=0;fineBox<all_grids->levels[level]->num_my_boxes;fineBox++){
+      int   fineBoxID = all_grids->levels[level]->my_boxes[fineBox].global_box_id;
+      int   fineBox_i = all_grids->levels[level]->my_boxes[fineBox].low.i / all_grids->levels[level]->box_dim;
+      int   fineBox_j = all_grids->levels[level]->my_boxes[fineBox].low.j / all_grids->levels[level]->box_dim;
+      int   fineBox_k = all_grids->levels[level]->my_boxes[fineBox].low.k / all_grids->levels[level]->box_dim;
+      int coarseBox_i = fineBox_i*all_grids->levels[level+1]->boxes_in.i/all_grids->levels[level]->boxes_in.i;
+      int coarseBox_j = fineBox_j*all_grids->levels[level+1]->boxes_in.j/all_grids->levels[level]->boxes_in.j;
+      int coarseBox_k = fineBox_k*all_grids->levels[level+1]->boxes_in.k/all_grids->levels[level]->boxes_in.k;
+      int coarseBoxID =  coarseBox_i + coarseBox_j*all_grids->levels[level+1]->boxes_in.i + coarseBox_k*all_grids->levels[level+1]->boxes_in.i*all_grids->levels[level+1]->boxes_in.j;
+      if(all_grids->levels[level]->my_rank != all_grids->levels[level+1]->rank_of_box[coarseBoxID]){
+        coarseBoxes[numCoarseBoxes].sendRank  = all_grids->levels[level+1]->rank_of_box[coarseBoxID];
+        coarseBoxes[numCoarseBoxes].sendBoxID = coarseBoxID;
+        coarseBoxes[numCoarseBoxes].sendBox   = -1; 
+        coarseBoxes[numCoarseBoxes].recvRank  = all_grids->levels[level  ]->rank_of_box[  fineBoxID];
+        coarseBoxes[numCoarseBoxes].recvBoxID = fineBoxID;
+        coarseBoxes[numCoarseBoxes].recvBox   = fineBox;
+        coarseRanks[numCoarseBoxes] = all_grids->levels[level+1]->rank_of_box[coarseBoxID];
+                    numCoarseBoxes++;
+      }
+    } // my (fine) boxes
+
+    // sort boxes by sendRank(==my rank) then by sendBoxID... ensures the sends and receive buffers are always sorted by sendBoxID...
+    qsort(coarseBoxes,numCoarseBoxes,sizeof(RP_type),qsortRP );
+    // sort the lists of neighboring ranks and remove duplicates...
+    qsort(coarseRanks,numCoarseBoxes,sizeof(    int),qsortInt);
+    int numCoarseRanks=0;
+    int _rank=-1;int neighbor=0;
+    for(neighbor=0;neighbor<numCoarseBoxes;neighbor++)if(coarseRanks[neighbor] != _rank){_rank=coarseRanks[neighbor];coarseRanks[numCoarseRanks++]=coarseRanks[neighbor];}
+
+    // allocate structures...
+    all_grids->levels[level]->interpolation.num_recvs     =                         numCoarseRanks;
+    all_grids->levels[level]->interpolation.recv_ranks    =            (int*)malloc(numCoarseRanks*sizeof(int));
+    all_grids->levels[level]->interpolation.recv_sizes    =            (int*)malloc(numCoarseRanks*sizeof(int));
+    all_grids->levels[level]->interpolation.recv_buffers  =        (double**)malloc(numCoarseRanks*sizeof(double*));
+    if(numCoarseRanks>0){
+    if(all_grids->levels[level]->interpolation.recv_ranks  ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->interpolation.recv_ranks\n",level);exit(0);}
+    if(all_grids->levels[level]->interpolation.recv_sizes  ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->interpolation.recv_sizes\n",level);exit(0);}
+    if(all_grids->levels[level]->interpolation.recv_buffers==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->interpolation.recv_buffers\n",level);exit(0);}
+    }
+
+    int elementSize = all_grids->levels[level]->box_dim*all_grids->levels[level]->box_dim*all_grids->levels[level]->box_dim;
+    double * all_recv_buffers = (double*)malloc(numCoarseBoxes*elementSize*sizeof(double)); 
+          if(numCoarseBoxes*elementSize>0)
+          if(all_recv_buffers==NULL){fprintf(stderr,"malloc failed - interpolation/all_recv_buffers\n");exit(0);}
+                      memset(all_recv_buffers,0,numCoarseBoxes*elementSize*sizeof(double)); // DO NOT DELETE... you must initialize to 0 to avoid getting something like 0.0*NaN and corrupting the solve
+    //printf("level=%d, rank=%2d, recv_buffers=%6d\n",level,all_grids->my_rank,numCoarseBoxes*elementSize*sizeof(double));
+
+    // for each neighbor, construct the unpack list and allocate the MPI recv buffer... 
+    for(neighbor=0;neighbor<numCoarseRanks;neighbor++){
+      int coarseBox;
+      int offset = 0;
+      all_grids->levels[level]->interpolation.recv_buffers[neighbor] = all_recv_buffers;
+      for(coarseBox=0;coarseBox<numCoarseBoxes;coarseBox++)if(coarseBoxes[coarseBox].sendRank==coarseRanks[neighbor]){
+        // unpack MPI recv buffer...
+        append_block_to_list(&(all_grids->levels[level]->interpolation.blocks[2]),&(all_grids->levels[level]->interpolation.allocated_blocks[2]),&(all_grids->levels[level]->interpolation.num_blocks[2]),
+          /* dim.i         = */ all_grids->levels[level]->box_dim,
+          /* dim.j         = */ all_grids->levels[level]->box_dim,
+          /* dim.k         = */ all_grids->levels[level]->box_dim,
+          /* read.box      = */ -1,
+          /* read.ptr      = */ all_grids->levels[level]->interpolation.recv_buffers[neighbor],
+          /* read.i        = */ offset,
+          /* read.j        = */ 0,
+          /* read.k        = */ 0,
+          /* read.jStride  = */ all_grids->levels[level]->box_dim,
+          /* read.kStride  = */ all_grids->levels[level]->box_dim*all_grids->levels[level]->box_dim,
+          /* read.scale    = */ 1,
+          /* write.box     = */ coarseBoxes[coarseBox].recvBox,
+          /* write.ptr     = */ NULL,
+          /* write.i       = */ 0,
+          /* write.j       = */ 0,
+          /* write.k       = */ 0,
+          /* write.jStride = */ all_grids->levels[level]->my_boxes[coarseBoxes[coarseBox].recvBox].jStride,
+          /* write.kStride = */ all_grids->levels[level]->my_boxes[coarseBoxes[coarseBox].recvBox].kStride,
+          /* write.scale   = */ 1,
+          /* blockcopy_i   = */ BLOCKCOPY_TILE_I, // default
+          /* blockcopy_j   = */ BLOCKCOPY_TILE_J, // default
+          /* blockcopy_k   = */ BLOCKCOPY_TILE_K, // default
+          /* subtype       = */ 0
+        );
+        offset+=elementSize;
+      }
+      all_grids->levels[level]->interpolation.recv_ranks[neighbor] = coarseRanks[neighbor];
+      all_grids->levels[level]->interpolation.recv_sizes[neighbor] = offset;
+      all_recv_buffers+=offset;
+    } // neighbor
+
+    // free temporary storage...
+    free(coarseBoxes);
+    free(coarseRanks);
+  } // recv/unpack
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  } // all levels
+
+
+  #ifdef USE_MPI
+  for(level=0;level<all_grids->num_levels;level++){
+    all_grids->levels[level]->interpolation.requests = NULL;
+    all_grids->levels[level]->interpolation.status   = NULL;
+    if(level<all_grids->num_levels-1){  // i.e. bottom never calls interpolation()
+    // by convention, level_f allocates a combined array of requests for both level_f recvs and level_c sends...
+    int nMessages = all_grids->levels[level+1]->interpolation.num_sends + all_grids->levels[level]->interpolation.num_recvs;
+    all_grids->levels[level]->interpolation.requests = (MPI_Request*)malloc(nMessages*sizeof(MPI_Request));
+    all_grids->levels[level]->interpolation.status   = (MPI_Status *)malloc(nMessages*sizeof(MPI_Status ));
+    }
+  }
+  #endif
+}
+
+
+//----------------------------------------------------------------------------------------------------------------------------------------------------
+// build a list of operations and MPI buffers to effect distributed restriction
+// the three lists are
+//   - buffer packing (i.e. restrict a local box and place the result in an MPI buffer to be sent to a remote coarse grid process)
+//   - local operations (i.e. restrict a local box and place the result in another local box or region of another local box)
+//   - buffer unpacking (i.e. copy restricted data received from another process into a local box or region of a local box)
+void build_restriction(mg_type *all_grids, int restrictionType){
+  int level;
+  for(level=0;level<all_grids->num_levels;level++){
+
+  // initialize to defaults...
+  all_grids->levels[level]->restriction[restrictionType].num_recvs           = 0;
+  all_grids->levels[level]->restriction[restrictionType].num_sends           = 0;
+  all_grids->levels[level]->restriction[restrictionType].recv_ranks          = NULL;
+  all_grids->levels[level]->restriction[restrictionType].send_ranks          = NULL;
+  all_grids->levels[level]->restriction[restrictionType].recv_sizes          = NULL;
+  all_grids->levels[level]->restriction[restrictionType].send_sizes          = NULL;
+  all_grids->levels[level]->restriction[restrictionType].recv_buffers        = NULL;
+  all_grids->levels[level]->restriction[restrictionType].send_buffers        = NULL;
+  all_grids->levels[level]->restriction[restrictionType].blocks[0]           = NULL;
+  all_grids->levels[level]->restriction[restrictionType].blocks[1]           = NULL;
+  all_grids->levels[level]->restriction[restrictionType].blocks[2]           = NULL;
+  all_grids->levels[level]->restriction[restrictionType].allocated_blocks[0] = 0;
+  all_grids->levels[level]->restriction[restrictionType].allocated_blocks[1] = 0;
+  all_grids->levels[level]->restriction[restrictionType].allocated_blocks[2] = 0;
+  all_grids->levels[level]->restriction[restrictionType].num_blocks[0]       = 0; // number of unpack/insert operations  = number of boxes on level+1 that I don't own and restrict to 
+  all_grids->levels[level]->restriction[restrictionType].num_blocks[1]       = 0; // number of unpack/insert operations  = number of boxes on level+1 that I own and restrict to
+  all_grids->levels[level]->restriction[restrictionType].num_blocks[2]       = 0; // number of unpack/insert operations  = number of boxes on level-1 that I don't own that restrict to me
+  #ifdef USE_MPI
+  all_grids->levels[level]->restriction[restrictionType].requests            = NULL;
+  all_grids->levels[level]->restriction[restrictionType].status              = NULL;
+  #endif
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // construct pack, send, and local...
+  if( (level<all_grids->num_levels-1) && (all_grids->levels[level]->num_my_boxes>0) ){ // not bottom  *and*  I have boxes to send
+
+    // construct the list of coarsened boxes and neighboring ranks...
+    int numCoarseBoxes = (all_grids->levels[level]->boxes_in.i/all_grids->levels[level+1]->boxes_in.i)*
+                         (all_grids->levels[level]->boxes_in.j/all_grids->levels[level+1]->boxes_in.j)*
+                         (all_grids->levels[level]->boxes_in.k/all_grids->levels[level+1]->boxes_in.k)*
+                          all_grids->levels[level]->num_my_boxes;
+        int *coarseRanks = (    int*)malloc(numCoarseBoxes*sizeof(    int)); // high water mark (assumes every neighboring box is a different process)
+    RP_type *coarseBoxes = (RP_type*)malloc(numCoarseBoxes*sizeof(RP_type)); 
+        numCoarseBoxes       = 0;
+    int numCoarseBoxesLocal  = 0;
+    int numCoarseBoxesRemote = 0;
+    int fineBox;
+    for(fineBox=0;fineBox<all_grids->levels[level]->num_my_boxes;fineBox++){
+      int   fineBoxID = all_grids->levels[level]->my_boxes[fineBox].global_box_id;
+      int   fineBox_i = all_grids->levels[level]->my_boxes[fineBox].low.i / all_grids->levels[level]->box_dim;
+      int   fineBox_j = all_grids->levels[level]->my_boxes[fineBox].low.j / all_grids->levels[level]->box_dim;
+      int   fineBox_k = all_grids->levels[level]->my_boxes[fineBox].low.k / all_grids->levels[level]->box_dim;
+      int coarseBox_i = fineBox_i*all_grids->levels[level+1]->boxes_in.i/all_grids->levels[level]->boxes_in.i;
+      int coarseBox_j = fineBox_j*all_grids->levels[level+1]->boxes_in.j/all_grids->levels[level]->boxes_in.j;
+      int coarseBox_k = fineBox_k*all_grids->levels[level+1]->boxes_in.k/all_grids->levels[level]->boxes_in.k;
+      int coarseBoxID =  coarseBox_i + coarseBox_j*all_grids->levels[level+1]->boxes_in.i + coarseBox_k*all_grids->levels[level+1]->boxes_in.i*all_grids->levels[level+1]->boxes_in.j;
+      int coarseBox   = -1;int c;for(c=0;c<all_grids->levels[level+1]->num_my_boxes;c++)if( all_grids->levels[level+1]->my_boxes[c].global_box_id == coarseBoxID )coarseBox=c; // try and find the coarseBox index of a box with global_box_id == coaseBoxID
+      coarseBoxes[numCoarseBoxes].sendRank  = all_grids->levels[level  ]->rank_of_box[  fineBoxID];
+      coarseBoxes[numCoarseBoxes].sendBoxID = fineBoxID;
+      coarseBoxes[numCoarseBoxes].sendBox   = fineBox;
+      coarseBoxes[numCoarseBoxes].recvRank  = all_grids->levels[level+1]->rank_of_box[coarseBoxID];
+      coarseBoxes[numCoarseBoxes].recvBoxID = coarseBoxID;
+      coarseBoxes[numCoarseBoxes].recvBox   = coarseBox;  // -1 if off-node
+      coarseBoxes[numCoarseBoxes].i         = (all_grids->levels[level]->box_dim/2)*( fineBox_i % (all_grids->levels[level]->boxes_in.i/all_grids->levels[level+1]->boxes_in.i) );
+      coarseBoxes[numCoarseBoxes].j         = (all_grids->levels[level]->box_dim/2)*( fineBox_j % (all_grids->levels[level]->boxes_in.j/all_grids->levels[level+1]->boxes_in.j) );
+      coarseBoxes[numCoarseBoxes].k         = (all_grids->levels[level]->box_dim/2)*( fineBox_k % (all_grids->levels[level]->boxes_in.k/all_grids->levels[level+1]->boxes_in.k) );
+                  numCoarseBoxes++;
+      if(all_grids->levels[level]->my_rank != all_grids->levels[level+1]->rank_of_box[coarseBoxID]){
+        coarseRanks[numCoarseBoxesRemote++] = all_grids->levels[level+1]->rank_of_box[coarseBoxID];
+      }else{numCoarseBoxesLocal++;}
+    } // my (fine) boxes
+
+    // sort boxes by sendRank(==my rank) then by sendBoxID... ensures the sends and receive buffers are always sorted by sendBoxID...
+    qsort(coarseBoxes,numCoarseBoxes      ,sizeof(RP_type),qsortRP );
+    // sort the lists of neighboring ranks and remove duplicates...
+    qsort(coarseRanks,numCoarseBoxesRemote,sizeof(    int),qsortInt);
+    int numCoarseRanks=0;
+    int _rank=-1;int neighbor=0;
+    for(neighbor=0;neighbor<numCoarseBoxesRemote;neighbor++)if(coarseRanks[neighbor] != _rank){_rank=coarseRanks[neighbor];coarseRanks[numCoarseRanks++]=coarseRanks[neighbor];}
+
+    // allocate structures...
+    all_grids->levels[level]->restriction[restrictionType].num_sends     =                         numCoarseRanks;
+    all_grids->levels[level]->restriction[restrictionType].send_ranks    =            (int*)malloc(numCoarseRanks*sizeof(int));
+    all_grids->levels[level]->restriction[restrictionType].send_sizes    =            (int*)malloc(numCoarseRanks*sizeof(int));
+    all_grids->levels[level]->restriction[restrictionType].send_buffers  =        (double**)malloc(numCoarseRanks*sizeof(double*));
+    if(numCoarseRanks>0){
+    if(all_grids->levels[level]->restriction[restrictionType].send_ranks  ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->restriction[restrictionType].send_ranks\n",level);exit(0);}
+    if(all_grids->levels[level]->restriction[restrictionType].send_sizes  ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->restriction[restrictionType].send_sizes\n",level);exit(0);}
+    if(all_grids->levels[level]->restriction[restrictionType].send_buffers==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->restriction[restrictionType].send_buffers\n",level);exit(0);}
+    }
+
+    int elementSize;
+    int restrict_dim_i=-1;
+    int restrict_dim_j=-1;
+    int restrict_dim_k=-1;
+    switch(restrictionType){
+      case RESTRICT_CELL   : restrict_dim_i = (  all_grids->levels[level]->box_dim/2);
+                             restrict_dim_j = (  all_grids->levels[level]->box_dim/2);
+                             restrict_dim_k = (  all_grids->levels[level]->box_dim/2);break;
+      case RESTRICT_FACE_I : restrict_dim_i = (1+all_grids->levels[level]->box_dim/2);
+                             restrict_dim_j = (  all_grids->levels[level]->box_dim/2);
+                             restrict_dim_k = (  all_grids->levels[level]->box_dim/2);break;
+      case RESTRICT_FACE_J : restrict_dim_i = (  all_grids->levels[level]->box_dim/2);
+                             restrict_dim_j = (1+all_grids->levels[level]->box_dim/2);
+                             restrict_dim_k = (  all_grids->levels[level]->box_dim/2);break;
+      case RESTRICT_FACE_K : restrict_dim_i = (  all_grids->levels[level]->box_dim/2);
+                             restrict_dim_j = (  all_grids->levels[level]->box_dim/2);
+                             restrict_dim_k = (1+all_grids->levels[level]->box_dim/2);break;
+    }
+    elementSize = restrict_dim_i*restrict_dim_j*restrict_dim_k;
+   
+    double * all_send_buffers = (double*)malloc(numCoarseBoxesRemote*elementSize*sizeof(double));
+          if(numCoarseBoxesRemote*elementSize>0)
+          if(all_send_buffers==NULL){fprintf(stderr,"malloc failed - restriction/all_send_buffers\n");exit(0);}
+                      memset(all_send_buffers,0,numCoarseBoxesRemote*elementSize*sizeof(double)); // DO NOT DELETE... you must initialize to 0 to avoid getting something like 0.0*NaN and corrupting the solve
+
+    // for each neighbor, construct the pack list and allocate the MPI send buffer... 
+    for(neighbor=0;neighbor<numCoarseRanks;neighbor++){
+      int coarseBox;
+      int offset = 0;
+      all_grids->levels[level]->restriction[restrictionType].send_buffers[neighbor] = all_send_buffers;
+      for(coarseBox=0;coarseBox<numCoarseBoxes;coarseBox++)if(coarseBoxes[coarseBox].recvRank==coarseRanks[neighbor]){
+        // restrict to MPI send buffer...
+        append_block_to_list( &(all_grids->levels[level]->restriction[restrictionType].blocks[0]),
+                              &(all_grids->levels[level]->restriction[restrictionType].allocated_blocks[0]),
+                              &(all_grids->levels[level]->restriction[restrictionType].num_blocks[0]),
+          /* dim.i         = */ restrict_dim_i, 
+          /* dim.j         = */ restrict_dim_j, 
+          /* dim.k         = */ restrict_dim_k, 
+          /* read.box      = */ coarseBoxes[coarseBox].sendBox,
+          /* read.ptr      = */ NULL,
+          /* read.i        = */ 0,
+          /* read.j        = */ 0,
+          /* read.k        = */ 0,
+          /* read.jStride  = */ all_grids->levels[level]->my_boxes[coarseBoxes[coarseBox].sendBox].jStride,
+          /* read.kStride  = */ all_grids->levels[level]->my_boxes[coarseBoxes[coarseBox].sendBox].kStride,
+          /* read.scale    = */ 2,
+          /* write.box     = */ -1,
+          /* write.ptr     = */ all_grids->levels[level]->restriction[restrictionType].send_buffers[neighbor],
+          /* write.i       = */ offset,
+          /* write.j       = */ 0,
+          /* write.k       = */ 0,
+          /* write.jStride = */ restrict_dim_i,
+          /* write.kStride = */ restrict_dim_i*restrict_dim_j, 
+          /* write.scale   = */ 1,
+          /* blockcopy_i   = */ BLOCKCOPY_TILE_I, // default
+          /* blockcopy_j   = */ BLOCKCOPY_TILE_J, // default
+          /* blockcopy_k   = */ BLOCKCOPY_TILE_K, // default
+          /* subtype       = */ 0
+        );
+        offset+=elementSize;
+      }
+      all_grids->levels[level]->restriction[restrictionType].send_ranks[neighbor] = coarseRanks[neighbor];
+      all_grids->levels[level]->restriction[restrictionType].send_sizes[neighbor] = offset;
+      all_send_buffers+=offset;
+    }
+    // construct the local restriction list...
+    {
+      int coarseBox;
+      for(coarseBox=0;coarseBox<numCoarseBoxes;coarseBox++)if(coarseBoxes[coarseBox].recvRank==all_grids->levels[level+1]->my_rank){
+        // restrict to local...
+        append_block_to_list( &(all_grids->levels[level]->restriction[restrictionType].blocks[1]),
+                              &(all_grids->levels[level]->restriction[restrictionType].allocated_blocks[1]),
+                              &(all_grids->levels[level]->restriction[restrictionType].num_blocks[1]),
+          /* dim.i         = */ restrict_dim_i, 
+          /* dim.j         = */ restrict_dim_j, 
+          /* dim.k         = */ restrict_dim_k, 
+          /* read.box      = */ coarseBoxes[coarseBox].sendBox,
+          /* read.ptr      = */ NULL,
+          /* read.i        = */ 0, 
+          /* read.j        = */ 0,
+          /* read.k        = */ 0,
+          /* read.jStride  = */ all_grids->levels[level]->my_boxes[coarseBoxes[coarseBox].sendBox].jStride,
+          /* read.kStride  = */ all_grids->levels[level]->my_boxes[coarseBoxes[coarseBox].sendBox].kStride,
+          /* read.scale    = */ 2,
+          /* write.box     = */ coarseBoxes[coarseBox].recvBox,
+          /* write.ptr     = */ NULL,
+          /* write.i       = */ coarseBoxes[coarseBox].i,
+          /* write.j       = */ coarseBoxes[coarseBox].j,
+          /* write.k       = */ coarseBoxes[coarseBox].k,
+          /* write.jStride = */ all_grids->levels[level+1]->my_boxes[coarseBoxes[coarseBox].recvBox].jStride,
+          /* write.kStride = */ all_grids->levels[level+1]->my_boxes[coarseBoxes[coarseBox].recvBox].kStride,
+          /* write.scale   = */ 1,
+          /* blockcopy_i   = */ BLOCKCOPY_TILE_I, // default
+          /* blockcopy_j   = */ BLOCKCOPY_TILE_J, // default
+          /* blockcopy_k   = */ BLOCKCOPY_TILE_K, // default
+          /* subtype       = */ 0
+        );
+      }
+    } // local to local
+
+    // free temporary storage...
+    free(coarseBoxes);
+    free(coarseRanks);
+  } // send/pack/local
+
+
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // construct recv and unpack...
+  if( (level>0) && (all_grids->levels[level]->num_my_boxes>0) ){ // not top  *and*  I have boxes to receive
+    // construct a list of fine boxes to be coarsened and sent to me...
+    int numFineBoxesMax = (all_grids->levels[level-1]->boxes_in.i/all_grids->levels[level]->boxes_in.i)*
+                          (all_grids->levels[level-1]->boxes_in.j/all_grids->levels[level]->boxes_in.j)*
+                          (all_grids->levels[level-1]->boxes_in.k/all_grids->levels[level]->boxes_in.k)*
+                                                                  all_grids->levels[level]->num_my_boxes;
+        int *fineRanks = (    int*)malloc(numFineBoxesMax*sizeof(    int)); // high water mark (assumes every neighboring box is a different process)
+    RP_type *fineBoxes = (RP_type*)malloc(numFineBoxesMax*sizeof(RP_type)); 
+    int numFineBoxesRemote = 0;
+    int coarseBox;
+    for(coarseBox=0;coarseBox<all_grids->levels[level]->num_my_boxes;coarseBox++){
+      int bi,bj,bk;
+      int   coarseBoxID = all_grids->levels[level]->my_boxes[coarseBox].global_box_id;
+      int   coarseBox_i = all_grids->levels[level]->my_boxes[coarseBox].low.i / all_grids->levels[level]->box_dim;
+      int   coarseBox_j = all_grids->levels[level]->my_boxes[coarseBox].low.j / all_grids->levels[level]->box_dim;
+      int   coarseBox_k = all_grids->levels[level]->my_boxes[coarseBox].low.k / all_grids->levels[level]->box_dim;
+      for(bk=0;bk<all_grids->levels[level-1]->boxes_in.k/all_grids->levels[level]->boxes_in.k;bk++){
+      for(bj=0;bj<all_grids->levels[level-1]->boxes_in.j/all_grids->levels[level]->boxes_in.j;bj++){
+      for(bi=0;bi<all_grids->levels[level-1]->boxes_in.i/all_grids->levels[level]->boxes_in.i;bi++){
+        int fineBox_i = (all_grids->levels[level-1]->boxes_in.i/all_grids->levels[level]->boxes_in.i)*coarseBox_i + bi;
+        int fineBox_j = (all_grids->levels[level-1]->boxes_in.j/all_grids->levels[level]->boxes_in.j)*coarseBox_j + bj;
+        int fineBox_k = (all_grids->levels[level-1]->boxes_in.k/all_grids->levels[level]->boxes_in.k)*coarseBox_k + bk;
+        int fineBoxID =  fineBox_i + fineBox_j*all_grids->levels[level-1]->boxes_in.i + fineBox_k*all_grids->levels[level-1]->boxes_in.i*all_grids->levels[level-1]->boxes_in.j;
+        if(all_grids->levels[level-1]->rank_of_box[fineBoxID] != all_grids->levels[level]->my_rank){
+          fineBoxes[numFineBoxesRemote].sendRank  = all_grids->levels[level-1]->rank_of_box[  fineBoxID];
+          fineBoxes[numFineBoxesRemote].sendBoxID = fineBoxID;
+          fineBoxes[numFineBoxesRemote].sendBox   = -1; // I don't know the off-node box index
+          fineBoxes[numFineBoxesRemote].recvRank  = all_grids->levels[level  ]->rank_of_box[coarseBoxID];
+          fineBoxes[numFineBoxesRemote].recvBoxID = coarseBoxID;
+          fineBoxes[numFineBoxesRemote].recvBox   = coarseBox;
+          fineBoxes[numFineBoxesRemote].i         = bi*all_grids->levels[level-1]->box_dim/2;
+          fineBoxes[numFineBoxesRemote].j         = bj*all_grids->levels[level-1]->box_dim/2;
+          fineBoxes[numFineBoxesRemote].k         = bk*all_grids->levels[level-1]->box_dim/2;
+          fineRanks[numFineBoxesRemote] = all_grids->levels[level-1]->rank_of_box[fineBoxID];
+                    numFineBoxesRemote++;
+        }
+      }}}
+    } // my (coarse) boxes
+    // sort boxes by sendRank(==my rank) then by sendBoxID... ensures the sends and receive buffers are always sorted by sendBoxID...
+    qsort(fineBoxes,numFineBoxesRemote,sizeof(RP_type),qsortRP );
+    // sort the lists of neighboring ranks and remove duplicates...
+    qsort(fineRanks,numFineBoxesRemote,sizeof(    int),qsortInt);
+    int numFineRanks=0;
+    int _rank=-1;int neighbor=0;
+    for(neighbor=0;neighbor<numFineBoxesRemote;neighbor++)if(fineRanks[neighbor] != _rank){_rank=fineRanks[neighbor];fineRanks[numFineRanks++]=fineRanks[neighbor];}
+
+    // allocate structures...
+    all_grids->levels[level]->restriction[restrictionType].num_recvs     =                         numFineRanks;
+    all_grids->levels[level]->restriction[restrictionType].recv_ranks    =            (int*)malloc(numFineRanks*sizeof(int));
+    all_grids->levels[level]->restriction[restrictionType].recv_sizes    =            (int*)malloc(numFineRanks*sizeof(int));
+    all_grids->levels[level]->restriction[restrictionType].recv_buffers  =        (double**)malloc(numFineRanks*sizeof(double*));
+    if(numFineRanks>0){
+    if(all_grids->levels[level]->restriction[restrictionType].recv_ranks  ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->restriction[restrictionType].recv_ranks  \n",level);exit(0);}
+    if(all_grids->levels[level]->restriction[restrictionType].recv_sizes  ==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->restriction[restrictionType].recv_sizes  \n",level);exit(0);}
+    if(all_grids->levels[level]->restriction[restrictionType].recv_buffers==NULL){fprintf(stderr,"malloc failed - all_grids->levels[%d]->restriction[restrictionType].recv_buffers\n",level);exit(0);}
+    }
+
+    int elementSize;
+    int restrict_dim_i=-1;
+    int restrict_dim_j=-1;
+    int restrict_dim_k=-1;
+    switch(restrictionType){
+      case RESTRICT_CELL   : restrict_dim_i = (  all_grids->levels[level-1]->box_dim/2);
+                             restrict_dim_j = (  all_grids->levels[level-1]->box_dim/2);
+                             restrict_dim_k = (  all_grids->levels[level-1]->box_dim/2);break;
+      case RESTRICT_FACE_I : restrict_dim_i = (1+all_grids->levels[level-1]->box_dim/2);
+                             restrict_dim_j = (  all_grids->levels[level-1]->box_dim/2);
+                             restrict_dim_k = (  all_grids->levels[level-1]->box_dim/2);break;
+      case RESTRICT_FACE_J : restrict_dim_i = (  all_grids->levels[level-1]->box_dim/2);
+                             restrict_dim_j = (1+all_grids->levels[level-1]->box_dim/2);
+                             restrict_dim_k = (  all_grids->levels[level-1]->box_dim/2);break;
+      case RESTRICT_FACE_K : restrict_dim_i = (  all_grids->levels[level-1]->box_dim/2);
+                             restrict_dim_j = (  all_grids->levels[level-1]->box_dim/2);
+                             restrict_dim_k = (1+all_grids->levels[level-1]->box_dim/2);break;
+    }
+    elementSize = restrict_dim_i*restrict_dim_j*restrict_dim_k;
+
+    double * all_recv_buffers = (double*)malloc(numFineBoxesRemote*elementSize*sizeof(double));
+          if(numFineBoxesRemote*elementSize>0)
+          if(all_recv_buffers==NULL){fprintf(stderr,"malloc failed - restriction/all_recv_buffers\n");exit(0);}
+                      memset(all_recv_buffers,0,numFineBoxesRemote*elementSize*sizeof(double)); // DO NOT DELETE... you must initialize to 0 to avoid getting something like 0.0*NaN and corrupting the solve
+    //printf("level=%d, rank=%2d, recv_buffers=%6d\n",level,all_grids->my_rank,numFineBoxesRemote*elementSize*sizeof(double));
+
+    // for each neighbor, construct the unpack list and allocate the MPI recv buffer... 
+    for(neighbor=0;neighbor<numFineRanks;neighbor++){
+      int fineBox;
+      int offset = 0;
+      all_grids->levels[level]->restriction[restrictionType].recv_buffers[neighbor] = all_recv_buffers;
+      for(fineBox=0;fineBox<numFineBoxesRemote;fineBox++)if(fineBoxes[fineBox].sendRank==fineRanks[neighbor]){
+        // unpack MPI recv buffer...
+        append_block_to_list( &(all_grids->levels[level]->restriction[restrictionType].blocks[2]),
+                              &(all_grids->levels[level]->restriction[restrictionType].allocated_blocks[2]),
+                              &(all_grids->levels[level]->restriction[restrictionType].num_blocks[2]),
+          /* dim.i         = */ restrict_dim_i, 
+          /* dim.j         = */ restrict_dim_j, 
+          /* dim.k         = */ restrict_dim_k, 
+          /* read.box      = */ -1,
+          /* read.ptr      = */ all_grids->levels[level]->restriction[restrictionType].recv_buffers[neighbor],
+          /* read.i        = */ offset,
+          /* read.j        = */ 0,
+          /* read.k        = */ 0,
+          /* read.jStride  = */ restrict_dim_i,
+          /* read.kStride  = */ restrict_dim_i*restrict_dim_j, 
+          /* read.scale    = */ 1,
+          /* write.box     = */ fineBoxes[fineBox].recvBox,
+          /* write.ptr     = */ NULL,
+          /* write.i       = */ fineBoxes[fineBox].i,
+          /* write.j       = */ fineBoxes[fineBox].j,
+          /* write.k       = */ fineBoxes[fineBox].k,
+          /* write.jStride = */ all_grids->levels[level]->my_boxes[fineBoxes[fineBox].recvBox].jStride,
+          /* write.kStride = */ all_grids->levels[level]->my_boxes[fineBoxes[fineBox].recvBox].kStride,
+          /* write.scale   = */ 1,
+          /* blockcopy_i   = */ BLOCKCOPY_TILE_I, // default
+          /* blockcopy_j   = */ BLOCKCOPY_TILE_J, // default
+          /* blockcopy_k   = */ BLOCKCOPY_TILE_K, // default
+          /* subtype       = */ 0
+        );
+        offset+=elementSize;
+      }
+      all_grids->levels[level]->restriction[restrictionType].recv_ranks[neighbor] = fineRanks[neighbor];
+      all_grids->levels[level]->restriction[restrictionType].recv_sizes[neighbor] = offset;
+      all_recv_buffers+=offset;
+    } // neighbor
+
+    // free temporary storage...
+    free(fineBoxes);
+    free(fineRanks);
+  } // recv/unpack
+
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  } // level loop
+
+
+  #ifdef USE_MPI
+  for(level=0;level<all_grids->num_levels;level++){
+    all_grids->levels[level]->restriction[restrictionType].requests = NULL;
+    all_grids->levels[level]->restriction[restrictionType].status   = NULL;
+    if(level<all_grids->num_levels-1){ // bottom never calls restriction()
+    // by convention, level_f allocates a combined array of requests for both level_f sends and level_c recvs...
+    int nMessages = all_grids->levels[level+1]->restriction[restrictionType].num_recvs + all_grids->levels[level]->restriction[restrictionType].num_sends;
+    all_grids->levels[level]->restriction[restrictionType].requests = (MPI_Request*)malloc(nMessages*sizeof(MPI_Request));
+    all_grids->levels[level]->restriction[restrictionType].status   = (MPI_Status *)malloc(nMessages*sizeof(MPI_Status ));
+    }
+  }
+  #endif
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// Given a fine grid input, build a hierarchy of MG levels in all_grids: level 0 simply points to fine_grid; all coarser levels are malloc'ed and created here.
+// Also builds the restriction/interpolation lists, rebuilds the operator on each coarse grid level (levels 1..num_levels-1), and
+// adds the extra iterative-solver vectors to the coarsest grid once here instead of on every call to the coarse grid solve.
+// a,b are passed through to rebuild_operator(); minCoarseGridDim bounds how small a coarsened dim_i may get; comm is passed to create_level() and MPI_Comm_split().
+// NOTE, this routine presumes the fine_grid domain is cubical... fine_grid->dim.i==fine_grid->dim.j==fine_grid->dim.k (only the i-dimension is examined)
+// NOTE, as this function is not timed, it has not been optimized for performance
+// NOTE, allocation failures print to stderr and call exit(0); the elapsed build time is accumulated into all_grids->timers.MGBuild
+void MGBuild(mg_type *all_grids, level_type *fine_grid, double a, double b, int minCoarseGridDim, const MPI_Comm comm){
+  int  maxLevels=100; // i.e. maximum problem size is (2^100)^3
+  int     nProcs[100]; // per-level plan (indexed by level): rank count handed to create_level()
+  int      dim_i[100]; // per-level plan: domain dimension in i (fine_grid->dim.i at level 0), halved at every coarsening
+  int boxes_in_i[100]; // per-level plan: boxes_in.i handed to create_level()
+  int    box_dim[100]; // per-level plan: box_dim handed to create_level()
+  int box_ghosts[100]; // per-level plan: box_ghosts, copied unchanged from the finer level
+  all_grids->my_rank = fine_grid->my_rank;
+  all_grids->timers.MGBuild = 0;
+  double _timeStartMGBuild = getTime();
+
+  // calculate how deep we can make the v-cycle...
+  int level=1;
+                             int coarse_dim = fine_grid->dim.i;
+//if(fine_grid->dim.j<coarse_dim)coarse_dim = fine_grid->dim.j;
+//if(fine_grid->dim.k<coarse_dim)coarse_dim = fine_grid->dim.k;
+  while( (coarse_dim>=2*minCoarseGridDim) && ((coarse_dim&0x1)==0) ){ // grid dimension is even and big enough...
+    level++;
+    coarse_dim = coarse_dim / 2;
+  }if(level<maxLevels)maxLevels=level; // levels[] below is sized by this depth; coarse_dim keeps its final value for the agglomeration tests further down
+
+      nProcs[0] = fine_grid->num_ranks;
+       dim_i[0] = fine_grid->dim.i;
+  boxes_in_i[0] = fine_grid->boxes_in.i;
+     box_dim[0] = fine_grid->box_dim;
+  box_ghosts[0] = fine_grid->box_ghosts;
+
+  // build the list of levels...
+  all_grids->levels = (level_type**)malloc(maxLevels*sizeof(level_type*));
+  if(all_grids->levels == NULL){fprintf(stderr,"malloc failed - MGBuild/all_grids->levels\n");exit(0);}
+  all_grids->num_levels=1;
+  all_grids->levels[0] = fine_grid;
+
+
+  // build a table to guide the construction of the v-cycle...
+  int doRestrict=1;if(maxLevels<2)doRestrict=0; // i.e. can't restrict if there is only one level !!!
+  #ifdef USE_UCYCLES
+  while(doRestrict){
+    level = all_grids->num_levels;
+    doRestrict=0;
+    if( (box_dim[level-1] % 2 == 0) ){
+          nProcs[level] =     nProcs[level-1];
+           dim_i[level] =      dim_i[level-1]/2;
+         box_dim[level] =    box_dim[level-1]/2;
+      boxes_in_i[level] = boxes_in_i[level-1];
+      box_ghosts[level] = box_ghosts[level-1];
+             doRestrict = 1;
+    }
+    if(box_dim[level] < box_ghosts[level])doRestrict=0; // NOTE(review): the [level] entries are only assigned when the branch above was taken; otherwise doRestrict is already 0 and these reads can only leave it 0
+    if(dim_i[level]<minCoarseGridDim)doRestrict=0;
+    if(doRestrict)all_grids->num_levels++;
+  }
+  #else // TRUE V-Cycle...
+  while(doRestrict){
+    level = all_grids->num_levels;
+    doRestrict=0;
+    int fine_box_dim    =    box_dim[level-1];
+    int fine_nProcs     =     nProcs[level-1];
+    int fine_dim_i      =      dim_i[level-1];
+    int fine_boxes_in_i = boxes_in_i[level-1];
+    if( (fine_box_dim % 2 == 0) && (fine_box_dim > MG_AGGLOMERATION_START) && ((fine_box_dim/2)>=stencil_get_radius()) ){ // Boxes are too big to agglomerate
+          nProcs[level] = fine_nProcs;
+           dim_i[level] = fine_dim_i/2;
+         box_dim[level] = fine_box_dim/2; // FIX, verify its not less than the stencil radius
+      boxes_in_i[level] = fine_boxes_in_i;
+      box_ghosts[level] = box_ghosts[level-1];
+             doRestrict = 1;
+    }else
+    if( (fine_boxes_in_i % 2 == 0) && ((fine_box_dim)>=stencil_get_radius()) ){ // 8:1 box agglomeration
+          nProcs[level] = fine_nProcs;
+           dim_i[level] = fine_dim_i/2;
+         box_dim[level] = fine_box_dim;
+      boxes_in_i[level] = fine_boxes_in_i/2;
+      box_ghosts[level] = box_ghosts[level-1];
+             doRestrict = 1;
+    }else
+    if( (coarse_dim != 1) && (fine_dim_i == 2*coarse_dim) && ((fine_dim_i/2)>=stencil_get_radius()) ){ // agglomerate everything
+          nProcs[level] = 1;
+           dim_i[level] = fine_dim_i/2;
+         box_dim[level] = fine_dim_i/2; // FIX, verify its not less than the stencil radius
+      boxes_in_i[level] = 1;
+      box_ghosts[level] = box_ghosts[level-1];
+             doRestrict = 1;
+    }else
+    if( (coarse_dim != 1) && (fine_dim_i == 4*coarse_dim) && ((fine_box_dim/2)>=stencil_get_radius()) ){ // restrict box dimension, and run on fewer ranks
+          nProcs[level] = coarse_dim<fine_nProcs ? coarse_dim : fine_nProcs;
+           dim_i[level] = fine_dim_i/2;
+         box_dim[level] = fine_box_dim/2; // FIX, verify its not less than the stencil radius
+      boxes_in_i[level] = fine_boxes_in_i;
+      box_ghosts[level] = box_ghosts[level-1];
+             doRestrict = 1;
+    }else
+    if( (coarse_dim != 1) && (fine_dim_i == 8*coarse_dim) && ((fine_box_dim/2)>=stencil_get_radius()) ){ // restrict box dimension, and run on fewer ranks
+          nProcs[level] = coarse_dim*coarse_dim<fine_nProcs ? coarse_dim*coarse_dim : fine_nProcs;
+           dim_i[level] = fine_dim_i/2;
+         box_dim[level] = fine_box_dim/2; // FIX, verify its not less than the stencil radius
+      boxes_in_i[level] = fine_boxes_in_i;
+      box_ghosts[level] = box_ghosts[level-1];
+             doRestrict = 1;
+    }else
+    if( (fine_box_dim % 2 == 0) && ((fine_box_dim/2)>=stencil_get_radius()) ){ // restrict box dimension, and run on the same number of ranks
+          nProcs[level] = fine_nProcs;
+           dim_i[level] = fine_dim_i/2;
+         box_dim[level] = fine_box_dim/2; // FIX, verify its not less than the stencil radius
+      boxes_in_i[level] = fine_boxes_in_i;
+      box_ghosts[level] = box_ghosts[level-1];
+             doRestrict = 1;
+    }
+    if(dim_i[level]<minCoarseGridDim)doRestrict=0; // NOTE(review): dim_i[level] is only assigned when one of the branches above fired; otherwise doRestrict is already 0 and this read can only leave it 0
+    if(doRestrict)all_grids->num_levels++;
+  }
+  #endif
+
+
+  // now build all the coarsened levels...
+  for(level=1;level<all_grids->num_levels;level++){
+    all_grids->levels[level] = (level_type*)malloc(sizeof(level_type)); // NOTE(review): MGDestroy passes this pointer to destroy_level() but never free()s it itself - confirm destroy_level releases the struct
+    if(all_grids->levels[level] == NULL){fprintf(stderr,"malloc failed - MGBuild/doRestrict\n");exit(0);}
+    create_level(all_grids->levels[level],boxes_in_i[level],box_dim[level],box_ghosts[level],all_grids->levels[level-1]->numVectors,all_grids->levels[level-1]->boundary_condition.type,all_grids->levels[level-1]->my_rank,nProcs[level], comm);
+    all_grids->levels[level]->h = 2.0*all_grids->levels[level-1]->h; // each coarser level gets twice the h of the level above it
+  }
+
+
+  // bottom solver (level = all_grids->num_levels-1) gets extra vectors...
+  create_vectors(all_grids->levels[all_grids->num_levels-1],all_grids->levels[all_grids->num_levels-1]->numVectors + IterativeSolver_NumVectors() );
+
+
+  // build the restriction and interpolation communicators...
+  if(all_grids->my_rank==0){fprintf(stdout,"\n  Building restriction and interpolation lists... ");fflush(stdout);}
+  build_restriction(all_grids,RESTRICT_CELL  ); // cell-centered
+  build_restriction(all_grids,RESTRICT_FACE_I); // face-centered, normal to i
+  build_restriction(all_grids,RESTRICT_FACE_J); // face-centered, normal to j
+  build_restriction(all_grids,RESTRICT_FACE_K); // face-centered, normal to k
+  build_interpolation(all_grids);
+  if(all_grids->my_rank==0){fprintf(stdout,"done\n");fflush(stdout);}
+
+
+  // build subcommunicators...
+  #ifdef USE_MPI
+  #ifdef USE_SUBCOMM
+  if(all_grids->my_rank==0){fprintf(stdout,"\n");}
+  for(level=1;level<all_grids->num_levels;level++){
+    double comm_split_start = MPI_Wtime();
+    if(all_grids->my_rank==0){fprintf(stdout,"  Building MPI subcommunicator for level %d... ",level);fflush(stdout);}
+    all_grids->levels[level]->active=0;
+    int ll;for(ll=level;ll<all_grids->num_levels;ll++)if(all_grids->levels[ll]->num_my_boxes>0)all_grids->levels[level]->active=1; // active if this rank owns boxes on this level or on any coarser level
+    MPI_Comm_split(comm, all_grids->levels[level]->active, all_grids->levels[level]->my_rank, &all_grids->levels[level]->MPI_COMM_ALLREDUCE); // color = active flag, key = my_rank
+    double comm_split_end = MPI_Wtime();
+    double comm_split_time_send = comm_split_end-comm_split_start;
+    double comm_split_time = 0;
+    MPI_Allreduce(&comm_split_time_send,&comm_split_time,1,MPI_DOUBLE,MPI_MAX,all_grids->levels[level]->MPI_COMM_ALLREDUCE); // slowest split time within this rank's new subcommunicator
+    if(all_grids->my_rank==0){fprintf(stdout,"done (%0.6f seconds)\n",comm_split_time);fflush(stdout);}
+  }
+  #endif
+  #endif
+
+
+  // rebuild various coefficients for the operator... must occur after build_restriction !!!
+  if(all_grids->my_rank==0){fprintf(stdout,"\n");}
+  for(level=1;level<all_grids->num_levels;level++){
+    rebuild_operator(all_grids->levels[level],(level>0)?all_grids->levels[level-1]:NULL,a,b); // level starts at 1 here, so the second argument is always levels[level-1]
+  }
+  if(all_grids->my_rank==0){fprintf(stdout,"\n");}
+
+
+  // quick tests for Poisson, Neumann, etc...
+  for(level=0;level<all_grids->num_levels;level++){
+    all_grids->levels[level]->must_subtract_mean = 0;
+    int alpha_is_zero = (dot(all_grids->levels[level],VECTOR_ALPHA,VECTOR_ALPHA) == 0.0); // exact comparison: true only when dot() returns exactly 0.0
+    // For Poisson with Periodic Boundary Conditions, by convention we assume the solution sums to zero.  Eliminate any constants from the solution by subtracting the mean.
+    if( (all_grids->levels[level]->boundary_condition.type==BC_PERIODIC) && ((a==0) || (alpha_is_zero==1)) )all_grids->levels[level]->must_subtract_mean = 1;
+  }
+
+  
+  all_grids->timers.MGBuild += (double)(getTime()-_timeStartMGBuild);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// deallocate the memory created in the MG hierarchy by MGBuild: the restriction/interpolation lists of every level, levels 1..num_levels-1 (via destroy_level), and the levels[] array
+// WARNING, the restriction/interpolation lists attached to the fine_grid (level 0) are freed as well, although level 0 itself is not passed to destroy_level (FIX?)
+void MGDestroy(mg_type *all_grids){
+  int level;
+  int i;
+
+  #ifdef USE_MPI
+  #ifdef USE_SUBCOMM
+  // only MGBuild creates subcommunicators (level_create assigns)
+  for(level=all_grids->num_levels-1;level>0;level--){
+    if(all_grids->levels[level]->MPI_COMM_ALLREDUCE != MPI_COMM_WORLD) // never free the predefined world communicator
+    MPI_Comm_free(&all_grids->levels[level]->MPI_COMM_ALLREDUCE);
+  }
+  #endif
+  #endif
+
+  if(all_grids->my_rank==0){fprintf(stdout,"attempting to free the restriction and interpolation lists... ");fflush(stdout);}
+  for(level=all_grids->num_levels-1;level>=0;level--){ // includes level 0 (the caller's fine grid)
+    // destroy restriction mini program created by MGBuild...
+    for(i=0;i<4;i++){ // presumably one entry per RESTRICT_* type; MGBuild calls build_restriction four times - confirm the enum values are 0..3
+      if(all_grids->levels[level]->restriction[i].num_recvs>0){ // NOTE(review): when num_recvs==0 the zero-count malloc results from build_restriction (possibly non-NULL) are not freed here
+      //for(j=0;j<all_grids->levels[level]->restriction[i].num_recvs;j++)if(all_grids->levels[level]->restriction[i].recv_buffers[j])free(all_grids->levels[level]->restriction[i].recv_buffers[j]);
+      if(all_grids->levels[level]->restriction[i].recv_buffers[0])free(all_grids->levels[level]->restriction[i].recv_buffers[0]); // allocated in bulk
+      if(all_grids->levels[level]->restriction[i].recv_buffers   )free(all_grids->levels[level]->restriction[i].recv_buffers   );
+      if(all_grids->levels[level]->restriction[i].recv_ranks     )free(all_grids->levels[level]->restriction[i].recv_ranks     );
+      if(all_grids->levels[level]->restriction[i].recv_sizes     )free(all_grids->levels[level]->restriction[i].recv_sizes     );
+      }
+      if(all_grids->levels[level]->restriction[i].num_sends>0){
+      //for(j=0;j<all_grids->levels[level]->restriction[i].num_sends;j++)if(all_grids->levels[level]->restriction[i].send_buffers[j])free(all_grids->levels[level]->restriction[i].send_buffers[j]);
+      if(all_grids->levels[level]->restriction[i].send_buffers[0])free(all_grids->levels[level]->restriction[i].send_buffers[0]); // allocated in bulk
+      if(all_grids->levels[level]->restriction[i].send_buffers   )free(all_grids->levels[level]->restriction[i].send_buffers   );
+      if(all_grids->levels[level]->restriction[i].send_ranks     )free(all_grids->levels[level]->restriction[i].send_ranks     );
+      if(all_grids->levels[level]->restriction[i].send_sizes     )free(all_grids->levels[level]->restriction[i].send_sizes     );
+      }
+      if(all_grids->levels[level]->restriction[i].blocks[0]      )free(all_grids->levels[level]->restriction[i].blocks[0]      ); // blocks[0]: pack-to-send-buffer list
+      if(all_grids->levels[level]->restriction[i].blocks[1]      )free(all_grids->levels[level]->restriction[i].blocks[1]      ); // blocks[1]: local-to-local list
+      if(all_grids->levels[level]->restriction[i].blocks[2]      )free(all_grids->levels[level]->restriction[i].blocks[2]      ); // blocks[2]: unpack-from-recv-buffer list
+      #ifdef USE_MPI
+      if(all_grids->levels[level]->restriction[i].requests       )free(all_grids->levels[level]->restriction[i].requests       );
+      if(all_grids->levels[level]->restriction[i].status         )free(all_grids->levels[level]->restriction[i].status         );
+      #endif
+    }
+
+    // destroy interpolation mini program created by MGBuild...
+    if(all_grids->levels[level]->interpolation.num_recvs>0){
+    //for(j=0;j<all_grids->levels[level]->interpolation.num_recvs;j++)if(all_grids->levels[level]->interpolation.recv_buffers[j])free(all_grids->levels[level]->interpolation.recv_buffers[j]);
+    if(all_grids->levels[level]->interpolation.recv_buffers[0])free(all_grids->levels[level]->interpolation.recv_buffers[0]); // allocated in bulk
+    if(all_grids->levels[level]->interpolation.recv_buffers   )free(all_grids->levels[level]->interpolation.recv_buffers   );
+    if(all_grids->levels[level]->interpolation.recv_ranks     )free(all_grids->levels[level]->interpolation.recv_ranks     );
+    if(all_grids->levels[level]->interpolation.recv_sizes     )free(all_grids->levels[level]->interpolation.recv_sizes     );
+    }
+    if(all_grids->levels[level]->interpolation.num_sends>0){
+    //for(j=0;j<all_grids->levels[level]->interpolation.num_sends;j++)if(all_grids->levels[level]->interpolation.send_buffers[j])free(all_grids->levels[level]->interpolation.send_buffers[j]);
+    if(all_grids->levels[level]->interpolation.send_buffers[0])free(all_grids->levels[level]->interpolation.send_buffers[0]); // allocated in bulk
+    if(all_grids->levels[level]->interpolation.send_buffers   )free(all_grids->levels[level]->interpolation.send_buffers   );
+    if(all_grids->levels[level]->interpolation.send_ranks     )free(all_grids->levels[level]->interpolation.send_ranks     );
+    if(all_grids->levels[level]->interpolation.send_sizes     )free(all_grids->levels[level]->interpolation.send_sizes     );
+    }
+    if(all_grids->levels[level]->interpolation.blocks[0]      )free(all_grids->levels[level]->interpolation.blocks[0]      );
+    if(all_grids->levels[level]->interpolation.blocks[1]      )free(all_grids->levels[level]->interpolation.blocks[1]      );
+    if(all_grids->levels[level]->interpolation.blocks[2]      )free(all_grids->levels[level]->interpolation.blocks[2]      );
+    #ifdef USE_MPI
+    if(all_grids->levels[level]->interpolation.requests       )free(all_grids->levels[level]->interpolation.requests       );
+    if(all_grids->levels[level]->interpolation.status         )free(all_grids->levels[level]->interpolation.status         );
+    #endif
+
+  }
+  if(all_grids->my_rank==0){fprintf(stdout,"done\n");}
+
+  // now destroy the level itself (but don't destroy level 0 as it was not created by MGBuild)
+  for(level=all_grids->num_levels-1;level>0;level--){
+    destroy_level(all_grids->levels[level]); // NOTE(review): the level_type struct malloc'ed in MGBuild is not free()d here - confirm destroy_level does it
+  }
+  if(all_grids->levels)free(all_grids->levels); // the array of level pointers allocated in MGBuild
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// perform a Richardson error analysis to infer the order of the operator/solver; rank 0 prints h, the error estimate, and the estimated order to stdout
+void richardson_error(mg_type *all_grids, int levelh, int u_id){
+  // in FV...
+  // +-------+   +---+---+   +-------+   +-------+
+  // |       |   | a | b |   |       |   |a+b+c+d|
+  // |  u^2h | - +---+---+ = |  u^2h | - |  ---  |
+  // |       |   | c | d |   |       |   |   4   |
+  // +-------+   +---+---+   +-------+   +-------+
+  // NOTE: indexes levels[levelh+1] and levels[levelh+2] without a bounds check, and overwrites VECTOR_TEMP on both; u_id is read on levelh, levelh+1 and levelh+2
+  restriction(all_grids->levels[levelh+1],VECTOR_TEMP,all_grids->levels[levelh  ],u_id,RESTRICT_CELL); // temp^2h = R u^h
+  restriction(all_grids->levels[levelh+2],VECTOR_TEMP,all_grids->levels[levelh+1],u_id,RESTRICT_CELL); // temp^4h = R u^2h
+  add_vectors(all_grids->levels[levelh+1],VECTOR_TEMP,1.0,u_id,-1.0,VECTOR_TEMP);                      // temp^2h = u^2h - temp^2h = u^2h - R u^h
+  add_vectors(all_grids->levels[levelh+2],VECTOR_TEMP,1.0,u_id,-1.0,VECTOR_TEMP);                      // temp^4h = u^4h - temp^4h = u^4h - R u^2h
+  double norm_of_u2h_minus_uh  = norm(all_grids->levels[levelh+1],VECTOR_TEMP); // || u^2h - R u^h  ||max
+  double norm_of_u4h_minus_u2h = norm(all_grids->levels[levelh+2],VECTOR_TEMP); // || u^4h - R u^2h ||max
+  // estimate the error^h using ||u^2h - R u^h||
+  if(all_grids->my_rank==0){fprintf(stdout,"  h=%0.15e  ||error||=%0.15e\n",all_grids->levels[levelh]->h,norm_of_u2h_minus_uh);fflush(stdout);}
+  // log( ||u^4h - R u^2h|| / ||u^2h - R u^h|| ) / log(2) is an estimate of the order of the method (e.g. 4th order)
+  if(all_grids->my_rank==0){fprintf(stdout,"  order=%0.3f\n",log(norm_of_u4h_minus_u2h / norm_of_u2h_minus_uh) / log(2) );fflush(stdout);} // NOTE(review): prints inf/NaN if either norm is exactly 0
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void MGVCycle(mg_type *all_grids, int e_id, int R_id, double a, double b, int level){ // one recursive V-cycle from 'level' down to the coarsest level and back; e_id/R_id are the vector ids used on every level
+  if(!all_grids->levels[level]->active)return; // nothing to do when this level is flagged inactive
+  double _LevelStart;
+
+  // bottom solve...
+  if(level==all_grids->num_levels-1){ // coarsest level terminates the recursion
+    double _timeBottomStart = getTime();
+    IterativeSolver(all_grids->levels[level],e_id,R_id,a,b,MG_DEFAULT_BOTTOM_NORM);
+    all_grids->levels[level]->timers.Total += (double)(getTime()-_timeBottomStart);
+    return;
+  }
+
+  // down...
+  _LevelStart = getTime();
+       smooth(all_grids->levels[level  ],e_id,R_id,a,b);                                         // smooth on this level before restricting
+     residual(all_grids->levels[level  ],VECTOR_TEMP,e_id,R_id,a,b);                             // residual is written to VECTOR_TEMP on this level
+  restriction(all_grids->levels[level+1],R_id,all_grids->levels[level],VECTOR_TEMP,RESTRICT_CELL); // coarse R_id = R(fine VECTOR_TEMP)
+  zero_vector(all_grids->levels[level+1],e_id);                                                  // coarse e_id starts from zero
+  all_grids->levels[level]->timers.Total += (double)(getTime()-_LevelStart);
+
+  // recursion...
+  MGVCycle(all_grids,e_id,R_id,a,b,level+1); // the recursive call is outside both timed regions of this level
+
+  // up...
+  _LevelStart = getTime();
+  interpolation_vcycle(all_grids->levels[level  ],e_id,1.0,all_grids->levels[level+1],e_id); // presumably fine e_id = 1.0*fine e_id + P(coarse e_id) - confirm against interpolation_vcycle
+                smooth(all_grids->levels[level  ],e_id,R_id,a,b);                          // smooth again after the coarse contribution has been applied
+
+  all_grids->levels[level]->timers.Total += (double)(getTime()-_LevelStart);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void MGSolve(mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double dtol, double rtol){
+  // solves Au=f on level 'onLevel' with repeated V-cycles: u_id receives the solution (it is zeroed below, so no initial guess is used), F_id is the right-hand side; iteration stops after maxVCycles or once the rtol/dtol tests at the bottom of the loop pass
+  all_grids->MGSolves_performed++; // counted before the active check, i.e. also when the level is inactive
+  if(!all_grids->levels[onLevel]->active)return;
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  int e_id = u_id; // __u FIX  (the V-cycles operate directly on the solution vector)
+  int R_id = VECTOR_F_MINUS_AV; // right-hand side vector handed to MGVCycle
+  int v;
+  int maxVCycles = 20; // hard cap on the number of V-cycles
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  #ifdef _OPENMP
+  double MG_Start_Time = omp_get_wtime();
+  #elif USE_MPI
+  double MG_Start_Time = MPI_Wtime();
+  #endif
+  if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"MGSolve... ");} // progress output comes from rank 0 only
+  double _timeStartMGSolve = getTime();
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // calculate norm of f for convergence criteria...
+  double norm_of_F     = 1.0; // stays 1.0 unless rtol>0
+  double norm_of_DinvF = 1.0; // stays 1.0 unless dtol>0
+  if(dtol>0){
+    mul_vectors(all_grids->levels[onLevel],VECTOR_TEMP,1.0,F_id,VECTOR_DINV); // D^{-1}F
+    norm_of_DinvF = norm(all_grids->levels[onLevel],VECTOR_TEMP);		// ||D^{-1}F||
+  }
+  if(rtol>0)norm_of_F = norm(all_grids->levels[onLevel],F_id);		// ||F||
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // make initial guess for e (=0) and setup the RHS
+   zero_vector(all_grids->levels[onLevel],e_id);                  // e = 0 (e_id aliases u_id, so this also clears u)
+  scale_vector(all_grids->levels[onLevel],R_id,1.0,F_id);         // R_id = F_id
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // now do v-cycles to calculate the correction...
+  for(v=0;v<maxVCycles;v++){   
+    int level = onLevel;
+    all_grids->levels[level]->vcycles_from_this_level++;
+
+    // do the v-cycle...
+    MGVCycle(all_grids,e_id,R_id,a,b,level);
+
+    // now calculate the norm of the residual...
+    double _timeStart = getTime();
+    if(all_grids->levels[level]->must_subtract_mean == 1){ // flagged levels get the mean of e removed after every cycle
+      double average_value_of_e = mean(all_grids->levels[level],e_id);
+      shift_vector(all_grids->levels[level],e_id,e_id,-average_value_of_e);
+    }
+    residual(all_grids->levels[level],VECTOR_TEMP,e_id,F_id,a,b); // residual is taken against the caller's F_id, not R_id
+    if(dtol>0)mul_vectors(all_grids->levels[level],VECTOR_TEMP,1.0,VECTOR_TEMP,VECTOR_DINV); //  Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria...
+    double norm_of_residual = norm(all_grids->levels[level],VECTOR_TEMP);
+    double _timeNorm = getTime();
+    all_grids->levels[level]->timers.Total += (double)(_timeNorm-_timeStart);
+    if(all_grids->levels[level]->my_rank==0){
+      double rel = 0.0;
+      if(rtol>0)rel = norm_of_residual/norm_of_F;
+           else rel = norm_of_residual/norm_of_DinvF;
+      if(   v>0){fprintf(stdout,"\n           v-cycle=%2d  norm=%1.15e  rel=%1.15e  ",v+1,norm_of_residual,rel);}
+            else{fprintf(stdout,             "v-cycle=%2d  norm=%1.15e  rel=%1.15e  ",v+1,norm_of_residual,rel);}
+    }
+    if(norm_of_residual/norm_of_F < rtol)break; // relative test (norm_of_F is 1.0 when rtol<=0).  NOTE(review): if dtol>0 and rtol>0 are both given, the D^{-1}-scaled residual is compared against the unscaled ||F|| -- confirm this is intended
+    if(norm_of_residual           < dtol)break; // absolute test on the (D^{-1}-scaled when dtol>0) residual norm
+  } // maxVCycles
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  all_grids->timers.MGSolve += (double)(getTime()-_timeStartMGSolve);
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  #ifdef _OPENMP
+  if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done (%f seconds)\n",omp_get_wtime()-MG_Start_Time);} // used to monitor variability in individual solve times
+  #elif USE_MPI
+  if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done (%f seconds)\n",MPI_Wtime()-MG_Start_Time);} // used to monitor variability in individual solve times
+  #else
+  if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done\n");}
+  #endif
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void FMGSolve(mg_type *all_grids, int onLevel, int u_id, int F_id, double a, double b, double dtol, double rtol){
+  // Full-multigrid (F-cycle) solve on level 'onLevel'; the result is left in u_id and F_id is the right-hand side.  The variant is chosen at compile time: with UNLIMIT_FMG_FCYCLES up to maxFCycles F-cycles are run on a separate correction vector (VECTOR_E); otherwise one F-cycle is run in place on u_id, followed by up to maxVCycles V-cycles.
+  #ifdef UNLIMIT_FMG_FCYCLES
+
+  all_grids->MGSolves_performed++; // counted before the active check
+  if(!all_grids->levels[onLevel]->active)return;
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  int maxFCycles=20; // hard cap on the number of F-cycles
+  int f;
+  int level;
+  int e_id = VECTOR_E; // correction vector; it is added into u_id after every F-cycle
+  int R_id = VECTOR_F_MINUS_AV; // right-hand side vector used throughout the cycles
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  #ifdef _OPENMP
+  double FMG_Start_Time = omp_get_wtime();
+  #elif USE_MPI
+  double FMG_Start_Time = MPI_Wtime();
+  #endif
+  if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"FMGSolve... ");}
+  double _timeStartMGSolve = getTime();
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // calculate norm of f...
+  double _LevelStart = getTime();
+  double norm_of_F = norm(all_grids->levels[onLevel],F_id);                     // ||F||
+  // NOTE(review): dtol is not referenced anywhere in this UNLIMIT_FMG_FCYCLES variant; only the rtol test is applied
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // initialize the RHS for the f-cycle to f...
+  scale_vector(all_grids->levels[onLevel],R_id,1.0,F_id);              // R_id = F-Au = F-0 = F_id
+  all_grids->levels[onLevel]->timers.Total += (double)(getTime()-_LevelStart);
+  // NOTE(review): the "F-0" above assumes u_id starts at zero, but u_id is not zeroed in this variant before it is updated below -- confirm callers pass a zeroed u_id
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // iterate on f-cycles...
+  for(f=0;f<maxFCycles;f++){
+
+    //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // restrict RHS to bottom (coarsest grids)
+    for(level=onLevel;level<(all_grids->num_levels-1);level++){
+      double _LevelStart = getTime(); // NOTE: shadows the function-scope _LevelStart
+      restriction(all_grids->levels[level+1],R_id,all_grids->levels[level],R_id,RESTRICT_CELL);
+      all_grids->levels[level]->timers.Total += (double)(getTime()-_LevelStart);
+    }
+
+
+    //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // solve coarsest grid...
+      double _timeBottomStart = getTime();
+      level = all_grids->num_levels-1;
+      if(level>onLevel)zero_vector(all_grids->levels[level],e_id);//else use whatever was the initial guess
+      IterativeSolver(all_grids->levels[level],e_id,R_id,a,b,MG_DEFAULT_BOTTOM_NORM);  // -1 == exact solution
+      all_grids->levels[level]->timers.Total += (double)(getTime()-_timeBottomStart);
+
+
+    //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // now do the F-cycle proper...
+    for(level=all_grids->num_levels-2;level>=onLevel;level--){ // walk from the second-to-last level back up to onLevel
+      // high-order interpolation
+      _LevelStart = getTime();
+      interpolation_fcycle(all_grids->levels[level],e_id,0.0,all_grids->levels[level+1],e_id); // prescale 0.0: presumably overwrites this level's e_id with the interpolated e_id from level+1 -- TODO confirm in the interpolation routine
+      all_grids->levels[level]->timers.Total += (double)(getTime()-_LevelStart);
+
+      // v-cycle
+      all_grids->levels[level]->vcycles_from_this_level++;
+      MGVCycle(all_grids,e_id,R_id,a,b,level);
+    }
+
+    // correct current solution and calculate residual (new RHS)...
+    _LevelStart = getTime();
+    add_vectors(all_grids->levels[onLevel],u_id,1.0,u_id,1.0,e_id ); // u = u + e
+    if(all_grids->levels[onLevel]->must_subtract_mean == 1){
+      double average_value_of_u = mean(all_grids->levels[onLevel],u_id);
+      shift_vector(all_grids->levels[onLevel],u_id,u_id,-average_value_of_u);
+    }
+    residual(all_grids->levels[onLevel],R_id,u_id,F_id,a,b); // R_id = F - A(u): the right-hand side of the next F-cycle
+    double norm_of_residual = norm(all_grids->levels[onLevel],R_id);
+    all_grids->levels[onLevel]->timers.Total += (double)(getTime()-_LevelStart);
+
+    // test convergence...
+    if(all_grids->levels[onLevel]->my_rank==0){
+      double rel = 0.0;
+      rel = norm_of_residual/norm_of_F;
+      if(f>0){fprintf(stdout,"\n            f-cycle=%2d  norm=%1.15e  rel=%1.15e  ",f,norm_of_residual,rel);} // the cycle index is printed 0-based here (f), unlike the v-cycle messages which print v+1
+         else{fprintf(stdout,              "f-cycle=%2d  norm=%1.15e  rel=%1.15e  ",f,norm_of_residual,rel);}
+    }
+    if(norm_of_residual/norm_of_F < rtol)break;
+
+  } // F-cycle
+
+  #else
+
+  all_grids->MGSolves_performed++; // counted before the active check
+  if(!all_grids->levels[onLevel]->active)return;
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  #ifdef UNLIMIT_FMG_VCYCLES
+  int maxVCycles=20;
+  #else
+  int maxVCycles=0;
+  #endif
+  int v; // with maxVCycles==0 the post-F loop below runs exactly once (v==-1) and only evaluates the residual
+  int level;
+  int e_id = u_id; // the F-cycle and V-cycles operate directly on the solution vector
+  int R_id = VECTOR_F_MINUS_AV; // right-hand side vector used throughout the cycles
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  #ifdef _OPENMP
+  double FMG_Start_Time = omp_get_wtime();
+  #elif USE_MPI
+  double FMG_Start_Time = MPI_Wtime();
+  #endif
+  if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"FMGSolve... ");}
+  double _timeStartMGSolve = getTime();
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // calculate norm of f...
+  double _LevelStart = getTime();
+  double norm_of_F     = 1.0; // stays 1.0 unless rtol>0
+  double norm_of_DinvF = 1.0; // stays 1.0 unless dtol>0
+  if(dtol>0){
+    mul_vectors(all_grids->levels[onLevel],VECTOR_TEMP,1.0,F_id,VECTOR_DINV);	// D^{-1}F
+    norm_of_DinvF = norm(all_grids->levels[onLevel],VECTOR_TEMP);		// ||D^{-1}F||
+  }
+  if(rtol>0)norm_of_F = norm(all_grids->levels[onLevel],F_id);			// ||F||
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // initialize the RHS for the f-cycle to f...
+  scale_vector(all_grids->levels[onLevel],R_id,1.0,F_id);              // R_id = F_id
+  all_grids->levels[onLevel]->timers.Total += (double)(getTime()-_LevelStart);
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // restrict RHS to bottom (coarsest grids)
+  for(level=onLevel;level<(all_grids->num_levels-1);level++){
+    double _LevelStart = getTime(); // NOTE: shadows the function-scope _LevelStart
+    restriction(all_grids->levels[level+1],R_id,all_grids->levels[level],R_id,RESTRICT_CELL);
+    all_grids->levels[level]->timers.Total += (double)(getTime()-_LevelStart);
+  }
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // solve coarsest grid...
+    double _timeBottomStart = getTime();
+    level = all_grids->num_levels-1;
+    if(level>onLevel)zero_vector(all_grids->levels[level],e_id);//else use whatever was the initial guess
+    IterativeSolver(all_grids->levels[level],e_id,R_id,a,b,MG_DEFAULT_BOTTOM_NORM);  // -1 == exact solution
+    all_grids->levels[level]->timers.Total += (double)(getTime()-_timeBottomStart);
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // now do the F-cycle proper...
+  for(level=all_grids->num_levels-2;level>=onLevel;level--){ // walk from the second-to-last level back up to onLevel
+    // high-order interpolation
+    _LevelStart = getTime();
+    interpolation_fcycle(all_grids->levels[level],e_id,0.0,all_grids->levels[level+1],e_id); // prescale 0.0: presumably overwrites this level's e_id with the interpolated e_id from level+1 -- TODO confirm in the interpolation routine
+    all_grids->levels[level]->timers.Total += (double)(getTime()-_LevelStart);
+
+    // v-cycle
+    all_grids->levels[level]->vcycles_from_this_level++;
+    MGVCycle(all_grids,e_id,R_id,a,b,level);
+  }
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // now do the post-F V-cycles
+  for(v=-1;v<maxVCycles;v++){ // the v==-1 pass performs no V-cycle: it only measures the residual left by the F-cycle
+    int level = onLevel; // NOTE: shadows the function-scope 'level'
+
+    // do the v-cycle...
+    if(v>=0){
+    all_grids->levels[level]->vcycles_from_this_level++;
+    MGVCycle(all_grids,e_id,R_id,a,b,level);
+    }
+
+    // now calculate the norm of the residual...
+    double _timeStart = getTime();
+    if(all_grids->levels[level]->must_subtract_mean == 1){
+      double average_value_of_e = mean(all_grids->levels[level],e_id);
+      shift_vector(all_grids->levels[level],e_id,e_id,-average_value_of_e);
+    }
+    residual(all_grids->levels[level],VECTOR_TEMP,e_id,F_id,a,b); // residual is taken against the caller's F_id, not R_id
+    if(dtol>0)mul_vectors(all_grids->levels[level],VECTOR_TEMP,1.0,VECTOR_TEMP,VECTOR_DINV); //  Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria...
+    double norm_of_residual = norm(all_grids->levels[level],VECTOR_TEMP);
+    double _timeNorm = getTime();
+    all_grids->levels[level]->timers.Total += (double)(_timeNorm-_timeStart);
+    if(all_grids->levels[level]->my_rank==0){
+      double rel = 0.0;
+      if(rtol>0)rel = norm_of_residual/norm_of_F;
+           else rel = norm_of_residual/norm_of_DinvF;
+      if(  v>=0){fprintf(stdout,"\n            v-cycle=%2d  norm=%1.15e  rel=%1.15e  ",v+1,norm_of_residual,rel);}
+            else{fprintf(stdout,              "f-cycle     norm=%1.15e  rel=%1.15e  ",norm_of_residual,rel);}
+    }
+    if(norm_of_residual/norm_of_F < rtol)break; // relative test (norm_of_F is 1.0 when rtol<=0)
+    if(norm_of_residual           < dtol)break; // absolute test on the (D^{-1}-scaled when dtol>0) residual norm
+  }
+
+  #endif /* UNLIMIT_FMG_FCYCLES */
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  all_grids->timers.MGSolve += (double)(getTime()-_timeStartMGSolve);
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  #ifdef _OPENMP
+  if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done (%f seconds)\n",omp_get_wtime()-FMG_Start_Time);} // used to monitor variability in individual solve times
+  #elif USE_MPI
+  if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done (%f seconds)\n",MPI_Wtime()-FMG_Start_Time);} // used to monitor variability in individual solve times
+  #else
+  if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done\n");}
+  #endif
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void MGPCG(mg_type *all_grids, int onLevel, int x_id, int F_id, double a, double b, double dtol, double rtol){
+  // Algorithm 9.1 in Iterative Methods for Sparse Linear Systems(Yousef Saad) using a MG V-Cycle as M^{-1}
+  level_type * level = all_grids->levels[onLevel];
+  if(!level->active)return;
+  // x_id receives the solution (it is zeroed below, so any initial guess is discarded); F_id is the right-hand side.  NOTE(review): dtol is never referenced in this function -- only the rtol test (or an exactly zero residual) stops the iteration early
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // CG with a MG preconditioner, every level needs 3 extra vectors (p, Ap, z)
+  int l;
+  for(l=0;l<all_grids->num_levels;l++){
+    create_vectors(all_grids->levels[l],VECTORS_RESERVED+3); // executed on every call; presumably a no-op once the vectors exist -- TODO confirm in create_vectors
+  }
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  // Test for Poisson with Periodic BCs
+  for(l=0;l<all_grids->num_levels;l++){
+    if(all_grids->levels[l]->must_subtract_mean==-1){ // -1 is treated as "not decided yet" and is resolved to 0 or 1 here
+      all_grids->levels[l]->must_subtract_mean=0;
+      int alpha_is_zero = (dot(all_grids->levels[l],VECTOR_ALPHA,VECTOR_ALPHA) == 0.0);
+      if( (all_grids->levels[l]->boundary_condition.type==BC_PERIODIC) && ((a==0) || (alpha_is_zero)) )all_grids->levels[l]->must_subtract_mean = 1;
+    }
+  }
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  int   r_id = VECTOR_F_MINUS_AV; // residual r
+  int   p_id = VECTORS_RESERVED+0; // search direction p
+  int  Ap_id = VECTORS_RESERVED+1; // A applied to p
+  int   z_id = VECTORS_RESERVED+2; // preconditioned residual z
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  #ifdef _OPENMP
+  double MGPCG_Start_Time = omp_get_wtime();
+  #elif USE_MPI
+  double MGPCG_Start_Time = MPI_Wtime();
+  #endif
+  if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"MGPCG...  ");}
+  double _timeStartMGSolve = getTime();
+  all_grids->MGSolves_performed++; // unlike MGSolve/FMGSolve, counted only when the level is active
+  int jMax=20; // hard cap on the number of CG iterations
+  int j=0;
+  int CGFailed    = 0;
+  int CGConverged = 0;
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  zero_vector(level,x_id);                                                      // x[] = 0
+  residual(level,r_id,x_id,F_id,a,b);                                           // r[] = F_id[] - A(x_id)
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  if(level->must_subtract_mean == 1){
+    double mean_of_r = mean(level,r_id);
+    shift_vector(level,r_id,r_id,-mean_of_r);
+  }
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  double norm_of_r0 = norm(level,r_id);                                         // the norm of the initial residual...
+  if(norm_of_r0 == 0.0){CGConverged=1;}                                         // entered CG with exact solution
+  level->vcycles_from_this_level++;                                             // (the V-cycle below still runs once even if CGConverged was just set)
+  zero_vector(level,z_id);                                                      // z[] = 0
+  MGVCycle(all_grids,z_id,r_id,a,b,onLevel);                                    // z[] = M^{-1}r[]
+  scale_vector(level,p_id,1.0,z_id);                                            // p[] = z[]
+  double r_dot_z = dot(level,r_id,z_id);                                        // r_dot_z = dot(r,z)
+  while( (j<jMax) && (!CGFailed) && (!CGConverged) ){                           // while(not done){
+    j++;level->Krylov_iterations++;                                             //
+    apply_op(level,Ap_id,p_id,a,b);                                             //   Ap[] = A(p)
+    double Ap_dot_p = dot(level,Ap_id,p_id);                                    //   Ap_dot_p = dot(Ap,p)
+    if(Ap_dot_p == 0.0){CGFailed=1;break;}                                      //   pivot breakdown ???
+    double alpha = r_dot_z / Ap_dot_p;                                          //   alpha = r_dot_z / Ap_dot_p
+    if(isinf(alpha)){CGFailed=1;break;}                                         //   ???
+    add_vectors(level,x_id,1.0,x_id, alpha,p_id );                              //   x_id[] = x_id[] + alpha*p[]
+    add_vectors(level,r_id,1.0,r_id,-alpha,Ap_id);                              //   r[]    = r[]    - alpha*Ap[]   (intermediate residual?)
+    //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    if(level->must_subtract_mean == 1){
+      double mean_of_r = mean(level,r_id);
+      shift_vector(level,r_id,r_id,-mean_of_r);
+    }
+    //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  //double norm_of_r = norm(level,r_id);                                        //   norm of intermediate residual (delusional convergence)
+    residual(level,VECTOR_TEMP,x_id,F_id,a,b);                                  //   true residual
+    double norm_of_r = norm(level,VECTOR_TEMP);                                 //   norm of true residual (true convergence test)
+    if(norm_of_r == 0.0){CGConverged=1;break;}                                  //
+    if(level->my_rank==0){
+      if(   j>1){fprintf(stdout,"\n          ");}                               //   NOTE: the line break is printed for every j>1 ...
+      if(rtol>0){fprintf(stdout,"iter=%3d  norm=%1.15e  rel=%1.15e  ",j,norm_of_r,norm_of_r/norm_of_r0    );} // ... but the iteration summary only when rtol>0
+    }
+    if(norm_of_r/norm_of_r0 < rtol)break;                                       //   norm of true residual is small enough
+    level->vcycles_from_this_level++;                                           //
+    zero_vector(level,z_id);                                                    //   z[] = 0
+    MGVCycle(all_grids,z_id,r_id,a,b,onLevel);                                  //   z[] = M^{-1}r[]
+    double r_dot_z_new = dot(level,r_id,z_id);                                  //   r_dot_z_new = dot(r_{j+1},z_{j+1})
+    if(r_dot_z_new == 0.0){CGFailed=1;break;}                                   //   Lanczos breakdown ???
+    double beta = (r_dot_z_new/r_dot_z);                                        //   beta = (r_dot_z_new/r_dot_z)
+    if(isinf(beta)){CGFailed=1;break;}                                          //   ???
+    add_vectors(level,p_id,1.0,z_id,beta,p_id );                                //   p[] = z[] + beta*p[]
+    r_dot_z = r_dot_z_new;                                                      //   r_dot_z = r_dot_z_new   (kept for the next iteration's beta)
+    // FIX... need to test for stalled convergence...
+  }                                                                             // }
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  all_grids->timers.MGSolve += (double)(getTime()-_timeStartMGSolve);
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  #ifdef _OPENMP
+  if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done (%f seconds)\n",omp_get_wtime()-MGPCG_Start_Time);} // used to monitor variability in individual solve times
+  #elif USE_MPI
+  if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done (%f seconds)\n",MPI_Wtime()-MGPCG_Start_Time);} // used to monitor variability in individual solve times
+  #else
+  if(all_grids->levels[onLevel]->my_rank==0){fprintf(stdout,"done\n");}
+  #endif
+}
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators.27pt.c b/Util/hpgmg/finite-volume/source/operators.27pt.c
new file mode 100644
index 00000000..2d71e465
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.27pt.c
@@ -0,0 +1,163 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+#include "timers.h"
+#include "defines.h"
+#include "level.h"
+#include "operators.h"
+//------------------------------------------------------------------------------------------------------------------------------
+#define MyPragma(a) _Pragma(#a) // stringizes its argument so that a pragma can be produced from inside another macro
+//------------------------------------------------------------------------------------------------------------------------------
+#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions...
+  // XL C/C++ 12.01.0000.0009 sets _OPENMP to 201107, but does not support the max clause within a _Pragma().  
+  // This issue was fixed by XL C/C++ 12.01.0000.0011
+  // If you do not have this version of XL C/C++ and run into this bug, uncomment these macros...
+  //#warning not threading norm() calculations due to issue with XL/C, _Pragma, and reduction(max:bmax)
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     )
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) )
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     ) // threads the loop that follows only when nb>1; the 'level' argument is not used by the expansion
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) ) // same, with a sum reduction on bsum
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(max:bmax) ) // same, with a max reduction on bmax
+#elif _OPENMP // older OpenMP versions don't support the max reduction clause
+  #warning Threading max reductions requires OpenMP 3.1 (July 2011).  Please upgrade your compiler.                                                           
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    // expands to nothing: loops marked with the MAX variant get no OpenMP pragma in this configuration
+#else // flat MPI should not define any threading...
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    // expands to nothing
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    // expands to nothing
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    // expands to nothing
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+void apply_BCs(level_type * level, int x_id, int shape){apply_BCs_p2(level,x_id,shape);} // boundary-condition entry point for this operator: forwards to apply_BCs_p2 because 27pt uses cell centered, not cell averaged (the apply_BCs_v2 alternative is kept commented out on the next line)
+//void apply_BCs(level_type * level, int x_id, int shape){apply_BCs_v2(level,x_id,shape);}
+//------------------------------------------------------------------------------------------------------------------------------
+#define STENCIL_COEF0 (-4.2666666666666666666)  // -128.0/30.0;  weight of the center point x[ijk] in apply_op_ijk
+#define STENCIL_COEF1 ( 0.4666666666666666666)  //   14.0/30.0;  weight of the 6 neighbors offset along exactly one of i, j, k
+#define STENCIL_COEF2 ( 0.1000000000000000000)  //    3.0/30.0;  weight of the 12 neighbors offset along exactly two of i, j, k
+#define STENCIL_COEF3 ( 0.0333333333333333333)  //    1.0/30.0;  weight of the 8 neighbors offset along all three of i, j, k
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef STENCIL_VARIABLE_COEFFICIENT
+  #error This implementation does not support variable-coefficient operators
+#endif
+#ifdef STENCIL_FUSE_BC
+  #error This implementation does not support fusion of the boundary conditions with the operator
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+#define Dinv_ijk() Dinv[ijk]        // simply retrieve it rather than recalculating it; expects an array 'Dinv' and an index 'ijk' to be in scope where the macro is expanded
+//------------------------------------------------------------------------------------------------------------------------------
+#define apply_op_ijk(x)	/* evaluates a*x - b*h2inv*(27-point weighted sum of x) at index ijk; expects a, b, h2inv, ijk, jStride and kStride to be in scope at the expansion site */			\
+(						\
+  a*x[ijk] - b*h2inv*(				\
+    STENCIL_COEF3*(x[ijk-kStride-jStride-1] + /* 8 points offset in all of k, j and i */	\
+                   x[ijk-kStride-jStride+1] +	\
+                   x[ijk-kStride+jStride-1] +	\
+                   x[ijk-kStride+jStride+1] +	\
+                   x[ijk+kStride-jStride-1] +	\
+                   x[ijk+kStride-jStride+1] +	\
+                   x[ijk+kStride+jStride-1] +	\
+                   x[ijk+kStride+jStride+1] ) +	\
+    STENCIL_COEF2*(x[ijk-kStride-jStride  ] + /* 12 points offset in exactly two of k, j and i */	\
+                   x[ijk-kStride        -1] +	\
+                   x[ijk-kStride        +1] +	\
+                   x[ijk-kStride+jStride  ] +	\
+                   x[ijk        -jStride-1] +	\
+                   x[ijk        -jStride+1] +	\
+                   x[ijk        +jStride-1] +	\
+                   x[ijk        +jStride+1] +	\
+                   x[ijk+kStride-jStride  ] +	\
+                   x[ijk+kStride        -1] +	\
+                   x[ijk+kStride        +1] +	\
+                   x[ijk+kStride+jStride  ] ) +	\
+    STENCIL_COEF1*(x[ijk-kStride          ] + /* 6 points offset in exactly one of k, j and i */	\
+                   x[ijk        -jStride  ] +	\
+                   x[ijk                -1] +	\
+                   x[ijk                +1] +	\
+                   x[ijk        +jStride  ] +	\
+                   x[ijk+kStride          ] ) +	\
+    STENCIL_COEF0*(x[ijk                  ] ) /* the center point */	\
+  )						\
+)
+//------------------------------------------------------------------------------------------------------------------------------
+int stencil_get_radius(){return(1);} // 27pt = dense 3^3: apply_op_ijk reaches at most one cell away from ijk in each direction
+int stencil_get_shape(){return(STENCIL_SHAPE_BOX);} // needs faces, edges, and corners, hence the box shape
+//------------------------------------------------------------------------------------------------------------------------------
+void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ // refreshes the operator data of 'level': optional restriction of coefficients from fromLevel, ghost exchanges, then the black-box rebuild
+  // form restriction of alpha[], beta_*[] coefficients from fromLevel
+  if(fromLevel != NULL){ // pass NULL to keep the coefficients already stored on 'level'
+    restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL  );
+    restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I);
+    restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J);
+    restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K);
+  } // else case assumes alpha/beta have been set
+
+  // exchange alpha/beta/...  (must be done before calculating Dinv)
+  exchange_boundary(level,VECTOR_ALPHA ,STENCIL_SHAPE_BOX); // safe
+  exchange_boundary(level,VECTOR_BETA_I,STENCIL_SHAPE_BOX);
+  exchange_boundary(level,VECTOR_BETA_J,STENCIL_SHAPE_BOX);
+  exchange_boundary(level,VECTOR_BETA_K,STENCIL_SHAPE_BOX);
+
+  // black box rebuild of D^{-1}, l1^{-1}, dominant eigenvalue, ...
+  rebuild_operator_blackbox(level,a,b,2); // NOTE(review): the meaning of the trailing argument 2 is not visible here -- confirm against rebuild_operator_blackbox
+
+  // exchange Dinv/L1inv/...
+  exchange_boundary(level,VECTOR_DINV ,STENCIL_SHAPE_BOX); // safe
+  exchange_boundary(level,VECTOR_L1INV,STENCIL_SHAPE_BOX);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef  USE_GSRB
+#warning GSRB is not recommended for the 27pt operator
+#define GSRB_OOP
+#define NUM_SMOOTHS      2 // RBRB
+#include "operators/gsrb.c"
+#elif   USE_CHEBY
+#define NUM_SMOOTHS      1
+#define CHEBYSHEV_DEGREE 4 // i.e. one degree-4 polynomial smoother
+#include "operators/chebyshev.c"
+#elif   USE_JACOBI
+#define NUM_SMOOTHS      6
+#include "operators/jacobi.c"
+#elif   USE_L1JACOBI
+#define NUM_SMOOTHS      6
+#include "operators/jacobi.c"
+#elif   USE_SYMGS
+#define NUM_SMOOTHS      2 // FBFB
+#include "operators/symgs.c"
+#else
+#error You must compile with either -DUSE_GSRB, -DUSE_CHEBY, -DUSE_JACOBI, -DUSE_L1JACOBI, or -DUSE_SYMGS
+#endif
+#include "operators/residual.c"
+#include "operators/apply_op.c"
+#include "operators/rebuild.c"
+//------------------------------------------------------------------------------------------------------------------------------
+#include "operators/blockCopy.c"
+#include "operators/misc.c"
+#include "operators/exchange_boundary.c"
+#include "operators/boundary_fd.c" // 27pt uses cell centered, not cell averaged
+//#include "operators/boundary_fv.c"
+#include "operators/restriction.c"
+#include "operators/interpolation_p2.c"
+//#include "operators/interpolation_v2.c"
+//------------------------------------------------------------------------------------------------------------------------------
+void interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_p2(level_f,id_f,prescale_f,level_c,id_c);} // interpolation used on the way up a V-cycle: forwards to interpolation_p2 because 27pt uses cell centered, not cell averaged (the interpolation_v2 alternative is kept commented out below)
+void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_p2(level_f,id_f,prescale_f,level_c,id_c);} // interpolation used by the F-cycle: for this operator it forwards to the same interpolation_p2 as interpolation_vcycle
+//void interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_v2(level_f,id_f,prescale_f,level_c,id_c);}
+//void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_v2(level_f,id_f,prescale_f,level_c,id_c);}
+//------------------------------------------------------------------------------------------------------------------------------
+#include "operators/problem.p6.c"
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators.7pt.c b/Util/hpgmg/finite-volume/source/operators.7pt.c
new file mode 100644
index 00000000..4802c72e
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.7pt.c
@@ -0,0 +1,275 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+#include "timers.h"
+#include "defines.h"
+#include "level.h"
+#include "operators.h"
+//------------------------------------------------------------------------------------------------------------------------------
+#define MyPragma(a) _Pragma(#a) // stringifies its argument and hands it to _Pragma so that a pragma can be produced by a macro expansion
+//------------------------------------------------------------------------------------------------------------------------------
+#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions...
+  // PRAGMA_THREAD_ACROSS_BLOCKS*(level,b,nb[,bsum|bmax]) thread the loop that follows: index b is private, threading is used only if nb>1, and the _SUM/_MAX forms add a +/max reduction.
+  // Caveat: XL C/C++ 12.01.0000.0009 sets _OPENMP to 201107, but does not support the max clause within a _Pragma(); this was fixed by XL C/C++ 12.01.0000.0011.
+  // If you do not have the fixed version of XL C/C++ and run into this bug, uncomment these macros (in place of the active definitions below)...
+  //#warning not threading norm() calculations due to issue with XL/C, _Pragma, and reduction(max:bmax)
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     )
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) )
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(max:bmax) )
+#elif _OPENMP // older OpenMP versions don't support the max reduction clause
+  #warning Threading max reductions requires OpenMP 3.1 (July 2011).  Please upgrade your compiler.                                                           
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    // expands to nothing: loops that need a max reduction are left unthreaded
+#else // flat MPI should not define any threading...
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+void apply_BCs(level_type * level, int x_id, int shape){apply_BCs_p1(level,x_id,shape);} // boundary-condition entry point for this operator: forwards all arguments unchanged to apply_BCs_p1
+//------------------------------------------------------------------------------------------------------------------------------
+#define Dinv_ijk() Dinv[ijk]        // inverse diagonal at cell ijk: read the stored Dinv[] value rather than recalculating it (expects Dinv and ijk in the caller's scope)
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef STENCIL_VARIABLE_COEFFICIENT // apply_op_ijk(x): value of the operator applied to x at cell ijk; expects a, b, h2inv, ijk, jStride, kStride (and alpha/beta_* where used) in the caller's scope
+#ifdef USE_HELMHOLTZ // variable coefficient Helmholtz: a*alpha[ijk]*x[ijk] minus b*h2inv times the beta-weighted differences to the six face neighbors
+  #define apply_op_ijk(x)                               \
+  (                                                     \
+    a*alpha[ijk]*x[ijk]                                 \
+   -b*h2inv*(                                           \
+      + beta_i[ijk+1      ]*( x[ijk+1      ] - x[ijk] ) \
+      + beta_i[ijk        ]*( x[ijk-1      ] - x[ijk] ) \
+      + beta_j[ijk+jStride]*( x[ijk+jStride] - x[ijk] ) \
+      + beta_j[ijk        ]*( x[ijk-jStride] - x[ijk] ) \
+      + beta_k[ijk+kStride]*( x[ijk+kStride] - x[ijk] ) \
+      + beta_k[ijk        ]*( x[ijk-kStride] - x[ijk] ) \
+    )                                                   \
+  )
+#else // variable coefficient Poisson: same as the Helmholtz form but without the a*alpha[ijk]*x[ijk] term
+  #define apply_op_ijk(x)                               \
+  (                                                     \
+    -b*h2inv*(                                          \
+      + beta_i[ijk+1      ]*( x[ijk+1      ] - x[ijk] ) \
+      + beta_i[ijk        ]*( x[ijk-1      ] - x[ijk] ) \
+      + beta_j[ijk+jStride]*( x[ijk+jStride] - x[ijk] ) \
+      + beta_j[ijk        ]*( x[ijk-jStride] - x[ijk] ) \
+      + beta_k[ijk+kStride]*( x[ijk+kStride] - x[ijk] ) \
+      + beta_k[ijk        ]*( x[ijk-kStride] - x[ijk] ) \
+    )                                                   \
+  )
+#endif // USE_HELMHOLTZ
+#else  // constant coefficient case: a*x[ijk] minus b*h2inv times (sum of the six face neighbors - 6*x[ijk])
+  #define apply_op_ijk(x)            \
+  (                                \
+    a*x[ijk] - b*h2inv*(           \
+      + x[ijk+1      ]             \
+      + x[ijk-1      ]             \
+      + x[ijk+jStride]             \
+      + x[ijk-jStride]             \
+      + x[ijk+kStride]             \
+      + x[ijk-kStride]             \
+      - x[ijk        ]*6.0         \
+    )                              \
+  )
+#endif // variable/constant coefficient
+
+//------------------------------------------------------------------------------------------------------------------------------
+int stencil_get_radius(){return(1);} // stencil radius: the 7pt operator reaches out 1 point (ijk+-1, +-jStride, +-kStride)
+int stencil_get_shape(){return(STENCIL_SHAPE_STAR);} // reports the star shape: the operator needs just the face neighbors
+//------------------------------------------------------------------------------------------------------------------------------
+void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ // rebuilds this level's operator data: Dinv, L1inv and a Gershgorin bound on the dominant eigenvalue of D^{-1}A
+  if(level->my_rank==0){fprintf(stdout,"  rebuilding operator for level...  h=%e  ",level->h);fflush(stdout);}
+  // a and b are the scalar coefficients of the operator (see apply_op_ijk); fromLevel, when non-NULL, is the level whose alpha/beta_* are restricted onto `level`.
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // form restriction of alpha[], beta_*[] coefficients from fromLevel
+  if(fromLevel != NULL){
+    restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL  );
+    restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I);
+    restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J);
+    restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K);
+  } // otherwise alpha/beta are assumed to have already been set on this level
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // exchange alpha/beta/...  (must be done before calculating Dinv)
+  exchange_boundary(level,VECTOR_ALPHA ,STENCIL_SHAPE_BOX); // safe
+  exchange_boundary(level,VECTOR_BETA_I,STENCIL_SHAPE_BOX);
+  exchange_boundary(level,VECTOR_BETA_J,STENCIL_SHAPE_BOX);
+  exchange_boundary(level,VECTOR_BETA_K,STENCIL_SHAPE_BOX);
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // calculate Dinv, L1inv, and estimate the dominant Eigenvalue
+  double _timeStart = getTime(); // the time spent in the block loop is added to timers.blas1 below
+  int block;
+
+  double dominant_eigenvalue = -1e9; // starting value for the running maximum over all cells of this rank
+  // the block loop is threaded with a max reduction on dominant_eigenvalue when the PRAGMA_THREAD_ACROSS_BLOCKS_MAX macro is non-empty
+  PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,block,level->num_my_blocks,dominant_eigenvalue)
+  for(block=0;block<level->num_my_blocks;block++){
+    const int box = level->my_blocks[block].read.box;
+    const int ilo = level->my_blocks[block].read.i;
+    const int jlo = level->my_blocks[block].read.j;
+    const int klo = level->my_blocks[block].read.k;
+    const int ihi = level->my_blocks[block].dim.i + ilo;
+    const int jhi = level->my_blocks[block].dim.j + jlo;
+    const int khi = level->my_blocks[block].dim.k + klo;
+    int i,j,k;
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const int  ghosts = level->my_boxes[box].ghosts;
+    double h2inv = 1.0/(level->h*level->h);
+    double * __restrict__ alpha  = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); // each pointer is offset by `ghosts` cells in i, j and k
+    double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+    double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+    double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+    double * __restrict__   Dinv = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride);
+    double * __restrict__  L1inv = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride);
+    double block_eigenvalue = -1e9; // running maximum of Di over the cells of this block
+
+    for(k=klo;k<khi;k++){
+    for(j=jlo;j<jhi;j++){
+    for(i=ilo;i<ihi;i++){ 
+      int ijk = i + j*jStride + k*kStride;
+
+      // per-neighbor flags (1.0 = keep the neighbor, 0.0 = drop it); used for quick linear approximation to zero dirichlet BC
+      double ilo_is_valid =1.0;
+      double ihi_is_valid =1.0;
+      double jlo_is_valid =1.0;
+      double jhi_is_valid =1.0;
+      double klo_is_valid =1.0;
+      double khi_is_valid =1.0;
+      if(level->boundary_condition.type != BC_PERIODIC){ // non-periodic: clear the flag of any neighbor whose index (box low + local index +-1) falls outside [0,level->dim) -- presumably low.* is the box's global offset; confirm
+         if(level->my_boxes[box].low.i+i-1 <             0)ilo_is_valid = 0.0;
+         if(level->my_boxes[box].low.j+j-1 <             0)jlo_is_valid = 0.0;
+         if(level->my_boxes[box].low.k+k-1 <             0)klo_is_valid = 0.0;
+         if(level->my_boxes[box].low.i+i+1 >= level->dim.i)ihi_is_valid = 0.0;
+         if(level->my_boxes[box].low.j+j+1 >= level->dim.j)jhi_is_valid = 0.0;
+         if(level->my_boxes[box].low.k+k+1 >= level->dim.k)khi_is_valid = 0.0;
+       }
+
+      #ifdef STENCIL_VARIABLE_COEFFICIENT
+      // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements...
+      double sumAbsAij = fabs(b*h2inv) * (
+                           fabs( beta_i[ijk        ]*ilo_is_valid )+
+                           fabs( beta_j[ijk        ]*jlo_is_valid )+
+                           fabs( beta_k[ijk        ]*klo_is_valid )+
+                           fabs( beta_i[ijk+1      ]*ihi_is_valid )+
+                           fabs( beta_j[ijk+jStride]*jhi_is_valid )+
+                           fabs( beta_k[ijk+kStride]*khi_is_valid )
+                         );
+
+      // center of Gershgorin disc is the diagonal element (each face contributes beta*(valid-2): -beta for a kept neighbor, -2*beta for a dropped one)...
+      double    Aii = a*alpha[ijk] - b*h2inv*(
+                        beta_i[ijk        ]*( ilo_is_valid-2.0 )+
+                        beta_j[ijk        ]*( jlo_is_valid-2.0 )+
+                        beta_k[ijk        ]*( klo_is_valid-2.0 )+
+                        beta_i[ijk+1      ]*( ihi_is_valid-2.0 )+
+                        beta_j[ijk+jStride]*( jhi_is_valid-2.0 )+
+                        beta_k[ijk+kStride]*( khi_is_valid-2.0 ) 
+                      );
+      #else // Constant coefficient versions with fused BC's...
+      // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements...
+      double sumAbsAij = fabs(b*h2inv) * (
+                           ilo_is_valid +
+                           jlo_is_valid +
+                           klo_is_valid +
+                           ihi_is_valid +
+                           jhi_is_valid +
+                           khi_is_valid 
+                         );
+
+      // center of Gershgorin disc is the diagonal element (the -12.0 is the six faces' -2.0 terms summed)...
+      double    Aii = a - b*h2inv*(
+                         ilo_is_valid +
+                         jlo_is_valid +
+                         klo_is_valid +
+                         ihi_is_valid +
+                         jhi_is_valid +
+                         khi_is_valid - 12.0
+                      );
+      #endif
+
+      // calculate Dinv = D^{-1}, L1inv = ( D+D^{L1} )^{-1}, and the dominant eigenvalue...
+                             Dinv[ijk] = 1.0/Aii;				// inverse of the diagonal Aii
+                          //L1inv[ijk] = 1.0/(Aii+sumAbsAij);			// inverse of the L1 row norm... L1inv = ( D+D^{L1} )^{-1}
+      if(Aii>=1.5*sumAbsAij)L1inv[ijk] = 1.0/(Aii              ); 		// as suggested by eq 6.5 in Baker et al, "Multigrid smoothers for ultra-parallel computing: additional theory and discussion"...
+                       else L1inv[ijk] = 1.0/(Aii+0.5*sumAbsAij);		// otherwise add half of the off-diagonal absolute sum to the diagonal before inverting
+      double Di = (Aii + sumAbsAij)/Aii;if(Di>block_eigenvalue)block_eigenvalue=Di;	// upper limit to Gershgorin disc == bound on dominant eigenvalue
+    }}}
+    if(block_eigenvalue>dominant_eigenvalue){dominant_eigenvalue = block_eigenvalue;} // fold this block's maximum into the rank-local maximum
+  }
+  level->timers.blas1 += (double)(getTime()-_timeStart);
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // Reduce the local estimates of the dominant eigenvalue to a global estimate (MPI_MAX over level->MPI_COMM_ALLREDUCE)
+  #ifdef USE_MPI
+  double _timeStartAllReduce = getTime();
+  double send = dominant_eigenvalue;
+  MPI_Allreduce(&send,&dominant_eigenvalue,1,MPI_DOUBLE,MPI_MAX,level->MPI_COMM_ALLREDUCE);
+  double _timeEndAllReduce = getTime();
+  level->timers.collectives   += (double)(_timeEndAllReduce-_timeStartAllReduce);
+  #endif
+  if(level->my_rank==0){fprintf(stdout,"eigenvalue_max<%e\n",dominant_eigenvalue);}
+  level->dominant_eigenvalue_of_DinvA = dominant_eigenvalue;
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // exchange Dinv/L1inv/...
+  exchange_boundary(level,VECTOR_DINV ,STENCIL_SHAPE_BOX); // safe
+  exchange_boundary(level,VECTOR_L1INV,STENCIL_SHAPE_BOX);
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef  USE_GSRB // smoother selection: the first matching USE_* macro sets NUM_SMOOTHS and pulls in that smoother's source file
+#define NUM_SMOOTHS      2 // RBRB
+#include "operators/gsrb.c"
+#elif   USE_CHEBY
+#define NUM_SMOOTHS      1
+#define CHEBYSHEV_DEGREE 4 // i.e. one degree-4 polynomial smoother
+#include "operators/chebyshev.c"
+#elif   USE_JACOBI
+#define NUM_SMOOTHS      6
+#include "operators/jacobi.c"
+#elif   USE_L1JACOBI // includes the same jacobi.c source as USE_JACOBI
+#define NUM_SMOOTHS      6
+#include "operators/jacobi.c"
+#elif   USE_SYMGS
+#define NUM_SMOOTHS      2
+#include "operators/symgs.c"
+#else // no smoother selected: stop the build
+#error You must compile with either -DUSE_GSRB, -DUSE_CHEBY, -DUSE_JACOBI, -DUSE_L1JACOBI, or -DUSE_SYMGS
+#endif
+#include "operators/residual.c"
+#include "operators/apply_op.c"
+//------------------------------------------------------------------------------------------------------------------------------
+#include "operators/blockCopy.c"
+#include "operators/misc.c"
+#include "operators/exchange_boundary.c"
+#include "operators/boundary_fd.c"
+#include "operators/restriction.c"
+#include "operators/interpolation_p1.c"
+//------------------------------------------------------------------------------------------------------------------------------
+void interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_p1(level_f,id_f,prescale_f,level_c,id_c);} // V-cycle interpolation wrapper: forwards all arguments unchanged to interpolation_p1
+void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_p1(level_f,id_f,prescale_f,level_c,id_c);} // F-cycle interpolation wrapper: forwards all arguments unchanged to interpolation_p1 (same routine as the V-cycle wrapper)
+//------------------------------------------------------------------------------------------------------------------------------
+#include "operators/problem.p6.c"
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators.fv2.c b/Util/hpgmg/finite-volume/source/operators.fv2.c
new file mode 100644
index 00000000..41a875c5
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.fv2.c
@@ -0,0 +1,162 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+#include "timers.h"
+#include "defines.h"
+#include "level.h"
+#include "operators.h"
+//------------------------------------------------------------------------------------------------------------------------------
+#define MyPragma(a) _Pragma(#a) // stringifies its argument and hands it to _Pragma so that a pragma can be produced by a macro expansion
+//------------------------------------------------------------------------------------------------------------------------------
+#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions...
+  // PRAGMA_THREAD_ACROSS_BLOCKS*(level,b,nb[,bsum|bmax]) thread the loop that follows: index b is private, threading is used only if nb>1, and the _SUM/_MAX forms add a +/max reduction.
+  // Caveat: XL C/C++ 12.01.0000.0009 sets _OPENMP to 201107, but does not support the max clause within a _Pragma(); this was fixed by XL C/C++ 12.01.0000.0011.
+  // If you do not have the fixed version of XL C/C++ and run into this bug, uncomment these macros (in place of the active definitions below)...
+  //#warning not threading norm() calculations due to issue with XL/C, _Pragma, and reduction(max:bmax)
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     )
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) )
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(max:bmax) )
+#elif _OPENMP // older OpenMP versions don't support the max reduction clause
+  #warning Threading max reductions requires OpenMP 3.1 (July 2011).  Please upgrade your compiler.                                                           
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    // expands to nothing: loops that need a max reduction are left unthreaded
+#else // flat MPI should not define any threading...
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef STENCIL_FUSE_BC // build-time guard: refuse to compile this operator when STENCIL_FUSE_BC is defined
+  #error This implementation does not support fusion of the boundary conditions with the operator
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+void apply_BCs(level_type * level, int x_id, int shape){apply_BCs_v2(level,x_id,shape);} // boundary-condition entry point for this operator: forwards all arguments unchanged to apply_BCs_v2
+//------------------------------------------------------------------------------------------------------------------------------
+#define Dinv_ijk() Dinv[ijk]        // inverse diagonal at cell ijk: read the stored Dinv[] value rather than recalculating it (expects Dinv and ijk in the caller's scope)
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef STENCIL_VARIABLE_COEFFICIENT // apply_op_ijk(x): value of the operator applied to x at cell ijk; expects a, b, h2inv, ijk, jStride, kStride (and alpha/beta_* where used) in the caller's scope
+  #ifdef USE_HELMHOLTZ // variable coefficient Helmholtz: a*alpha[ijk]*x[ijk] minus b*h2inv times the beta-weighted differences to the six face neighbors
+  #define apply_op_ijk(x)                               \
+  (                                                     \
+    a*alpha[ijk]*x[ijk]                                 \
+   -b*h2inv*(                                           \
+      + beta_i[ijk+1      ]*( x[ijk+1      ] - x[ijk] ) \
+      + beta_i[ijk        ]*( x[ijk-1      ] - x[ijk] ) \
+      + beta_j[ijk+jStride]*( x[ijk+jStride] - x[ijk] ) \
+      + beta_j[ijk        ]*( x[ijk-jStride] - x[ijk] ) \
+      + beta_k[ijk+kStride]*( x[ijk+kStride] - x[ijk] ) \
+      + beta_k[ijk        ]*( x[ijk-kStride] - x[ijk] ) \
+    )                                                   \
+  )
+  #else // variable coefficient Poisson: same as the Helmholtz form but without the a*alpha[ijk]*x[ijk] term
+  #define apply_op_ijk(x)                               \
+  (                                                     \
+    -b*h2inv*(                                          \
+      + beta_i[ijk+1      ]*( x[ijk+1      ] - x[ijk] ) \
+      + beta_i[ijk        ]*( x[ijk-1      ] - x[ijk] ) \
+      + beta_j[ijk+jStride]*( x[ijk+jStride] - x[ijk] ) \
+      + beta_j[ijk        ]*( x[ijk-jStride] - x[ijk] ) \
+      + beta_k[ijk+kStride]*( x[ijk+kStride] - x[ijk] ) \
+      + beta_k[ijk        ]*( x[ijk-kStride] - x[ijk] ) \
+    )                                                   \
+  )
+  #endif // USE_HELMHOLTZ
+#else  // constant coefficient case: a*x[ijk] minus b*h2inv times (sum of the six face neighbors - 6*x[ijk])
+  #define apply_op_ijk(x)            \
+  (                                \
+    a*x[ijk] - b*h2inv*(           \
+      + x[ijk+1      ]             \
+      + x[ijk-1      ]             \
+      + x[ijk+jStride]             \
+      + x[ijk-jStride]             \
+      + x[ijk+kStride]             \
+      + x[ijk-kStride]             \
+      - x[ijk        ]*6.0         \
+    )                              \
+  )
+#endif // variable/constant coefficient
+//------------------------------------------------------------------------------------------------------------------------------
+int stencil_get_radius(){return(1);} // stencil radius: apply_op_ijk reaches out 1 point (ijk+-1, +-jStride, +-kStride)
+int stencil_get_shape(){return(STENCIL_SHAPE_STAR);} // reports the star shape: the operator needs just the face neighbors
+//------------------------------------------------------------------------------------------------------------------------------
+void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ // rebuilds this level's operator data; the Dinv/L1inv/eigenvalue work is delegated to rebuild_operator_blackbox
+  // form restriction of alpha[], beta_*[] coefficients from fromLevel (skipped when fromLevel is NULL)
+  if(fromLevel != NULL){
+    restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL  );
+    restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I);
+    restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J);
+    restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K);
+  } // otherwise alpha/beta are assumed to have already been set on this level
+
+  //no need to extrapolate the beta's into the ghost zones (no mixed derivatives in 2nd order)
+  //extrapolate_betas(level);
+  //initialize_problem(level,level->h,a,b); // approach used for testing smooth beta's; destroys the black box nature of the solver
+
+  // exchange alpha/beta/...  (must be done before calculating Dinv)
+  exchange_boundary(level,VECTOR_ALPHA ,STENCIL_SHAPE_BOX); // safe
+  exchange_boundary(level,VECTOR_BETA_I,STENCIL_SHAPE_BOX);
+  exchange_boundary(level,VECTOR_BETA_J,STENCIL_SHAPE_BOX);
+  exchange_boundary(level,VECTOR_BETA_K,STENCIL_SHAPE_BOX);
+
+  // black box rebuild of D^{-1}, l1^{-1}, dominant eigenvalue, ...
+  rebuild_operator_blackbox(level,a,b,2); // a and b are passed through unchanged; NOTE(review): the meaning of the trailing 2 is defined by rebuild_operator_blackbox -- confirm there
+
+  // exchange Dinv/L1inv/...
+  exchange_boundary(level,VECTOR_DINV ,STENCIL_SHAPE_BOX); // safe
+  exchange_boundary(level,VECTOR_L1INV,STENCIL_SHAPE_BOX);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef  USE_GSRB // smoother selection: the first matching USE_* macro sets NUM_SMOOTHS and pulls in that smoother's source file
+//#define GSRB_OOP	// no need for out-of-place for 7pt
+#define NUM_SMOOTHS      3 // RBRBRB
+#include "operators/gsrb.c"
+#elif   USE_CHEBY
+#define NUM_SMOOTHS      1
+#define CHEBYSHEV_DEGREE 6 // i.e. one degree-6 polynomial smoother
+#include "operators/chebyshev.c"
+#elif   USE_JACOBI
+#define NUM_SMOOTHS      6
+#include "operators/jacobi.c"
+#elif   USE_L1JACOBI // includes the same jacobi.c source as USE_JACOBI
+#define NUM_SMOOTHS      6
+#include "operators/jacobi.c"
+#elif   USE_SYMGS
+#define NUM_SMOOTHS      2 // FBFB
+#include "operators/symgs.c"
+#else // no smoother selected: stop the build
+#error You must compile with either -DUSE_GSRB, -DUSE_CHEBY, -DUSE_JACOBI, -DUSE_L1JACOBI, or -DUSE_SYMGS
+#endif
+#include "operators/residual.c"
+#include "operators/apply_op.c"
+#include "operators/rebuild.c"
+//------------------------------------------------------------------------------------------------------------------------------
+#include "operators/blockCopy.c"
+#include "operators/misc.c"
+#include "operators/exchange_boundary.c"
+#include "operators/boundary_fv.c"
+#include "operators/restriction.c"
+#include "operators/interpolation_v2.c"
+//------------------------------------------------------------------------------------------------------------------------------
+void interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_v2(level_f,id_f,prescale_f,level_c,id_c);} // V-cycle interpolation wrapper: forwards all arguments unchanged to interpolation_v2
+void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_v2(level_f,id_f,prescale_f,level_c,id_c);} // F-cycle interpolation wrapper: forwards all arguments unchanged to interpolation_v2 (same routine as the V-cycle wrapper)
+//------------------------------------------------------------------------------------------------------------------------------
+#include "operators/problem.fv.c"
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators.fv4.c b/Util/hpgmg/finite-volume/source/operators.fv4.c
new file mode 100644
index 00000000..220b0ff9
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.fv4.c
@@ -0,0 +1,211 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+#include "timers.h"
+#include "defines.h"
+#include "level.h"
+#include "operators.h"
+//------------------------------------------------------------------------------------------------------------------------------
+#define MyPragma(a) _Pragma(#a) // stringify the argument so an OpenMP pragma can be emitted from inside a macro
+//------------------------------------------------------------------------------------------------------------------------------
+#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions, so all three loop pragmas below are threaded
+  // XL C/C++ 12.01.0000.0009 sets _OPENMP to 201107, but does not support the max clause within a _Pragma().  
+  // This issue was fixed by XL C/C++ 12.01.0000.0011
+  // If you do not have this version of XL C/C++ and run into this bug, use these commented-out macros instead (their MAX variant expands to nothing)...
+  //#warning not threading norm() calculations due to issue with XL/C, _Pragma, and reduction(max:bmax)
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     )
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) )
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     ) // threaded only when nb>1; the level argument is not used by the expansion
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) ) // same loop pragma plus a + reduction into bsum
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(max:bmax) ) // same loop pragma plus a max reduction into bmax
+#elif _OPENMP // older OpenMP versions don't support the max reduction clause
+  #warning Threading max reductions requires OpenMP 3.1 (July 2011).  Please upgrade your compiler.                                                           
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    // expands to nothing: no pragma is emitted for max-reduction loops in this branch
+#else // flat MPI should not define any threading...
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    // all three expand to nothing when _OPENMP is not defined
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    
+#endif // _OPENMP version selection
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef STENCIL_FUSE_BC // compile-time guard: building this operator file with STENCIL_FUSE_BC defined is rejected
+  #error This implementation does not support fusion of the boundary conditions with the operator
+#endif // STENCIL_FUSE_BC
+//------------------------------------------------------------------------------------------------------------------------------
+void apply_BCs(level_type *level, int x_id, int shape) { apply_BCs_v4(level, x_id, shape); } // boundary conditions for this operator file: forward to apply_BCs_v4 (declared in operators.h as volumetric quartic)
+//------------------------------------------------------------------------------------------------------------------------------
+#define Dinv_ijk() Dinv[ijk]        // simply retrieve it rather than recalculating it (Dinv and ijk must be in scope where this expands)
+//------------------------------------------------------------------------------------------------------------------------------
+#define STENCIL_TWELFTH ( 0.0833333333333333333)  // decimal literal for 1.0/12.0; scales the stencil sums inside apply_op_ijk()
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef STENCIL_VARIABLE_COEFFICIENT // apply_op_ijk(x) expands in place: a, b, h2inv, ijk, jStride, kStride and the alpha/beta_i/beta_j/beta_k arrays must be in scope at the use site
+  #ifdef USE_HELMHOLTZ // variable-coefficient Helmholtz: a*alpha[ijk]*x[ijk] minus b*h2inv times the beta-weighted stencil sums
+  #define apply_op_ijk(x)                                                                                                                            \
+  (                                                                                                                                                  \
+    a*alpha[ijk]*x[ijk]                                                                                                                              \
+   -b*h2inv*(                                                                                                                                        \
+      STENCIL_TWELFTH*(                                                                                                                              \
+        + beta_i[ijk        ]*( 15.0*(x[ijk-1      ]-x[ijk]) - (x[ijk-2        ]-x[ijk+1      ]) )                                                   \
+        + beta_i[ijk+1      ]*( 15.0*(x[ijk+1      ]-x[ijk]) - (x[ijk+2        ]-x[ijk-1      ]) )                                                   \
+        + beta_j[ijk        ]*( 15.0*(x[ijk-jStride]-x[ijk]) - (x[ijk-2*jStride]-x[ijk+jStride]) )                                                   \
+        + beta_j[ijk+jStride]*( 15.0*(x[ijk+jStride]-x[ijk]) - (x[ijk+2*jStride]-x[ijk-jStride]) )                                                   \
+        + beta_k[ijk        ]*( 15.0*(x[ijk-kStride]-x[ijk]) - (x[ijk-2*kStride]-x[ijk+kStride]) )                                                   \
+        + beta_k[ijk+kStride]*( 15.0*(x[ijk+kStride]-x[ijk]) - (x[ijk+2*kStride]-x[ijk-kStride]) )                                                   \
+      )                                                                                                                                              \
+      + 0.25*STENCIL_TWELFTH*(                                                                                                                       \
+        + (beta_i[ijk        +jStride]-beta_i[ijk        -jStride]) * (x[ijk-1      +jStride]-x[ijk+jStride]-x[ijk-1      -jStride]+x[ijk-jStride])  \
+        + (beta_i[ijk        +kStride]-beta_i[ijk        -kStride]) * (x[ijk-1      +kStride]-x[ijk+kStride]-x[ijk-1      -kStride]+x[ijk-kStride])  \
+        + (beta_j[ijk        +1      ]-beta_j[ijk        -1      ]) * (x[ijk-jStride+1      ]-x[ijk+1      ]-x[ijk-jStride-1      ]+x[ijk-1      ])  \
+        + (beta_j[ijk        +kStride]-beta_j[ijk        -kStride]) * (x[ijk-jStride+kStride]-x[ijk+kStride]-x[ijk-jStride-kStride]+x[ijk-kStride])  \
+        + (beta_k[ijk        +1      ]-beta_k[ijk        -1      ]) * (x[ijk-kStride+1      ]-x[ijk+1      ]-x[ijk-kStride-1      ]+x[ijk-1      ])  \
+        + (beta_k[ijk        +jStride]-beta_k[ijk        -jStride]) * (x[ijk-kStride+jStride]-x[ijk+jStride]-x[ijk-kStride-jStride]+x[ijk-jStride])  \
+        /* the six products above use beta at ijk; the six below repeat them with beta at ijk+1, ijk+jStride and ijk+kStride */                     \
+        + (beta_i[ijk+1      +jStride]-beta_i[ijk+1      -jStride]) * (x[ijk+1      +jStride]-x[ijk+jStride]-x[ijk+1      -jStride]+x[ijk-jStride])  \
+        + (beta_i[ijk+1      +kStride]-beta_i[ijk+1      -kStride]) * (x[ijk+1      +kStride]-x[ijk+kStride]-x[ijk+1      -kStride]+x[ijk-kStride])  \
+        + (beta_j[ijk+jStride+1      ]-beta_j[ijk+jStride-1      ]) * (x[ijk+jStride+1      ]-x[ijk+1      ]-x[ijk+jStride-1      ]+x[ijk-1      ])  \
+        + (beta_j[ijk+jStride+kStride]-beta_j[ijk+jStride-kStride]) * (x[ijk+jStride+kStride]-x[ijk+kStride]-x[ijk+jStride-kStride]+x[ijk-kStride])  \
+        + (beta_k[ijk+kStride+1      ]-beta_k[ijk+kStride-1      ]) * (x[ijk+kStride+1      ]-x[ijk+1      ]-x[ijk+kStride-1      ]+x[ijk-1      ])  \
+        + (beta_k[ijk+kStride+jStride]-beta_k[ijk+kStride-jStride]) * (x[ijk+kStride+jStride]-x[ijk+jStride]-x[ijk+kStride-jStride]+x[ijk-jStride])  \
+      )                                                                                                                                              \
+    )                                                                                                                                                \
+  )
+  #else // Poisson: the same beta-weighted sums as the Helmholtz form, without the a*alpha[ijk]*x[ijk] term
+  #define apply_op_ijk(x)                                                                                                                            \
+  (                                                                                                                                                  \
+   -b*h2inv*(                                                                                                                                        \
+      STENCIL_TWELFTH*(                                                                                                                              \
+        + beta_i[ijk        ]*( 15.0*(x[ijk-1      ]-x[ijk]) - (x[ijk-2        ]-x[ijk+1      ]) )                                                   \
+        + beta_i[ijk+1      ]*( 15.0*(x[ijk+1      ]-x[ijk]) - (x[ijk+2        ]-x[ijk-1      ]) )                                                   \
+        + beta_j[ijk        ]*( 15.0*(x[ijk-jStride]-x[ijk]) - (x[ijk-2*jStride]-x[ijk+jStride]) )                                                   \
+        + beta_j[ijk+jStride]*( 15.0*(x[ijk+jStride]-x[ijk]) - (x[ijk+2*jStride]-x[ijk-jStride]) )                                                   \
+        + beta_k[ijk        ]*( 15.0*(x[ijk-kStride]-x[ijk]) - (x[ijk-2*kStride]-x[ijk+kStride]) )                                                   \
+        + beta_k[ijk+kStride]*( 15.0*(x[ijk+kStride]-x[ijk]) - (x[ijk+2*kStride]-x[ijk-kStride]) )                                                   \
+      )                                                                                                                                              \
+      + 0.25*STENCIL_TWELFTH*(                                                                                                                       \
+        + (beta_i[ijk        +jStride]-beta_i[ijk        -jStride]) * (x[ijk-1      +jStride]-x[ijk+jStride]-x[ijk-1      -jStride]+x[ijk-jStride])  \
+        + (beta_i[ijk        +kStride]-beta_i[ijk        -kStride]) * (x[ijk-1      +kStride]-x[ijk+kStride]-x[ijk-1      -kStride]+x[ijk-kStride])  \
+        + (beta_j[ijk        +1      ]-beta_j[ijk        -1      ]) * (x[ijk-jStride+1      ]-x[ijk+1      ]-x[ijk-jStride-1      ]+x[ijk-1      ])  \
+        + (beta_j[ijk        +kStride]-beta_j[ijk        -kStride]) * (x[ijk-jStride+kStride]-x[ijk+kStride]-x[ijk-jStride-kStride]+x[ijk-kStride])  \
+        + (beta_k[ijk        +1      ]-beta_k[ijk        -1      ]) * (x[ijk-kStride+1      ]-x[ijk+1      ]-x[ijk-kStride-1      ]+x[ijk-1      ])  \
+        + (beta_k[ijk        +jStride]-beta_k[ijk        -jStride]) * (x[ijk-kStride+jStride]-x[ijk+jStride]-x[ijk-kStride-jStride]+x[ijk-jStride])  \
+        /* the six products above use beta at ijk; the six below repeat them with beta at ijk+1, ijk+jStride and ijk+kStride */                     \
+        + (beta_i[ijk+1      +jStride]-beta_i[ijk+1      -jStride]) * (x[ijk+1      +jStride]-x[ijk+jStride]-x[ijk+1      -jStride]+x[ijk-jStride])  \
+        + (beta_i[ijk+1      +kStride]-beta_i[ijk+1      -kStride]) * (x[ijk+1      +kStride]-x[ijk+kStride]-x[ijk+1      -kStride]+x[ijk-kStride])  \
+        + (beta_j[ijk+jStride+1      ]-beta_j[ijk+jStride-1      ]) * (x[ijk+jStride+1      ]-x[ijk+1      ]-x[ijk+jStride-1      ]+x[ijk-1      ])  \
+        + (beta_j[ijk+jStride+kStride]-beta_j[ijk+jStride-kStride]) * (x[ijk+jStride+kStride]-x[ijk+kStride]-x[ijk+jStride-kStride]+x[ijk-kStride])  \
+        + (beta_k[ijk+kStride+1      ]-beta_k[ijk+kStride-1      ]) * (x[ijk+kStride+1      ]-x[ijk+1      ]-x[ijk+kStride-1      ]+x[ijk-1      ])  \
+        + (beta_k[ijk+kStride+jStride]-beta_k[ijk+kStride-jStride]) * (x[ijk+kStride+jStride]-x[ijk+jStride]-x[ijk+kStride-jStride]+x[ijk-jStride])  \
+      )                                                                                                                                              \
+    )                                                                                                                                                \
+  )
+  #endif // USE_HELMHOLTZ
+#else // constant coefficient (don't bother differentiating between Poisson and Helmholtz): a*x[ijk] is always included; weights are -1 at distance 2, +16 at distance 1, -90 at the centre
+  #define apply_op_ijk(x)                 \
+  (                                       \
+    a*x[ijk] - b*h2inv*STENCIL_TWELFTH*(  \
+       - 1.0*(x[ijk-2*kStride] +          \
+              x[ijk-2*jStride] +          \
+              x[ijk-2        ] +          \
+              x[ijk+2        ] +          \
+              x[ijk+2*jStride] +          \
+              x[ijk+2*kStride] )          \
+       +16.0*(x[ijk  -kStride] +          \
+              x[ijk  -jStride] +          \
+              x[ijk  -1      ] +          \
+              x[ijk  +1      ] +          \
+              x[ijk  +jStride] +          \
+              x[ijk  +kStride] )          \
+       -90.0*(x[ijk          ] )          \
+    )                                     \
+  )
+#endif // STENCIL_VARIABLE_COEFFICIENT
+//------------------------------------------------------------------------------------------------------------------------------
+// Stencil footprint queries: the radius is the same in both builds, only the reported shape depends on the coefficient mode.
+int stencil_get_radius(){ return 2; } // stencil reaches out 2 cells
+#ifdef STENCIL_VARIABLE_COEFFICIENT
+int stencil_get_shape(){ return STENCIL_SHAPE_NO_CORNERS; } // needs faces and edges, but not corners
+#else
+int stencil_get_shape(){ return STENCIL_SHAPE_STAR; } // needs just faces
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){
+  // coefficient vectors, each paired with the restriction type used to coarsen it
+  const int coef_ids[4]   = {VECTOR_ALPHA , VECTOR_BETA_I  , VECTOR_BETA_J  , VECTOR_BETA_K  };
+  const int coef_types[4] = {RESTRICT_CELL, RESTRICT_FACE_I, RESTRICT_FACE_J, RESTRICT_FACE_K};
+  int v;
+
+  // when a source level is given, restrict alpha[] and beta_*[] from it;
+  // with fromLevel==NULL the coefficients on this level are assumed to be set already
+  if(fromLevel != NULL){
+    for(v=0;v<4;v++){restriction(level,coef_ids[v],fromLevel,coef_ids[v],coef_types[v]);}
+  }
+
+  // fill the beta ghost zones by extrapolation (the mixed derivatives read them)
+  extrapolate_betas(level);
+  //initialize_problem(level,level->h,a,b); // approach used for testing smooth beta's; destroys the black box nature of the solver
+
+  // alpha/beta must be exchanged before Dinv is calculated; same order as the table: alpha, beta_i, beta_j, beta_k
+  for(v=0;v<4;v++){exchange_boundary(level,coef_ids[v],STENCIL_SHAPE_BOX);}
+
+  // black box rebuild of D^{-1}, l1^{-1}, dominant eigenvalue, ... (the 4 is the colors_in_each_dim argument)
+  rebuild_operator_blackbox(level,a,b,4);
+
+  // finally exchange the freshly rebuilt Dinv and L1inv
+  exchange_boundary(level,VECTOR_DINV ,STENCIL_SHAPE_BOX);
+  exchange_boundary(level,VECTOR_L1INV,STENCIL_SHAPE_BOX);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef  USE_GSRB // smoother selection: the first USE_* flag that is set in this chain decides which smoother source is compiled in
+#define GSRB_OOP // NOTE(review): presumably consumed by operators/gsrb.c to pick a variant -- confirm against that file
+#define NUM_SMOOTHS      3 // RBRBRB
+#include "operators/gsrb.c"
+#elif   USE_CHEBY
+#warning The Chebyshev smoother is currently underperforming for 4th order.  Please use -DUSE_GSRB or -DUSE_JACOBI
+#define NUM_SMOOTHS      1
+#define CHEBYSHEV_DEGREE 6 // i.e. one degree-6 polynomial smoother
+#include "operators/chebyshev.c"
+#elif   USE_JACOBI
+#define NUM_SMOOTHS      6
+#include "operators/jacobi.c"
+#elif   USE_L1JACOBI // uses the same jacobi.c source and NUM_SMOOTHS as USE_JACOBI
+#define NUM_SMOOTHS      6
+#include "operators/jacobi.c"
+#elif   USE_SYMGS
+#define NUM_SMOOTHS      2 // FBFB
+#include "operators/symgs.c"
+#else // no smoother flag set: refuse to build
+#error You must compile with either -DUSE_GSRB, -DUSE_CHEBY, -DUSE_JACOBI, -DUSE_L1JACOBI, or -DUSE_SYMGS
+#endif // smoother selection
+#include "operators/residual.c"
+#include "operators/apply_op.c"
+#include "operators/rebuild.c"
+//------------------------------------------------------------------------------------------------------------------------------
+#include "operators/blockCopy.c"
+#include "operators/misc.c"
+#include "operators/exchange_boundary.c"
+#include "operators/boundary_fv.c"
+#include "operators/restriction.c"
+#include "operators/interpolation_v2.c"
+#include "operators/interpolation_v4.c"
+//------------------------------------------------------------------------------------------------------------------------------
+void interpolation_vcycle(level_type *level_f, int id_f, double prescale_f, level_type *level_c, int id_c) { interpolation_v2(level_f, id_f, prescale_f, level_c, id_c); } // v-cycle interpolation: forwards every argument unchanged to interpolation_v2
+void interpolation_fcycle(level_type *level_f, int id_f, double prescale_f, level_type *level_c, int id_c) { interpolation_v4(level_f, id_f, prescale_f, level_c, id_c); } // f-cycle interpolation: forwards to interpolation_v4, unlike the v-cycle wrapper which uses interpolation_v2
+//------------------------------------------------------------------------------------------------------------------------------
+#include "operators/problem.fv.c"
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators.h b/Util/hpgmg/finite-volume/source/operators.h
new file mode 100644
index 00000000..847244d1
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.h
@@ -0,0 +1,52 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#ifndef OPERATORS_H // NOTE: level_type must already be declared by the includer (the operators.*.c files include level.h before this header)
+#define OPERATORS_H
+//------------------------------------------------------------------------------------------------------------------------------
+#define RESTRICT_CELL   0 // restrictionType selectors, passed as the last argument of restriction()
+#define RESTRICT_FACE_I 1
+#define RESTRICT_FACE_J 2
+#define RESTRICT_FACE_K 3
+//------------------------------------------------------------------------------------------------------------------------------
+int stencil_get_radius(void); // (void) makes this a real prototype in C; each operators.*.c file supplies its own definition
+int stencil_get_shape(void);  // returns one of the STENCIL_SHAPE_* constants
+//------------------------------------------------------------------------------------------------------------------------------
+  void                  apply_op(level_type * level, int Ax_id,  int x_id, double a, double b);
+  void                  residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b);
+  void                    smooth(level_type * level, int phi_id, int rhs_id, double a, double b);
+  void          rebuild_operator(level_type * level, level_type *fromLevel, double a, double b);
+  void rebuild_operator_blackbox(level_type * level, double a, double b, int colors_in_each_dim);
+//------------------------------------------------------------------------------------------------------------------------------
+  void               restriction(level_type * level_c, int id_c, level_type *level_f, int id_f, int restrictionType); // restrictionType is one of RESTRICT_*
+  void      interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c); // interpolation used inside a v-cycle
+  void      interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c); // interpolation used in the f-cycle to create a new initial guess for the next finer v-cycle
+//------------------------------------------------------------------------------------------------------------------------------
+  void         exchange_boundary(level_type * level, int id_a, int shape);
+  void              apply_BCs_p1(level_type * level, int x_id, int shape); // piecewise (cell centered) linear
+  void              apply_BCs_p2(level_type * level, int x_id, int shape); // piecewise (cell centered) quadratic
+  void              apply_BCs_v1(level_type * level, int x_id, int shape); // volumetric linear
+  void              apply_BCs_v2(level_type * level, int x_id, int shape); // volumetric quadratic
+  void              apply_BCs_v4(level_type * level, int x_id, int shape); // volumetric quartic
+  void         extrapolate_betas(level_type * level);
+//------------------------------------------------------------------------------------------------------------------------------
+double                       dot(level_type * level, int id_a, int id_b);
+double                      norm(level_type * level, int id_a);
+double                      mean(level_type * level, int id_a);
+double                     error(level_type * level, int id_a, int id_b);
+  void               add_vectors(level_type * level, int id_c, double scale_a, int id_a, double scale_b, int id_b);
+  void             scale_vector( level_type * level, int id_c, double scale_a, int id_a);
+  void              zero_vector( level_type * level, int id_a);
+  void             shift_vector( level_type * level, int id_c, int id_a, double shift_a);
+  void               mul_vectors(level_type * level, int id_c, double scale, int id_a, int id_b);
+  void            invert_vector( level_type * level, int id_c, double scale_a, int id_a);
+  void              init_vector( level_type * level, int id_a, double scalar);
+//------------------------------------------------------------------------------------------------------------------------------
+void                color_vector(level_type * level, int id, int colors, int icolor, int jcolor, int kcolor);
+void               random_vector(level_type * level, int id);
+//------------------------------------------------------------------------------------------------------------------------------
+  void        initialize_problem(level_type * level, double hLevel, double a, double b);
+//------------------------------------------------------------------------------------------------------------------------------
+#endif // OPERATORS_H
diff --git a/Util/hpgmg/finite-volume/source/operators.old.c b/Util/hpgmg/finite-volume/source/operators.old.c
new file mode 100644
index 00000000..9ba6db4e
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.old.c
@@ -0,0 +1,280 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+#include "timers.h"
+#include "defines.h"
+#include "level.h"
+#include "operators.h"
+//------------------------------------------------------------------------------------------------------------------------------
+#define MyPragma(a) _Pragma(#a) // stringify the argument so an OpenMP pragma can be emitted from inside a macro
+//------------------------------------------------------------------------------------------------------------------------------
+#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions, so all three loop pragmas below are threaded
+  // XL C/C++ 12.01.0000.0009 sets _OPENMP to 201107, but does not support the max clause within a _Pragma().  
+  // This issue was fixed by XL C/C++ 12.01.0000.0011
+  // If you do not have this version of XL C/C++ and run into this bug, use these commented-out macros instead (their MAX variant expands to nothing)...
+  //#warning not threading norm() calculations due to issue with XL/C, _Pragma, and reduction(max:bmax)
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     )
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) )
+  //#define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     ) // threaded only when nb>1; the level argument is not used by the expansion
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) ) // same loop pragma plus a + reduction into bsum
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(max:bmax) ) // same loop pragma plus a max reduction into bmax
+#elif _OPENMP // older OpenMP versions don't support the max reduction clause
+  #warning Threading max reductions requires OpenMP 3.1 (July 2011).  Please upgrade your compiler.                                                           
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1)                     )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    MyPragma(omp parallel for private(b) if(nb>1) schedule(static,1) reduction(  +:bsum) )
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    // expands to nothing: no pragma is emitted for max-reduction loops in this branch
+#else // flat MPI should not define any threading...
+  #define PRAGMA_THREAD_ACROSS_BLOCKS(    level,b,nb     )    // all three expand to nothing when _OPENMP is not defined
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,b,nb,bsum)    
+  #define PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,b,nb,bmax)    
+#endif // _OPENMP version selection
+//------------------------------------------------------------------------------------------------------------------------------
+#warning operators.old.c represents an older, lower performance, less performance portable approach to smoothers/residual calculation.  It is strongly suggested you use the default operator file.
+//------------------------------------------------------------------------------------------------------------------------------
+void apply_BCs(level_type *level, int x_id, int shape) { apply_BCs_p1(level, x_id, shape); } // boundary conditions for this operator file: forward to apply_BCs_p1 (declared in operators.h as piecewise cell-centered linear)
+//------------------------------------------------------------------------------------------------------------------------------
+#define Dinv_ijk() Dinv[ijk]        // simply retrieve it rather than recalculating it (Dinv and ijk must be in scope where this expands)
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef STENCIL_VARIABLE_COEFFICIENT // apply_op_ijk(x) expands in place: a, b, h2inv, ijk, jStride, kStride and the alpha/beta_* arrays must be in scope at the use site
+#ifdef USE_HELMHOLTZ // variable coefficient Helmholtz...
+  #define apply_op_ijk(x)                               \
+  (                                                     \
+    a*alpha[ijk]*x[ijk]                                 \
+   -b*h2inv*(                                           \
+      + beta_i[ijk+1      ]*( x[ijk+1      ] - x[ijk] ) \
+      + beta_i[ijk        ]*( x[ijk-1      ] - x[ijk] ) \
+      + beta_j[ijk+jStride]*( x[ijk+jStride] - x[ijk] ) \
+      + beta_j[ijk        ]*( x[ijk-jStride] - x[ijk] ) \
+      + beta_k[ijk+kStride]*( x[ijk+kStride] - x[ijk] ) \
+      + beta_k[ijk        ]*( x[ijk-kStride] - x[ijk] ) \
+    )                                                   \
+  )
+#else // variable coefficient Poisson: same six beta-weighted differences, without the a*alpha[ijk]*x[ijk] term
+  #define apply_op_ijk(x)                               \
+  (                                                     \
+    -b*h2inv*(                                          \
+      + beta_i[ijk+1      ]*( x[ijk+1      ] - x[ijk] ) \
+      + beta_i[ijk        ]*( x[ijk-1      ] - x[ijk] ) \
+      + beta_j[ijk+jStride]*( x[ijk+jStride] - x[ijk] ) \
+      + beta_j[ijk        ]*( x[ijk-jStride] - x[ijk] ) \
+      + beta_k[ijk+kStride]*( x[ijk+kStride] - x[ijk] ) \
+      + beta_k[ijk        ]*( x[ijk-kStride] - x[ijk] ) \
+    )                                                   \
+  )
+#endif // USE_HELMHOLTZ
+#else  // constant coefficient case: a*x[ijk] minus b*h2inv times (sum of the six face neighbours - 6*x[ijk])
+  #define apply_op_ijk(x)            \
+  (                                \
+    a*x[ijk] - b*h2inv*(           \
+      + x[ijk+1      ]             \
+      + x[ijk-1      ]             \
+      + x[ijk+jStride]             \
+      + x[ijk-jStride]             \
+      + x[ijk+kStride]             \
+      + x[ijk-kStride]             \
+      - x[ijk        ]*6.0         \
+    )                              \
+  )
+#endif // variable/constant coefficient
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+int stencil_get_radius(){ return 1; } // the 7pt stencil reaches out 1 point
+int stencil_get_shape(){ return STENCIL_SHAPE_STAR; } // only face neighbours are needed
+//------------------------------------------------------------------------------------------------------------------------------
+void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){
+  if(level->my_rank==0){fprintf(stdout,"  rebuilding operator for level...  h=%e  ",level->h);}
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // form restriction of alpha[], beta_*[] coefficients from fromLevel
+  if(fromLevel != NULL){
+    restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL  );
+    restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I);
+    restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J);
+    restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K);
+  } // else case assumes alpha/beta have been set
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // exchange alpha/beta/...  (must be done before calculating Dinv)
+  exchange_boundary(level,VECTOR_ALPHA ,STENCIL_SHAPE_BOX); // safe
+  exchange_boundary(level,VECTOR_BETA_I,STENCIL_SHAPE_BOX);
+  exchange_boundary(level,VECTOR_BETA_J,STENCIL_SHAPE_BOX);
+  exchange_boundary(level,VECTOR_BETA_K,STENCIL_SHAPE_BOX);
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // calculate Dinv, L1inv, and estimate the dominant Eigenvalue
+  double _timeStart = getTime();
+  int block;
+
+  double dominant_eigenvalue = -1e9;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,block,level->num_my_blocks,dominant_eigenvalue)
+  for(block=0;block<level->num_my_blocks;block++){
+    const int box = level->my_blocks[block].read.box;
+    const int ilo = level->my_blocks[block].read.i;
+    const int jlo = level->my_blocks[block].read.j;
+    const int klo = level->my_blocks[block].read.k;
+    const int ihi = level->my_blocks[block].dim.i + ilo;
+    const int jhi = level->my_blocks[block].dim.j + jlo;
+    const int khi = level->my_blocks[block].dim.k + klo;
+    int i,j,k;
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const int  ghosts = level->my_boxes[box].ghosts;
+    double h2inv = 1.0/(level->h*level->h);
+    double * __restrict__ alpha  = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride);
+    double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+    double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+    double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+    double * __restrict__   Dinv = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride);
+    double * __restrict__  L1inv = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride);
+    double block_eigenvalue = -1e9;
+
+    for(k=klo;k<khi;k++){
+    for(j=jlo;j<jhi;j++){
+    for(i=ilo;i<ihi;i++){ 
+      int ijk = i + j*jStride + k*kStride;
+
+      // used for quick linear approximation to zero dirichlet BC
+      double ilo_is_valid =1.0;
+      double ihi_is_valid =1.0;
+      double jlo_is_valid =1.0;
+      double jhi_is_valid =1.0;
+      double klo_is_valid =1.0;
+      double khi_is_valid =1.0;
+      if(level->boundary_condition.type != BC_PERIODIC){
+         if(level->my_boxes[box].low.i+i-1 <             0)ilo_is_valid = 0.0;
+         if(level->my_boxes[box].low.j+j-1 <             0)jlo_is_valid = 0.0;
+         if(level->my_boxes[box].low.k+k-1 <             0)klo_is_valid = 0.0;
+         if(level->my_boxes[box].low.i+i+1 >= level->dim.i)ihi_is_valid = 0.0;
+         if(level->my_boxes[box].low.j+j+1 >= level->dim.j)jhi_is_valid = 0.0;
+         if(level->my_boxes[box].low.k+k+1 >= level->dim.k)khi_is_valid = 0.0;
+       }
+
+      #ifdef STENCIL_VARIABLE_COEFFICIENT
+      // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements...
+      double sumAbsAij = fabs(b*h2inv) * (
+                           fabs( beta_i[ijk        ]*ilo_is_valid )+
+                           fabs( beta_j[ijk        ]*jlo_is_valid )+
+                           fabs( beta_k[ijk        ]*klo_is_valid )+
+                           fabs( beta_i[ijk+1      ]*ihi_is_valid )+
+                           fabs( beta_j[ijk+jStride]*jhi_is_valid )+
+                           fabs( beta_k[ijk+kStride]*khi_is_valid )
+                         );
+
+      // center of Gershgorin disc is the diagonal element...
+      double    Aii = a*alpha[ijk] - b*h2inv*(
+                        beta_i[ijk        ]*( ilo_is_valid-2.0 )+
+                        beta_j[ijk        ]*( jlo_is_valid-2.0 )+
+                        beta_k[ijk        ]*( klo_is_valid-2.0 )+
+                        beta_i[ijk+1      ]*( ihi_is_valid-2.0 )+
+                        beta_j[ijk+jStride]*( jhi_is_valid-2.0 )+
+                        beta_k[ijk+kStride]*( khi_is_valid-2.0 ) 
+                      );
+      #else // Constant coefficient versions with fused BC's...
+      // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements...
+      double sumAbsAij = fabs(b*h2inv) * (
+                           ilo_is_valid +
+                           jlo_is_valid +
+                           klo_is_valid +
+                           ihi_is_valid +
+                           jhi_is_valid +
+                           khi_is_valid 
+                         );
+
+      // center of Gershgorin disc is the diagonal element...
+      double    Aii = a - b*h2inv*(
+                         ilo_is_valid +
+                         jlo_is_valid +
+                         klo_is_valid +
+                         ihi_is_valid +
+                         jhi_is_valid +
+                         khi_is_valid - 12.0
+                      );
+      #endif
+
+      // calculate Dinv = D^{-1}, L1inv = ( D+D^{L1} )^{-1}, and the dominant eigenvalue...
+                             Dinv[ijk] = 1.0/Aii;				// inverse of the diagonal Aii
+                          //L1inv[ijk] = 1.0/(Aii+sumAbsAij);			// inverse of the L1 row norm... L1inv = ( D+D^{L1} )^{-1}
+      if(Aii>=1.5*sumAbsAij)L1inv[ijk] = 1.0/(Aii              ); 		// as suggested by eq 6.5 in Baker et al, "Multigrid smoothers for ultra-parallel computing: additional theory and discussion"...
+                       else L1inv[ijk] = 1.0/(Aii+0.5*sumAbsAij);		// 
+      double Di = (Aii + sumAbsAij)/Aii;if(Di>block_eigenvalue)block_eigenvalue=Di;	// upper limit to Gershgorin disc == bound on dominant eigenvalue
+    }}}
+    if(block_eigenvalue>dominant_eigenvalue){dominant_eigenvalue = block_eigenvalue;}
+  }
+  level->timers.blas1 += (double)(getTime()-_timeStart);
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // Reduce the local estimates dominant eigenvalue to a global estimate
+  #ifdef USE_MPI
+  double _timeStartAllReduce = getTime();
+  double send = dominant_eigenvalue;
+  MPI_Allreduce(&send,&dominant_eigenvalue,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD);
+  double _timeEndAllReduce = getTime();
+  level->timers.collectives   += (double)(_timeEndAllReduce-_timeStartAllReduce);
+  #endif
+  if(level->my_rank==0){fprintf(stdout,"eigenvalue_max<%e\n",dominant_eigenvalue);}
+  level->dominant_eigenvalue_of_DinvA = dominant_eigenvalue;
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // exchange Dinv/L1inv/...
+  exchange_boundary(level,VECTOR_DINV ,STENCIL_SHAPE_BOX); // safe
+  exchange_boundary(level,VECTOR_L1INV,STENCIL_SHAPE_BOX);
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+#include "operators.old/iterators.c"
+#ifdef  USE_GSRB // smoother selection: exactly one smoother source is #included here, together with its sweep-count constants
+#define NUM_SMOOTHS      2 // RBRB
+#include "operators.old/gsrb.c"
+#elif   USE_CHEBY // NOTE: #elif tests the macro's value (not definedness), so USE_CHEBY etc. must expand to a nonzero integer, as -DNAME provides
+#define NUM_SMOOTHS      1
+#define CHEBYSHEV_DEGREE 4 // i.e. one degree-4 polynomial smoother
+#include "operators.old/chebyshev.c"
+#elif   USE_JACOBI
+#define NUM_SMOOTHS      6
+#include "operators.old/jacobi.c"
+#elif   USE_L1JACOBI // same source file as USE_JACOBI; presumably jacobi.c switches behaviour on USE_L1JACOBI itself - TODO confirm
+#define NUM_SMOOTHS      6
+#include "operators.old/jacobi.c"
+#elif   USE_SYMGS
+#define NUM_SMOOTHS      1
+#include "operators.old/symgs.c"
+#else // no smoother requested: fail the build
+#error You must compile with either -DUSE_GSRB, -DUSE_CHEBY, -DUSE_JACOBI, -DUSE_L1JACOBI, or -DUSE_SYMGS
+#endif
+#include "operators.old/residual.c"
+#include "operators.old/apply_op.c"
+//------------------------------------------------------------------------------------------------------------------------------
+#include "operators/blockCopy.c"
+#include "operators/misc.c"
+#include "operators/exchange_boundary.c"
+#include "operators/boundary_fd.c"
+#include "operators/restriction.c"
+#include "operators/interpolation_p0.c"
+#include "operators/interpolation_p1.c"
+//------------------------------------------------------------------------------------------------------------------------------
+void interpolation_vcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_p0(level_f,id_f,prescale_f,level_c,id_c);} // V-cycle interpolation: forwards all arguments unchanged to interpolation_p0()
+void interpolation_fcycle(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){interpolation_p1(level_f,id_f,prescale_f,level_c,id_c);} // F-cycle interpolation: forwards all arguments unchanged to interpolation_p1()
+//------------------------------------------------------------------------------------------------------------------------------
+#include "operators/problem.p6.c"
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/chebyshev.c b/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/chebyshev.c
new file mode 100644
index 00000000..1d42e08b
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/chebyshev.c
@@ -0,0 +1,96 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+// Based on Yousef Saad's Iterative Methods for Sparse Linear Algebra, Algorithm 12.1, page 399
+//------------------------------------------------------------------------------------------------------------------------------
+void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ // Chebyshev smoother (communication-avoiding variant): iterates on vector x_id against rhs_id, ping-ponging with VECTOR_TEMP
+  if( (level->dominant_eigenvalue_of_DinvA<=0.0) && (level->my_rank==0) )fprintf(stderr,"dominant_eigenvalue_of_DinvA <= 0.0 !\n"); // diagnostic only; execution continues
+  if((CHEBYSHEV_DEGREE*NUM_SMOOTHS)&1){ // sweeps alternate x_id->VECTOR_TEMP and VECTOR_TEMP->x_id, so an odd count would leave the newest iterate in VECTOR_TEMP
+    fprintf(stderr,"error... CHEBYSHEV_DEGREE*NUM_SMOOTHS must be even for the chebyshev smoother...\n");
+    exit(1); // fatal configuration error: terminate with a failure status rather than 0
+  }
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  int box,s;
+  int ghosts = level->box_ghosts;
+  int communicationAvoiding = ghosts > stencil_get_radius(); // more ghost layers than the stencil radius => several sweeps per boundary exchange
+  // a and b are not referenced by name below; presumably the apply_op_ijk()/Dinv_ijk() macros expand to expressions that use them - TODO confirm
+  // NOTE(review): the same presumably holds for the per-box locals alpha, beta_*, Dinv, valid, h2inv, jStride and kStride, so do not rename or remove them
+  // compute the Chebyshev coefficients...
+  double beta     = 1.000*level->dominant_eigenvalue_of_DinvA; // upper end of the targeted eigenvalue interval
+//double alpha    = 0.300000*beta;
+//double alpha    = 0.250000*beta;
+//double alpha    = 0.166666*beta;
+  double alpha    = 0.125000*beta; // lower end of the targeted interval (shadowed by the per-box alpha pointer inside the box loop)
+  double theta    = 0.5*(beta+alpha);		// center of the spectral ellipse
+  double delta    = 0.5*(beta-alpha);		// major axis?
+  double sigma = theta/delta;
+  double rho_n = 1/sigma;			// rho_0
+  double chebyshev_c1[CHEBYSHEV_DEGREE];	// + c1*(x_n-x_nm1) == rho_n*rho_nm1
+  double chebyshev_c2[CHEBYSHEV_DEGREE];	// + c2*(b-Ax_n)
+  chebyshev_c1[0] = 0.0; // the first step of each polynomial has no x_nm1 contribution
+  chebyshev_c2[0] = 1/theta;
+  for(s=1;s<CHEBYSHEV_DEGREE;s++){
+    double rho_nm1 = rho_n;
+    rho_n = 1.0/(2.0*sigma - rho_nm1);
+    chebyshev_c1[s] = rho_n*rho_nm1;
+    chebyshev_c2[s] = rho_n*2.0/delta;
+  }
+
+  // NOTE(review): the ss-loop below always runs `ghosts` sweeps per outer step, so the total equals CHEBYSHEV_DEGREE*NUM_SMOOTHS only when ghosts divides it - confirm
+  // if communication-avoiding, need updated RHS for stencils in ghost zones
+  if(communicationAvoiding)exchange_boundary(level,rhs_id,0); 
+  // each outer iteration performs one boundary exchange followed by `ghosts` sweeps on every box
+  for(s=0;s<CHEBYSHEV_DEGREE*NUM_SMOOTHS;s+=ghosts){
+    // Chebyshev ping pongs between x_id and VECTOR_TEMP
+    if((s&1)==0){exchange_boundary(level,       x_id,stencil_is_star_shaped() && !communicationAvoiding);apply_BCs(level,       x_id);}
+            else{exchange_boundary(level,VECTOR_TEMP,stencil_is_star_shaped() && !communicationAvoiding);apply_BCs(level,VECTOR_TEMP);}
+    
+    // now do ghosts communication-avoiding smooths on each box...
+    uint64_t _timeStart = CycleTime();
+    PRAGMA_THREAD_ACROSS_BOXES(level,box)
+    for(box=0;box<level->num_my_boxes;box++){
+      int i,j,k,ss;
+      const int jStride = level->my_boxes[box].jStride;
+      const int kStride = level->my_boxes[box].kStride;
+      const int     dim = level->my_boxes[box].dim;
+      const double h2inv = 1.0/(level->h*level->h);
+      const double * __restrict__ rhs      = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ alpha    = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_i   = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_j   = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_k   = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ Dinv     = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ valid    = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain
+      // the first sweep after an exchange also updates ghosts-1 layers of ghost cells; each later sweep shrinks that halo by one layer
+      int ghostsToOperateOn=ghosts-1;
+      for(ss=s;ss<s+ghosts;ss++,ghostsToOperateOn--){
+              double * __restrict__ x_np1; // destination of this sweep
+        const double * __restrict__ x_n;   // current iterate (source of the stencil)
+        const double * __restrict__ x_nm1; // previous iterate; same buffer as x_np1 - NOTE(review): both are __restrict__, confirm this aliasing is acceptable to the compilers in use
+              if((ss&1)==0){x_n    = level->my_boxes[box].vectors[       x_id] + ghosts*(1+jStride+kStride);
+                            x_nm1  = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride); 
+                            x_np1  = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride);}
+                       else{x_n    = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride);
+                            x_nm1  = level->my_boxes[box].vectors[       x_id] + ghosts*(1+jStride+kStride); 
+                            x_np1  = level->my_boxes[box].vectors[       x_id] + ghosts*(1+jStride+kStride);}
+        const double c1 = chebyshev_c1[ss%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE.
+        const double c2 = chebyshev_c2[ss%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE.
+        PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
+        for(k=0-ghostsToOperateOn;k<dim+ghostsToOperateOn;k++){
+        for(j=0-ghostsToOperateOn;j<dim+ghostsToOperateOn;j++){
+        for(i=0-ghostsToOperateOn;i<dim+ghostsToOperateOn;i++){
+          int ijk = i + j*jStride + k*kStride;
+          // According to Saad... but his was missing a Dinv[ijk] == D^{-1} !!!
+          //  x_{n+1} = x_{n} + rho_{n} [ rho_{n-1}(x_{n} - x_{n-1}) + (2/delta)(b-Ax_{n}) ]
+          //  x_temp[ijk] = x_n[ijk] + c1*(x_n[ijk]-x_temp[ijk]) + c2*Dinv[ijk]*(rhs[ijk]-Ax_n);
+          double Ax_n   = apply_op_ijk(x_n);
+          double lambda =     Dinv_ijk();
+          x_np1[ijk] = x_n[ijk] + c1*(x_n[ijk]-x_nm1[ijk]) + c2*lambda*(rhs[ijk]-Ax_n);
+        }}}
+      } // ss-loop
+    } // box-loop
+    level->cycles.smooth += (uint64_t)(CycleTime()-_timeStart); // accumulate elapsed cycles for this batch of sweeps
+  } // s-loop
+}
diff --git a/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/gsrb.c b/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/gsrb.c
new file mode 100644
index 00000000..cbb9e754
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/gsrb.c
@@ -0,0 +1,90 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+//#define GSRB_STRIDE2
+//#define GSRB_FP
+//------------------------------------------------------------------------------------------------------------------------------
+void smooth(level_type * level, int phi_id, int rhs_id, double a, double b){ // in-place red-black Gauss-Seidel smoother (communication-avoiding variant) on vector phi_id with right-hand side rhs_id
+  int box,s;
+  int ghosts = level->box_ghosts;
+  int communicationAvoiding = ghosts > stencil_get_radius();  // more ghost layers than the stencil radius => several sweeps per boundary exchange
+  // a and b are not referenced by name below; presumably the apply_op_ijk()/Dinv_ijk() macros expand to expressions using them (and the per-box locals) - TODO confirm
+  // if communication-avoiding, need updated RHS for stencils in ghost zones
+  if(communicationAvoiding)exchange_boundary(level,rhs_id,0); 
+
+  for(s=0;s<2*NUM_SMOOTHS;s+=ghosts){ // there are two sweeps per GSRB smooth
+    exchange_boundary(level,phi_id,stencil_is_star_shaped() && !communicationAvoiding);
+            apply_BCs(level,phi_id);
+
+    // now do ghosts communication-avoiding smooths on each box...
+    uint64_t _timeStart = CycleTime();
+    PRAGMA_THREAD_ACROSS_BOXES(level,box)
+    for(box=0;box<level->num_my_boxes;box++){
+      int i,j,k,ss;
+      int color000 = (level->my_boxes[box].low.i^level->my_boxes[box].low.j^level->my_boxes[box].low.k)&1;  // is element 000 red or black ???  (should only be an issue if box dimension is odd)
+      const int jStride = level->my_boxes[box].jStride;
+      const int kStride = level->my_boxes[box].kStride;
+      const int     dim = level->my_boxes[box].dim;
+      const double h2inv = 1.0/(level->h*level->h);
+      const double * __restrict__ phi      = level->my_boxes[box].vectors[       phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+            double * __restrict__ phi_new  = level->my_boxes[box].vectors[       phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+      const double * __restrict__ rhs      = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ alpha    = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_i   = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_j   = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_k   = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ Dinv     = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ valid    = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain
+      const double * __restrict__ RedBlack[2] = {level->RedBlack_FP[0] + ghosts*(1+jStride), 
+                                                 level->RedBlack_FP[1] + ghosts*(1+jStride)};
+      // phi and phi_new address the same vector (in-place update); RedBlack[] is only read by the GSRB_FP variant below
+      // the first sweep after an exchange also updates ghosts-1 layers of ghost cells; each later sweep shrinks that halo by one layer
+      int ghostsToOperateOn=ghosts-1;
+      for(ss=s;ss<s+ghosts;ss++,ghostsToOperateOn--){
+        #if defined(GSRB_FP)
+        #warning GSRB using pre-computed 1.0/0.0 FP array for Red-Black to facilitate vectorization...
+        PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
+        for(k=0-ghostsToOperateOn;k<dim+ghostsToOperateOn;k++){
+        for(j=0-ghostsToOperateOn;j<dim+ghostsToOperateOn;j++){
+        for(i=0-ghostsToOperateOn;i<dim+ghostsToOperateOn;i++){
+              int EvenOdd = (k^ss^color000)&1;
+              int ij  = i + j*jStride;
+              int ijk = i + j*jStride + k*kStride;
+              double Ax     = apply_op_ijk(phi);
+              double lambda =     Dinv_ijk();
+              phi_new[ijk] = phi[ijk] + RedBlack[EvenOdd][ij]*lambda*(rhs[ijk]-Ax); // compiler seems to get confused unless there are disjoint read/write pointers
+        }}}
+        #elif defined(GSRB_STRIDE2)
+        #warning GSRB using stride-2 accesses to minimize the number of flops
+        PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
+        for(k=0-ghostsToOperateOn;k<dim+ghostsToOperateOn;k++){
+        for(j=0-ghostsToOperateOn;j<dim+ghostsToOperateOn;j++){
+        for(i=((j^k^ss^color000)&1)+1-ghosts;i<dim+ghostsToOperateOn;i+=2){ // stride-2 GSRB; NOTE(review): i starts from 1-ghosts (not -ghostsToOperateOn) and the color visited depends on the parity of ghosts - confirm against the branch variant
+              int ijk = i + j*jStride + k*kStride; 
+              double Ax     = apply_op_ijk(phi);
+              double lambda =     Dinv_ijk();
+              phi_new[ijk] = phi[ijk] + lambda*(rhs[ijk]-Ax);
+        }}}
+        #else
+        #warning GSRB using if-then-else on loop indices for Red-Black because its easy to read...
+        PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
+        for(k=0-ghostsToOperateOn;k<dim+ghostsToOperateOn;k++){
+        for(j=0-ghostsToOperateOn;j<dim+ghostsToOperateOn;j++){
+        for(i=0-ghostsToOperateOn;i<dim+ghostsToOperateOn;i++){
+        if((i^j^k^ss^color000^1)&1){ // looks very clean when [0] is i,j,k=0,0,0 
+              int ijk = i + j*jStride + k*kStride;
+              double Ax     = apply_op_ijk(phi);
+              double lambda =     Dinv_ijk();
+              phi_new[ijk] = phi[ijk] + lambda*(rhs[ijk]-Ax);
+        }}}}
+        #endif
+      } // ss-loop
+    } // boxes
+    level->cycles.smooth += (uint64_t)(CycleTime()-_timeStart); // accumulate elapsed cycles for this batch of sweeps
+  } // s-loop
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/jacobi.c b/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/jacobi.c
new file mode 100644
index 00000000..4b09da9a
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.old/aggregate.mpi/jacobi.c
@@ -0,0 +1,79 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdint.h>
+#include "../timer.h"
+//------------------------------------------------------------------------------------------------------------------------------
+void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ // weighted / L1 Jacobi smoother (communication-avoiding variant): iterates on vector x_id against rhs_id, ping-ponging with VECTOR_TEMP
+  if(NUM_SMOOTHS&1){ // sweeps alternate x_id->VECTOR_TEMP and VECTOR_TEMP->x_id, so an odd count would leave the newest iterate in VECTOR_TEMP
+    fprintf(stderr,"error - NUM_SMOOTHS must be even...\n");
+    exit(1); // fatal configuration error: terminate with a failure status rather than 0
+  }
+
+
+  int box,s;
+  int ghosts = level->box_ghosts;
+  // a and b are not referenced by name below; presumably the apply_op_ijk() macro expands to an expression that uses them (and the per-box locals) - TODO confirm
+  int communicationAvoiding = ghosts > stencil_get_radius(); // more ghost layers than the stencil radius => several sweeps per boundary exchange
+ 
+  #ifdef USE_L1JACOBI
+  double weight = 1.0; // L1 Jacobi: unweighted update, scaled by VECTOR_L1INV below
+  #else
+  double weight = 2.0/3.0; // weighted Jacobi, scaled by VECTOR_DINV below
+  #endif
+ 
+ 
+  // if communication-avoiding, need updated RHS for stencils in ghost zones
+  if(communicationAvoiding)exchange_boundary(level,rhs_id,0); 
+  // each outer iteration performs one boundary exchange followed by `ghosts` sweeps on every box
+  for(s=0;s<NUM_SMOOTHS;s+=ghosts){
+    // Jacobi ping pongs between x_id and VECTOR_TEMP
+    if((s&1)==0){exchange_boundary(level,       x_id,stencil_is_star_shaped() && !communicationAvoiding);apply_BCs(level,       x_id);}
+            else{exchange_boundary(level,VECTOR_TEMP,stencil_is_star_shaped() && !communicationAvoiding);apply_BCs(level,VECTOR_TEMP);}
+
+    // now do ghosts communication-avoiding smooths on each box...
+    uint64_t _timeStart = CycleTime();
+
+    PRAGMA_THREAD_ACROSS_BOXES(level,box)
+    for(box=0;box<level->num_my_boxes;box++){
+      int i,j,k,ss;
+      const int jStride = level->my_boxes[box].jStride;
+      const int kStride = level->my_boxes[box].kStride;
+      const int     dim = level->my_boxes[box].dim;
+      const double h2inv = 1.0/(level->h*level->h);
+      const double * __restrict__ rhs    = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ alpha  = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ valid  = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain
+      #ifdef USE_L1JACOBI
+      const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride);
+      #else
+      const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride);
+      #endif
+      int ghostsToOperateOn=ghosts-1; // the first sweep after an exchange also updates ghosts-1 layers of ghost cells; each later sweep shrinks that halo by one layer
+      for(ss=s;ss<s+ghosts;ss++,ghostsToOperateOn--){
+        const double * __restrict__ x_n;   // current iterate (source of the stencil)
+              double * __restrict__ x_np1; // destination of this sweep
+              if((ss&1)==0){x_n   = level->my_boxes[box].vectors[       x_id] + ghosts*(1+jStride+kStride);
+                            x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride);}
+                       else{x_n   = level->my_boxes[box].vectors[VECTOR_TEMP] + ghosts*(1+jStride+kStride);
+                            x_np1 = level->my_boxes[box].vectors[       x_id] + ghosts*(1+jStride+kStride);}
+        PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
+        for(k=0-ghostsToOperateOn;k<dim+ghostsToOperateOn;k++){
+        for(j=0-ghostsToOperateOn;j<dim+ghostsToOperateOn;j++){
+        for(i=0-ghostsToOperateOn;i<dim+ghostsToOperateOn;i++){
+          int ijk = i + j*jStride + k*kStride;
+          double Ax_n = apply_op_ijk(x_n);
+          x_np1[ijk] = x_n[ijk] + weight*lambda[ijk]*(rhs[ijk]-Ax_n);
+        }}}
+      } // ss-loop
+    } // box-loop
+    level->cycles.smooth += (uint64_t)(CycleTime()-_timeStart); // accumulate elapsed cycles for this batch of sweeps
+  } // s-loop
+}
+
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators.old/apply_op.c b/Util/hpgmg/finite-volume/source/operators.old/apply_op.c
new file mode 100644
index 00000000..0f7aeab9
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.old/apply_op.c
@@ -0,0 +1,40 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){  // y=Ax : writes the operator applied to vector x_id into vector Ax_id for every local box
+  // exchange the boundary of x in preparation for Ax
+  exchange_boundary(level,x_id,stencil_get_shape());
+          apply_BCs(level,x_id,stencil_get_shape());
+  // a and b are not referenced by name below; presumably the apply_op_ijk() macro expands to an expression using them (and alpha, beta_*, h2inv, jStride, kStride) - TODO confirm
+  // now do Ax proper...
+  double _timeStart = getTime();
+  const int  ghosts = level->box_ghosts;
+  const int jStride = level->box_jStride;
+  const int kStride = level->box_kStride;
+  const int     dim = level->box_dim;
+  const double h2inv = 1.0/(level->h*level->h);
+  int box;
+
+  PRAGMA_THREAD_ACROSS_BOXES(level,box)
+  for(box=0;box<level->num_my_boxes;box++){
+    int i,j,k;
+    const double * __restrict__ x      = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+          double * __restrict__ Ax     = level->my_boxes[box].vectors[        Ax_id] + ghosts*(1+jStride+kStride); 
+    const double * __restrict__ alpha  = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride);
+    const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+    const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+    const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+    // only the dim^3 non-ghost cells are written; the pointers above are already offset past the ghost zone
+    PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
+    for(k=0;k<dim;k++){
+    for(j=0;j<dim;j++){
+    for(i=0;i<dim;i++){
+      int ijk = i + j*jStride + k*kStride;
+      Ax[ijk] = apply_op_ijk(x);
+    }}}
+  }
+  level->timers.apply_op += (double)(getTime()-_timeStart); // accumulate the time spent in the stencil loops (the exchange/BC calls above are not included)
+}
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators.old/chebyshev.c b/Util/hpgmg/finite-volume/source/operators.old/chebyshev.c
new file mode 100644
index 00000000..c86be4cf
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.old/chebyshev.c
@@ -0,0 +1,93 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+// Based on Yousef Saad's Iterative Methods for Sparse Linear Algebra, Algorithm 12.1, page 399
+//------------------------------------------------------------------------------------------------------------------------------
+void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ // Chebyshev smoother: iterates on vector x_id against rhs_id, ping-ponging with VECTOR_TEMP, one boundary exchange per sweep
+  if((CHEBYSHEV_DEGREE*NUM_SMOOTHS)&1){ // sweeps alternate x_id->VECTOR_TEMP and VECTOR_TEMP->x_id, so an odd count would leave the newest iterate in VECTOR_TEMP
+    fprintf(stderr,"error... CHEBYSHEV_DEGREE*NUM_SMOOTHS must be even for the chebyshev smoother...\n");
+    exit(1); // fatal configuration error: terminate with a failure status rather than 0
+  }
+  if( (level->dominant_eigenvalue_of_DinvA<=0.0) && (level->my_rank==0) )fprintf(stderr,"dominant_eigenvalue_of_DinvA <= 0.0 !\n"); // diagnostic only; execution continues
+  // a and b are not referenced by name below; presumably the apply_op_ijk()/Dinv_ijk() macros expand to expressions that use them - TODO confirm
+  // NOTE(review): the same presumably holds for the locals alpha, beta_*, Dinv, h2inv, jStride and kStride, so do not rename or remove them
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  int box,s;
+
+
+
+  // compute the Chebyshev coefficients...
+  double beta     = 1.000*level->dominant_eigenvalue_of_DinvA; // upper end of the targeted eigenvalue interval
+//double alpha    = 0.300000*beta;
+//double alpha    = 0.250000*beta;
+//double alpha    = 0.166666*beta;
+  double alpha    = 0.125000*beta; // lower end of the targeted interval (shadowed by the per-box alpha pointer inside the box loop)
+  double theta    = 0.5*(beta+alpha);		// center of the spectral ellipse
+  double delta    = 0.5*(beta-alpha);		// major axis?
+  double sigma = theta/delta;
+  double rho_n = 1/sigma;			// rho_0
+  double chebyshev_c1[CHEBYSHEV_DEGREE];	// + c1*(x_n-x_nm1) == rho_n*rho_nm1
+  double chebyshev_c2[CHEBYSHEV_DEGREE];	// + c2*(b-Ax_n)
+  chebyshev_c1[0] = 0.0; // the first step of each polynomial has no x_nm1 contribution
+  chebyshev_c2[0] = 1/theta;
+  for(s=1;s<CHEBYSHEV_DEGREE;s++){
+    double rho_nm1 = rho_n;
+    rho_n = 1.0/(2.0*sigma - rho_nm1);
+    chebyshev_c1[s] = rho_n*rho_nm1;
+    chebyshev_c2[s] = rho_n*2.0/delta;
+  }
+
+
+  for(s=0;s<CHEBYSHEV_DEGREE*NUM_SMOOTHS;s++){
+    // get ghost zone data... Chebyshev ping pongs between x_id and VECTOR_TEMP
+    if((s&1)==0){exchange_boundary(level,       x_id,stencil_get_shape());apply_BCs(level,       x_id,stencil_get_shape());}
+            else{exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());apply_BCs(level,VECTOR_TEMP,stencil_get_shape());}
+
+    // apply the smoother... Chebyshev ping pongs between x_id and VECTOR_TEMP
+    double _timeStart = getTime();
+    const int  ghosts = level->box_ghosts;
+    const int jStride = level->box_jStride;
+    const int kStride = level->box_kStride;
+    const int     dim = level->box_dim;
+    const double h2inv = 1.0/(level->h*level->h);
+
+    PRAGMA_THREAD_ACROSS_BOXES(level,box)
+    for(box=0;box<level->num_my_boxes;box++){
+      int i,j,k;
+      const double * __restrict__ rhs      = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ alpha    = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_i   = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_j   = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_k   = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ Dinv     = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride);
+      // x_n = current iterate, x_np1 = destination; x_nm1 is the same buffer as x_np1 - NOTE(review): both are __restrict__, confirm this aliasing is acceptable to the compilers in use
+            double * __restrict__ x_np1;
+      const double * __restrict__ x_n;
+      const double * __restrict__ x_nm1;
+                       if((s&1)==0){x_n    = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride);
+                                    x_nm1  = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride); 
+                                    x_np1  = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride);}
+                               else{x_n    = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride);
+                                    x_nm1  = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride); 
+                                    x_np1  = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride);}
+      const double c1 = chebyshev_c1[s%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE.
+      const double c2 = chebyshev_c2[s%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE.
+      // only the dim^3 non-ghost cells are updated; the pointers above are already offset past the ghost zone
+      PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
+      for(k=0;k<dim;k++){
+      for(j=0;j<dim;j++){
+      for(i=0;i<dim;i++){
+        const int ijk = i + j*jStride + k*kStride;
+        // According to Saad... but his was missing a Dinv[ijk] == D^{-1} !!!
+        //  x_{n+1} = x_{n} + rho_{n} [ rho_{n-1}(x_{n} - x_{n-1}) + (2/delta)(b-Ax_{n}) ]
+        //  x_temp[ijk] = x_n[ijk] + c1*(x_n[ijk]-x_temp[ijk]) + c2*Dinv[ijk]*(rhs[ijk]-Ax_n);
+        const double Ax_n   = apply_op_ijk(x_n);
+        const double lambda =     Dinv_ijk();
+        x_np1[ijk] = x_n[ijk] + c1*(x_n[ijk]-x_nm1[ijk]) + c2*lambda*(rhs[ijk]-Ax_n);
+      }}}
+    } // box-loop
+    level->timers.smooth += (double)(getTime()-_timeStart); // accumulate the time spent in this sweep (the exchange/BC calls above are not included)
+  } // s-loop
+}
diff --git a/Util/hpgmg/finite-volume/source/operators.old/gsrb.c b/Util/hpgmg/finite-volume/source/operators.old/gsrb.c
new file mode 100644
index 00000000..bcf54475
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.old/gsrb.c
@@ -0,0 +1,133 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#if   defined(GSRB_FP) // GSRB variant selection: GSRB_FP, GSRB_STRIDE2 and GSRB_BRANCH are tested in this order and the first one defined wins
+  #warning Overriding default GSRB implementation and using pre-computed 1.0/0.0 FP array for Red-Black to facilitate vectorization...
+#elif defined(GSRB_STRIDE2)
+  #if defined(GSRB_OOP) // GSRB_OOP additionally requests the out-of-place form
+  #warning Overriding default GSRB implementation and using out-of-place and stride-2 accesses to minimize the number of flops
+  #else
+  #warning Overriding default GSRB implementation and using stride-2 accesses to minimize the number of flops
+  #endif
+#elif defined(GSRB_BRANCH)
+  #if defined(GSRB_OOP) // GSRB_OOP additionally requests the out-of-place form
+  #warning Overriding default GSRB implementation and using out-of-place implementation with an if-then-else on loop indices...
+  #else
+  #warning Overriding default GSRB implementation and using if-then-else on loop indices...
+  #endif
+#else // none of the three variant macros is defined
+#define GSRB_STRIDE2 // default implementation
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+void smooth(level_type * level, int phi_id, int rhs_id, double a, double b){ // Gauss-Seidel red-black smoother: 2*NUM_SMOOTHS colored sweeps update phi_id against rhs_id
+  int box,s;
+  for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps per GSRB smooth
+    // the parity of s enters color000 below and thereby selects which color is updated on this sweep
+    // exchange the ghost zone...
+    #ifdef GSRB_OOP // out-of-place GSRB ping pongs between x and VECTOR_TEMP
+    if((s&1)==0){exchange_boundary(level,     phi_id,stencil_get_shape());apply_BCs(level,     phi_id,stencil_get_shape());}
+            else{exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());apply_BCs(level,VECTOR_TEMP,stencil_get_shape());}
+    #else // in-place GSRB only operates on x
+                 exchange_boundary(level,     phi_id,stencil_get_shape());apply_BCs(level,     phi_id,stencil_get_shape());
+    #endif
+
+
+    // apply the smoother...
+    double _timeStart = getTime();
+    const int  ghosts = level->box_ghosts;
+    const int jStride = level->box_jStride;
+    const int kStride = level->box_kStride;
+    const int     dim = level->box_dim;
+    const double h2inv = 1.0/(level->h*level->h);
+    // NOTE(review): a, b and h2inv are not referenced by name below; presumably the apply_op_ijk()/Dinv_ijk() macros consume them - confirm
+    PRAGMA_THREAD_ACROSS_BOXES(level,box)
+    for(box=0;box<level->num_my_boxes;box++){
+      int i,j,k;
+      const int color000 = (level->my_boxes[box].low.i^level->my_boxes[box].low.j^level->my_boxes[box].low.k^s)&1;  // is element 000 red or black on *THIS* sweep
+      // every pointer below is offset by ghosts*(1+jStride+kStride) so that index [0] is the first non-ghost cell of the box
+      const double * __restrict__ rhs      = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ alpha    = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_i   = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_j   = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_k   = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ Dinv     = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride);
+      #ifdef GSRB_OOP
+      const double * __restrict__ phi;
+            double * __restrict__ phi_new;
+                     if((s&1)==0){phi      = level->my_boxes[box].vectors[       phi_id] + ghosts*(1+jStride+kStride);
+                                  phi_new  = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride);}
+                             else{phi      = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride);
+                                  phi_new  = level->my_boxes[box].vectors[       phi_id] + ghosts*(1+jStride+kStride);}
+      #else
+      const double * __restrict__ phi      = level->my_boxes[box].vectors[       phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+            double * __restrict__ phi_new  = level->my_boxes[box].vectors[       phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+      #endif
+      // NOTE(review): in the in-place build phi and phi_new hold the same address although both are __restrict__-qualified - confirm this aliasing is intended
+
+      #if defined(GSRB_FP)
+      PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
+      for(k=0;k<dim;k++){
+      for(j=0;j<dim;j++){
+      const double * __restrict__ RedBlack = level->RedBlack_FP + ghosts*(1+jStride) + kStride*((k^color000)&0x1); // plane of the mask array picked by the parity of k^color000
+      for(i=0;i<dim;i++){
+        int ij  = i + j*jStride;
+        int ijk = i + j*jStride + k*kStride;
+        double Ax     = apply_op_ijk(phi);
+        double lambda =     Dinv_ijk();
+        phi_new[ijk] = phi[ijk] + RedBlack[ij]*lambda*(rhs[ijk]-Ax);
+        //phi_new[ijk] = ((i^j^k^color000)&1) ? phi[ijk] : phi[ijk] + lambda*(rhs[ijk]-Ax);
+      }}}
+
+
+      #elif defined(GSRB_STRIDE2)
+      PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
+      for(k=0;k<dim;k++){
+      for(j=0;j<dim;j++){
+        #ifdef GSRB_OOP
+        // out-of-place must copy old value...
+        for(i=0;i<dim;i++){
+          int ijk = i + j*jStride + k*kStride; 
+          phi_new[ijk] = phi[ijk];
+        }
+        #endif
+        for(i=((j^k^color000)&1);i<dim;i+=2){ // stride-2 GSRB
+          int ijk = i + j*jStride + k*kStride; 
+          double Ax     = apply_op_ijk(phi);
+          double lambda =     Dinv_ijk();
+          phi_new[ijk] = phi[ijk] + lambda*(rhs[ijk]-Ax);
+        }
+      }}
+
+
+      #elif defined(GSRB_BRANCH)
+      PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
+      for(k=0;k<dim;k++){
+      for(j=0;j<dim;j++){
+      for(i=0;i<dim;i++){
+        int ijk = i + j*jStride + k*kStride;
+        if((i^j^k^color000^1)&1){ // looks very clean when [0] is i,j,k=0,0,0 
+          double Ax     = apply_op_ijk(phi);
+          double lambda =     Dinv_ijk();
+          phi_new[ijk] = phi[ijk] + lambda*(rhs[ijk]-Ax);
+        #ifdef GSRB_OOP
+        }else{
+          phi_new[ijk] = phi[ijk]; // copy old value when sweep color != cell color
+        #endif
+        }
+      }}}
+
+
+      #else
+      #error no GSRB implementation was specified
+      #endif
+
+
+    } // boxes
+    level->timers.smooth += (double)(getTime()-_timeStart); // smoother time is accumulated once per sweep
+  } // s-loop
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators.old/iterators.c b/Util/hpgmg/finite-volume/source/operators.old/iterators.c
new file mode 100644
index 00000000..3ea9637c
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.old/iterators.c
@@ -0,0 +1,53 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#if 0 // disabled variant: these definitions also attach OpenMP pragmas to the loop over boxes
+#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions...
+  #define PRAGMA_THREAD_ACROSS_BOXES(    level,box)           MyPragma(omp parallel for private(box)   if(level->concurrent_boxes>1) num_threads(level->concurrent_boxes)                                                       )
+  #define PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,level_sum) MyPragma(omp parallel for private(box)   if(level->concurrent_boxes>1) num_threads(level->concurrent_boxes)             reduction(  +:level_sum) schedule(static) )
+  #define PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,level_max) MyPragma(omp parallel for private(box)   if(level->concurrent_boxes>1) num_threads(level->concurrent_boxes)             reduction(max:level_max) schedule(static) )
+  #define PRAGMA_THREAD_WITHIN_A_BOX(    level,i,j,k)         MyPragma(omp parallel for private(i,j,k) if(level->threads_per_box >1) num_threads(level->threads_per_box ) collapse(2)                                           ) 
+  #define PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,box_sum) MyPragma(omp parallel for private(i,j,k) if(level->threads_per_box >1) num_threads(level->threads_per_box ) collapse(2) reduction(  +:  box_sum) schedule(static) ) 
+  #define PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_max) MyPragma(omp parallel for private(i,j,k) if(level->threads_per_box >1) num_threads(level->threads_per_box ) collapse(2) reduction(max:  box_max) schedule(static) ) 
+#elif _OPENMP // older OpenMP versions don't support the max reduction clause
+  #define PRAGMA_THREAD_ACROSS_BOXES(    level,box)           MyPragma(omp parallel for private(box)   if(level->concurrent_boxes>1) num_threads(level->concurrent_boxes) )
+  #define PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,level_sum) MyPragma(omp parallel for private(box)   if(level->concurrent_boxes>1) num_threads(level->concurrent_boxes)             reduction(  +:level_sum) schedule(static) )
+  #define PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,level_max) // max reductions need OpenMP 3.1 (July 2011): expands to nothing so the loop runs unthreaded ('#warning' is not legal inside a macro body)
+  #define PRAGMA_THREAD_WITHIN_A_BOX(    level,i,j,k)         MyPragma(omp parallel for private(i,j,k) if(level->threads_per_box >1) num_threads(level->threads_per_box ) collapse(2) ) 
+  #define PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,box_sum) MyPragma(omp parallel for private(i,j,k) if(level->threads_per_box >1) num_threads(level->threads_per_box ) collapse(2) reduction(  +:  box_sum) schedule(static) ) 
+  #define PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_max) // max reductions need OpenMP 3.1 (July 2011): expands to nothing so the loop runs unthreaded ('#warning' is not legal inside a macro body)
+#else // flat MPI should not define any threading...
+  #define PRAGMA_THREAD_ACROSS_BOXES(    level,box)          
+  #define PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,level_sum)
+  #define PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,level_max)
+  #define PRAGMA_THREAD_WITHIN_A_BOX(    level,i,j,k)        
+  #define PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,box_sum)
+  #define PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_max)
+#endif
+#else // active variant: the loop over boxes is never threaded, only the loop nest inside a box
+#if (_OPENMP>=201107) // OpenMP 3.1 supports max reductions...
+  #define PRAGMA_THREAD_ACROSS_BOXES(    level,box)           
+  #define PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,level_sum) 
+  #define PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,level_max) 
+  #define PRAGMA_THREAD_WITHIN_A_BOX(    level,i,j,k)         MyPragma(omp parallel for private(i,j,k) collapse(2)                                           ) 
+  #define PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,box_sum) MyPragma(omp parallel for private(i,j,k) collapse(2) reduction(  +:  box_sum) schedule(static) ) 
+  #define PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_max) MyPragma(omp parallel for private(i,j,k) collapse(2) reduction(max:  box_max) schedule(static) ) 
+#elif _OPENMP // older OpenMP versions don't support the max reduction clause
+  #define PRAGMA_THREAD_ACROSS_BOXES(    level,box)           
+  #define PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,level_sum) 
+  #define PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,level_max) // max reductions need OpenMP 3.1 (July 2011): expands to nothing so the loop runs unthreaded ('#warning' is not legal inside a macro body)
+  #define PRAGMA_THREAD_WITHIN_A_BOX(    level,i,j,k)         MyPragma(omp parallel for private(i,j,k) collapse(2)                                           ) 
+  #define PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,box_sum) MyPragma(omp parallel for private(i,j,k) collapse(2) reduction(  +:  box_sum) schedule(static) ) 
+  #define PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_max) // max reductions need OpenMP 3.1 (July 2011): expands to nothing so the loop runs unthreaded ('#warning' is not legal inside a macro body)
+#else // flat MPI should not define any threading...
+  #define PRAGMA_THREAD_ACROSS_BOXES(    level,box)          
+  #define PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,level_sum)
+  #define PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,level_max)
+  #define PRAGMA_THREAD_WITHIN_A_BOX(    level,i,j,k)        
+  #define PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,box_sum)
+  #define PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_max)
+#endif
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators.old/jacobi.c b/Util/hpgmg/finite-volume/source/operators.old/jacobi.c
new file mode 100644
index 00000000..c930f4e3
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.old/jacobi.c
@@ -0,0 +1,66 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdint.h>
+//------------------------------------------------------------------------------------------------------------------------------
+void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ // Jacobi smoother: NUM_SMOOTHS sweeps on x_id against rhs_id (weight 1.0 with VECTOR_L1INV if USE_L1JACOBI, else 2/3 with VECTOR_DINV)
+  if(NUM_SMOOTHS&1){
+    fprintf(stderr,"error - NUM_SMOOTHS must be even...\n");
+    exit(1); // configuration error: terminate with a failure status rather than reporting success
+  }
+  // an even sweep count is required because sweeps alternate between writing VECTOR_TEMP and x_id, and the last one must write x_id
+  #ifdef USE_L1JACOBI
+  double weight = 1.0;
+  #else
+  double weight = 2.0/3.0;
+  #endif
+ 
+  int box,s;
+  for(s=0;s<NUM_SMOOTHS;s++){
+    // exchange ghost zone data... Jacobi ping pongs between x_id and VECTOR_TEMP
+    if((s&1)==0){exchange_boundary(level,       x_id,stencil_get_shape());apply_BCs(level,       x_id,stencil_get_shape());}
+            else{exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());apply_BCs(level,VECTOR_TEMP,stencil_get_shape());}
+
+    // apply the smoother... Jacobi ping pongs between x_id and VECTOR_TEMP
+    double _timeStart = getTime();
+    const int  ghosts = level->box_ghosts;
+    const int jStride = level->box_jStride;
+    const int kStride = level->box_kStride;
+    const int     dim = level->box_dim;
+    const double h2inv = 1.0/(level->h*level->h);
+    // NOTE(review): a, b, h2inv, alpha and beta_* are not referenced by name below; presumably the apply_op_ijk() macro consumes them - confirm
+    PRAGMA_THREAD_ACROSS_BOXES(level,box)
+    for(box=0;box<level->num_my_boxes;box++){
+      int i,j,k;
+      const double * __restrict__ rhs    = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ alpha  = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+      #ifdef USE_L1JACOBI
+      const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride);
+      #else
+      const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride);
+      #endif
+        const double * __restrict__ x_n;
+              double * __restrict__ x_np1;
+                      if((s&1)==0){x_n   = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride);
+                                   x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride);}
+                              else{x_n   = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride);
+                                   x_np1 = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride);}
+      PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
+      for(k=0;k<dim;k++){
+      for(j=0;j<dim;j++){
+      for(i=0;i<dim;i++){
+        int ijk = i + j*jStride + k*kStride;
+        double Ax_n = apply_op_ijk(x_n);
+        x_np1[ijk] = x_n[ijk] + weight*lambda[ijk]*(rhs[ijk]-Ax_n); // x_{n+1} = x_n + weight*lambda*(rhs - A x_n)
+      }}}
+    } // box-loop
+    level->timers.smooth += (double)(getTime()-_timeStart);
+  } // s-loop
+}
+
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators.old/misc.c b/Util/hpgmg/finite-volume/source/operators.old/misc.c
new file mode 100644
index 00000000..fecc3f73
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.old/misc.c
@@ -0,0 +1,373 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+void zero_vector(level_type * level, int component_id){
+  // Store 0.0 into every cell of vector component_id on each local box, ghost zones included.
+  // The elapsed time is added to the level's blas1 timer.
+  const double t_begin = getTime();
+  int bx;
+  PRAGMA_THREAD_ACROSS_BOXES(level,bx)
+  for(bx=0;bx<level->num_my_boxes;bx++){
+    int ii,jj,kk;
+    const int nGhost  = level->my_boxes[bx].ghosts;
+    const int n       = level->my_boxes[bx].dim;
+    const int strideJ = level->my_boxes[bx].jStride;
+    const int strideK = level->my_boxes[bx].kStride;
+    const int upper   = n+nGhost; // exclusive upper bound of the ghost-inclusive index range
+    double * __restrict__ cells = level->my_boxes[bx].vectors[component_id] + nGhost*(1+strideJ+strideK); // [0] = first non-ghost cell
+    PRAGMA_THREAD_WITHIN_A_BOX(level,ii,jj,kk)
+    for(kk=-nGhost;kk<upper;kk++){
+    for(jj=-nGhost;jj<upper;jj++){
+    for(ii=-nGhost;ii<upper;ii++){
+      cells[ii + jj*strideJ + kk*strideK] = 0.0;
+    }}}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void init_vector(level_type * level, int component_id, double scalar){
+  // Set every non-ghost cell of vector component_id to scalar and every ghost cell to 0.0.
+  // The elapsed time is added to the level's blas1 timer.
+  const double t_begin = getTime();
+  int bx;
+  PRAGMA_THREAD_ACROSS_BOXES(level,bx)
+  for(bx=0;bx<level->num_my_boxes;bx++){
+    int ii,jj,kk;
+    const int nGhost  = level->my_boxes[bx].ghosts;
+    const int n       = level->my_boxes[bx].dim;
+    const int strideJ = level->my_boxes[bx].jStride;
+    const int strideK = level->my_boxes[bx].kStride;
+    double * __restrict__ cells = level->my_boxes[bx].vectors[component_id] + nGhost*(1+strideJ+strideK); // [0] = first non-ghost cell
+    PRAGMA_THREAD_WITHIN_A_BOX(level,ii,jj,kk)
+    for(kk=-nGhost;kk<n+nGhost;kk++){
+    for(jj=-nGhost;jj<n+nGhost;jj++){
+    for(ii=-nGhost;ii<n+nGhost;ii++){
+        // a cell is interior only when all three indices lie in [0,n)
+        const int interior = (ii>=0) && (jj>=0) && (kk>=0) && (ii<n) && (jj<n) && (kk<n);
+        cells[ii + jj*strideJ + kk*strideK] = interior ? scalar : 0.0;
+    }}}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void add_vectors(level_type * level, int id_c, double scale_a, int id_a, double scale_b, int id_b){
+  // Linear combination over the non-ghost cells of every local box: c = scale_a*a + scale_b*b.
+  // Ghost cells of id_c are not written; the elapsed time is added to the blas1 timer.
+  const double t_begin = getTime();
+  int bx;
+  PRAGMA_THREAD_ACROSS_BOXES(level,bx)
+  for(bx=0;bx<level->num_my_boxes;bx++){
+    int ii,jj,kk;
+    const int nGhost  = level->my_boxes[bx].ghosts;
+    const int n       = level->my_boxes[bx].dim;
+    const int strideJ = level->my_boxes[bx].jStride;
+    const int strideK = level->my_boxes[bx].kStride;
+    double * __restrict__ out = level->my_boxes[bx].vectors[id_c] + nGhost*(1+strideJ+strideK);
+    double * __restrict__ va  = level->my_boxes[bx].vectors[id_a] + nGhost*(1+strideJ+strideK);
+    double * __restrict__ vb  = level->my_boxes[bx].vectors[id_b] + nGhost*(1+strideJ+strideK);
+    PRAGMA_THREAD_WITHIN_A_BOX(level,ii,jj,kk)
+    for(kk=0;kk<n;kk++){
+    for(jj=0;jj<n;jj++){
+    for(ii=0;ii<n;ii++){
+        const int cell = ii + jj*strideJ + kk*strideK;
+        out[cell] = scale_a*va[cell] + scale_b*vb[cell];
+    }}}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void mul_vectors(level_type * level, int id_c, double scale, int id_a, int id_b){
+  // Scaled element-wise product over the non-ghost cells of every local box: c = scale*a*b.
+  // Ghost cells of id_c are not written; the elapsed time is added to the blas1 timer.
+  const double t_begin = getTime();
+  int bx;
+  PRAGMA_THREAD_ACROSS_BOXES(level,bx)
+  for(bx=0;bx<level->num_my_boxes;bx++){
+    int ii,jj,kk;
+    const int nGhost  = level->my_boxes[bx].ghosts;
+    const int n       = level->my_boxes[bx].dim;
+    const int strideJ = level->my_boxes[bx].jStride;
+    const int strideK = level->my_boxes[bx].kStride;
+    double * __restrict__ out = level->my_boxes[bx].vectors[id_c] + nGhost*(1+strideJ+strideK);
+    double * __restrict__ va  = level->my_boxes[bx].vectors[id_a] + nGhost*(1+strideJ+strideK);
+    double * __restrict__ vb  = level->my_boxes[bx].vectors[id_b] + nGhost*(1+strideJ+strideK);
+    PRAGMA_THREAD_WITHIN_A_BOX(level,ii,jj,kk)
+    for(kk=0;kk<n;kk++){
+    for(jj=0;jj<n;jj++){
+    for(ii=0;ii<n;ii++){
+        const int cell = ii + jj*strideJ + kk*strideK;
+        out[cell] = scale*va[cell]*vb[cell];
+    }}}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void invert_vector(level_type * level, int id_c, double scale_a, int id_a){
+  // Scaled element-wise reciprocal over the non-ghost cells of every local box: c[] = scale_a/a[].
+  // Entries of id_a are divided by without any zero check; elapsed time is added to the blas1 timer.
+  const double t_begin = getTime();
+  int bx;
+  PRAGMA_THREAD_ACROSS_BOXES(level,bx)
+  for(bx=0;bx<level->num_my_boxes;bx++){
+    int ii,jj,kk;
+    const int nGhost  = level->my_boxes[bx].ghosts;
+    const int n       = level->my_boxes[bx].dim;
+    const int strideJ = level->my_boxes[bx].jStride;
+    const int strideK = level->my_boxes[bx].kStride;
+    double * __restrict__ out = level->my_boxes[bx].vectors[id_c] + nGhost*(1+strideJ+strideK);
+    double * __restrict__ src = level->my_boxes[bx].vectors[id_a] + nGhost*(1+strideJ+strideK);
+    PRAGMA_THREAD_WITHIN_A_BOX(level,ii,jj,kk)
+    for(kk=0;kk<n;kk++){
+    for(jj=0;jj<n;jj++){
+    for(ii=0;ii<n;ii++){
+        const int cell = ii + jj*strideJ + kk*strideK;
+        out[cell] = scale_a/src[cell];
+    }}}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void scale_vector(level_type * level, int id_c, double scale_a, int id_a){
+  // Element-wise scaling over the non-ghost cells of every local box: c[] = scale_a*a[].
+  // Ghost cells of id_c are not written; the elapsed time is added to the blas1 timer.
+  const double t_begin = getTime();
+  int bx;
+  PRAGMA_THREAD_ACROSS_BOXES(level,bx)
+  for(bx=0;bx<level->num_my_boxes;bx++){
+    int ii,jj,kk;
+    const int nGhost  = level->my_boxes[bx].ghosts;
+    const int n       = level->my_boxes[bx].dim;
+    const int strideJ = level->my_boxes[bx].jStride;
+    const int strideK = level->my_boxes[bx].kStride;
+    double * __restrict__ out = level->my_boxes[bx].vectors[id_c] + nGhost*(1+strideJ+strideK);
+    double * __restrict__ src = level->my_boxes[bx].vectors[id_a] + nGhost*(1+strideJ+strideK);
+    PRAGMA_THREAD_WITHIN_A_BOX(level,ii,jj,kk)
+    for(kk=0;kk<n;kk++){
+    for(jj=0;jj<n;jj++){
+    for(ii=0;ii<n;ii++){
+        const int cell = ii + jj*strideJ + kk*strideK;
+        out[cell] = scale_a*src[cell];
+    }}}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+double dot(level_type * level, int id_a, int id_b){
+  // Inner product of vectors id_a and id_b over the non-ghost cells of every local box.
+  // When USE_MPI is defined the per-process sums are combined over level->MPI_COMM_ALLREDUCE.
+  const double t_begin = getTime();
+  int bx;
+  double level_sum = 0.0;
+  // FIX, schedule(static) is a stand in to guarantee reproducibility...
+  PRAGMA_THREAD_ACROSS_BOXES_SUM(level,bx,level_sum)
+  for(bx=0;bx<level->num_my_boxes;bx++){
+    int ii,jj,kk;
+    const int nGhost  = level->my_boxes[bx].ghosts;
+    const int n       = level->my_boxes[bx].dim;
+    const int strideJ = level->my_boxes[bx].jStride;
+    const int strideK = level->my_boxes[bx].kStride;
+    double * __restrict__ va = level->my_boxes[bx].vectors[id_a] + nGhost*(1+strideJ+strideK); // [0] = first non-ghost cell
+    double * __restrict__ vb = level->my_boxes[bx].vectors[id_b] + nGhost*(1+strideJ+strideK);
+    double box_sum = 0.0;
+    PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,ii,jj,kk,box_sum)
+    for(kk=0;kk<n;kk++){
+    for(jj=0;jj<n;jj++){
+    for(ii=0;ii<n;ii++){
+      const int cell = ii + jj*strideJ + kk*strideK;
+      box_sum += va[cell]*vb[cell];
+    }}}
+    level_sum += box_sum;
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+
+  #ifdef USE_MPI
+  // combine the per-process partial sums; this time is charged to the collectives timer
+  const double t_reduce = getTime();
+  double local_sum = level_sum;
+  MPI_Allreduce(&local_sum,&level_sum,1,MPI_DOUBLE,MPI_SUM,level->MPI_COMM_ALLREDUCE);
+  level->timers.collectives   += (double)(getTime()-t_reduce);
+  #endif
+
+  return(level_sum);
+}
+
+//------------------------------------------------------------------------------------------------------------------------------
+double norm(level_type * level, int component_id){ // implements the max norm
+  // Largest absolute value found in the non-ghost cells of vector component_id.
+  // When USE_MPI is defined the per-process maxima are combined over level->MPI_COMM_ALLREDUCE.
+  const double t_begin = getTime();
+  int bx;
+  double level_max = 0.0;
+  // FIX, schedule(static) is a stand in to guarantee reproducibility...
+  PRAGMA_THREAD_ACROSS_BOXES_MAX(level,bx,level_max)
+  for(bx=0;bx<level->num_my_boxes;bx++){
+    int ii,jj,kk;
+    const int nGhost  = level->my_boxes[bx].ghosts;
+    const int n       = level->my_boxes[bx].dim;
+    const int strideJ = level->my_boxes[bx].jStride;
+    const int strideK = level->my_boxes[bx].kStride;
+    double * __restrict__ cells = level->my_boxes[bx].vectors[component_id] + nGhost*(1+strideJ+strideK); // [0] = first non-ghost cell
+    double box_max = 0.0;
+    PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,ii,jj,kk,box_max)
+    for(kk=0;kk<n;kk++){
+    for(jj=0;jj<n;jj++){
+    for(ii=0;ii<n;ii++){
+      const double magnitude = fabs(cells[ii + jj*strideJ + kk*strideK]);
+      box_max = (magnitude>box_max) ? magnitude : box_max; // running maximum for this box
+    }}}
+    if(box_max>level_max){level_max = box_max;}
+  } // box list
+  level->timers.blas1 += (double)(getTime()-t_begin);
+
+  #ifdef USE_MPI
+  // combine the per-process maxima; this time is charged to the collectives timer
+  const double t_reduce = getTime();
+  double local_max = level_max;
+  MPI_Allreduce(&local_max,&level_max,1,MPI_DOUBLE,MPI_MAX,level->MPI_COMM_ALLREDUCE);
+  level->timers.collectives   += (double)(getTime()-t_reduce);
+  #endif
+  return(level_max);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+double mean(level_type * level, int id_a){
+  // Sum of vector id_a over the non-ghost cells of every local box, divided by
+  // level->dim.i*level->dim.j*level->dim.k.  With USE_MPI the sums are first combined across processes.
+  const double t_begin = getTime();
+  int bx;
+  double level_sum = 0.0;
+  PRAGMA_THREAD_ACROSS_BOXES_SUM(level,bx,level_sum)
+  for(bx=0;bx<level->num_my_boxes;bx++){
+    int ii,jj,kk;
+    const int nGhost  = level->my_boxes[bx].ghosts;
+    const int n       = level->my_boxes[bx].dim;
+    const int strideJ = level->my_boxes[bx].jStride;
+    const int strideK = level->my_boxes[bx].kStride;
+    double * __restrict__ cells = level->my_boxes[bx].vectors[id_a] + nGhost*(1+strideJ+strideK); // [0] = first non-ghost cell
+    double box_sum = 0.0;
+    PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,ii,jj,kk,box_sum)
+    for(kk=0;kk<n;kk++){
+    for(jj=0;jj<n;jj++){
+    for(ii=0;ii<n;ii++){
+      const int cell = ii + jj*strideJ + kk*strideK;
+      box_sum += cells[cell];
+    }}}
+    level_sum += box_sum;
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+
+  #ifdef USE_MPI
+  // combine the per-process partial sums; this time is charged to the collectives timer
+  const double t_reduce = getTime();
+  double local_sum = level_sum;
+  MPI_Allreduce(&local_sum,&level_sum,1,MPI_DOUBLE,MPI_SUM,level->MPI_COMM_ALLREDUCE);
+  level->timers.collectives   += (double)(getTime()-t_reduce);
+  #endif
+
+  // cell count is formed in double precision from the three level dimensions
+  const double cell_count = (double)level->dim.i*(double)level->dim.j*(double)level->dim.k;
+  return(level_sum / cell_count);
+}
+
+
+void shift_vector(level_type * level, int id_c, int id_a, double shift_a){
+  // Element-wise offset over the non-ghost cells of every local box: c[] = a[] + shift_a.
+  // Ghost cells of id_c are not written; the elapsed time is added to the blas1 timer.
+  const double t_begin = getTime();
+  int bx;
+  PRAGMA_THREAD_ACROSS_BOXES(level,bx)
+  for(bx=0;bx<level->num_my_boxes;bx++){
+    int ii,jj,kk;
+    const int nGhost  = level->my_boxes[bx].ghosts;
+    const int n       = level->my_boxes[bx].dim;
+    const int strideJ = level->my_boxes[bx].jStride;
+    const int strideK = level->my_boxes[bx].kStride;
+    // both pointers are offset so that [0] is the first non-ghost cell
+    double * __restrict__ out = level->my_boxes[bx].vectors[id_c] + nGhost*(1+strideJ+strideK);
+    double * __restrict__ src = level->my_boxes[bx].vectors[id_a] + nGhost*(1+strideJ+strideK);
+    PRAGMA_THREAD_WITHIN_A_BOX(level,ii,jj,kk)
+    for(kk=0;kk<n;kk++){
+    for(jj=0;jj<n;jj++){
+    for(ii=0;ii<n;ii++){
+      const int cell = ii + jj*strideJ + kk*strideK;
+      out[cell] = src[cell] + shift_a;
+    }}}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+//------------------------------------------------------------------------------------------------------------------------------
+double error(level_type * level, int id_a, int id_b){
+  // Returns the max norm of (id_a - id_b).  VECTOR_TEMP is overwritten as scratch space.
+  // A normalized L2 error would instead be sqrt( dot(level,VECTOR_TEMP,VECTOR_TEMP)*h*h*h ) with h = level->h.
+  add_vectors(level,VECTOR_TEMP,1.0,id_a,-1.0,id_b);            // VECTOR_TEMP = id_a - id_b
+  return(norm(level,VECTOR_TEMP));                              // max norm of error function
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void color_vector(level_type * level, int id, int colors_in_each_dim, int icolor, int jcolor, int kcolor){
+  // Writes 1.0 to each non-ghost cell whose (index + box low corner + color offset) is a multiple of colors_in_each_dim in all three dimensions, 0.0 elsewhere.
+  double _timeStart = getTime();
+  int box;
+  PRAGMA_THREAD_ACROSS_BOXES(level,box)
+  for(box=0;box<level->num_my_boxes;box++){
+    int i,j,k;
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const int  ghosts = level->my_boxes[box].ghosts;
+    const int     dim = level->my_boxes[box].dim;
+    const int boxlowi = level->my_boxes[box].low.i, boxlowj = level->my_boxes[box].low.j, boxlowk = level->my_boxes[box].low.k; // low corner of this box (was undeclared)
+    double * __restrict__ grid = level->my_boxes[box].vectors[id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+    PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
+    for(k=0;k<dim;k++){
+    for(j=0;j<dim;j++){
+    for(i=0;i<dim;i++){ // color test lives in the innermost body so the k/j loops stay perfectly nested for collapse(2)
+      grid[i + j*jStride + k*kStride] = ( (((i+boxlowi+icolor)%colors_in_each_dim)==0) && (((j+boxlowj+jcolor)%colors_in_each_dim)==0) && (((k+boxlowk+kcolor)%colors_in_each_dim)==0) ) ? 1.0 : 0.0; // if colors_in_each_dim==1 (don't color), all cells are set to 1.0
+    }}}
+  }
+  level->timers.blas1 += (double)(getTime()-_timeStart);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void random_vector(level_type * level, int id){
+  // Deterministic fill of the non-ghost cells of vector id: no random number generator is called,
+  // each cell gets -0.5 plus the XOR of its box-local indices with 1.  Time is added to the blas1 timer.
+  const double t_begin = getTime();
+  int bx;
+  PRAGMA_THREAD_ACROSS_BOXES(level,bx)
+  for(bx=0;bx<level->num_my_boxes;bx++){
+    int ii,jj,kk;
+    const int nGhost  = level->my_boxes[bx].ghosts;
+    const int n       = level->my_boxes[bx].dim;
+    const int strideJ = level->my_boxes[bx].jStride;
+    const int strideK = level->my_boxes[bx].kStride;
+    double * __restrict__ cells = level->my_boxes[bx].vectors[id] + nGhost*(1+strideJ+strideK); // [0] = first non-ghost cell
+    PRAGMA_THREAD_WITHIN_A_BOX(level,ii,jj,kk)
+    for(kk=0;kk<n;kk++){
+    for(jj=0;jj<n;jj++){
+    for(ii=0;ii<n;ii++){
+      cells[ii + jj*strideJ + kk*strideK] = -0.500 + 1.0*(ii^jj^kk^0x1);
+    }}}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators.old/residual.c b/Util/hpgmg/finite-volume/source/operators.old/residual.c
new file mode 100644
index 00000000..03f75f27
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.old/residual.c
@@ -0,0 +1,44 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+// calculate res_id = rhs_id - A(x_id)
+
+void residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b){ // a,b are not referenced explicitly below; presumably consumed by name inside apply_op_ijk() - TODO confirm
+  // fill the ghost zones of x (neighbor exchange, then boundary conditions) in prep for Ax...
+  exchange_boundary(level,x_id,stencil_get_shape());
+          apply_BCs(level,x_id,stencil_get_shape());
+
+  // now compute the residual proper (no restriction is performed in this routine)...
+  double _timeStart = getTime();
+  const int  ghosts = level->box_ghosts;
+  const int jStride = level->box_jStride;
+  const int kStride = level->box_kStride;
+  const int     dim = level->box_dim;
+  const double h2inv = 1.0/(level->h*level->h); // 1/h^2; not referenced explicitly below, presumably used by apply_op_ijk() - TODO confirm
+  int box;
+
+  PRAGMA_THREAD_ACROSS_BOXES(level,box)
+  for(box=0;box<level->num_my_boxes;box++){
+    int i,j,k;
+    const double * __restrict__ x      = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+    const double * __restrict__ rhs    = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride);
+    const double * __restrict__ alpha  = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); // alpha and beta_{i,j,k} are never named below; presumably picked up by name inside apply_op_ijk() - TODO confirm
+    const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+    const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+    const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+          double * __restrict__ res    = level->my_boxes[box].vectors[       res_id] + ghosts*(1+jStride+kStride); // output: the only vector written by this routine
+
+    PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
+    for(k=0;k<dim;k++){
+    for(j=0;j<dim;j++){
+    for(i=0;i<dim;i++){
+      int ijk = i + j*jStride + k*kStride; // linear offset of box-local cell (i,j,k)
+      double Ax = apply_op_ijk(x); // operator applied to x at this cell; apply_op_ijk is defined elsewhere
+      res[ijk] = rhs[ijk]-Ax;
+    }}}
+  }
+  level->timers.residual += (double)(getTime()-_timeStart); // only the compute loop is charged to the residual timer; the exchange/BC calls above ran before _timeStart
+}
+
diff --git a/Util/hpgmg/finite-volume/source/operators.old/symgs.c b/Util/hpgmg/finite-volume/source/operators.old/symgs.c
new file mode 100644
index 00000000..51eebeeb
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators.old/symgs.c
@@ -0,0 +1,59 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+void smooth(level_type * level, int phi_id, int rhs_id, double a, double b){ // in-place smoother on phi_id; a,b are not referenced explicitly below, presumably consumed by name inside apply_op_ijk() - TODO confirm
+  int box,s;
+
+  for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps (forward/backward) per GS smooth
+    exchange_boundary(level,phi_id,stencil_get_shape()); // ghost zones of phi are refreshed before every sweep
+            apply_BCs(level,phi_id,stencil_get_shape());
+
+    // now do one sweep (forward when s is even, backward when s is odd) over the non-ghost cells of each box...
+    double _timeStart = getTime();
+    const int  ghosts = level->box_ghosts;
+    const int jStride = level->box_jStride;
+    const int kStride = level->box_kStride;
+    const int     dim = level->box_dim;
+    const double h2inv = 1.0/(level->h*level->h); // 1/h^2; not referenced explicitly below, presumably used by apply_op_ijk() - TODO confirm
+
+    #ifdef _OPENMP
+    #pragma omp parallel for
+    #endif
+    for(box=0;box<level->num_my_boxes;box++){ // boxes are processed in parallel when OpenMP is enabled; the sweep inside a box is sequential
+      int i,j,k;
+            double * __restrict__ phi      = level->my_boxes[box].vectors[       phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+      const double * __restrict__ rhs      = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ alpha    = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); // alpha and beta_{i,j,k} are never named below; presumably picked up by name inside apply_op_ijk() - TODO confirm
+      const double * __restrict__ beta_i   = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_j   = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_k   = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ Dinv     = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride);
+          
+
+      if( (s&0x1)==0 ){ // forward sweep... hard to thread
+        for(k=0;k<dim;k++){
+        for(j=0;j<dim;j++){
+        for(i=0;i<dim;i++){
+          int ijk = i + j*jStride + k*kStride;
+          double Ax = apply_op_ijk(phi); // evaluated on the partially updated phi (phi is overwritten in place on the next line)
+          phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax);
+        }}}
+      }else{ // backward sweep... hard to thread
+        for(k=dim-1;k>=0;k--){
+        for(j=dim-1;j>=0;j--){
+        for(i=dim-1;i>=0;i--){
+          int ijk = i + j*jStride + k*kStride;
+          double Ax = apply_op_ijk(phi); // same update as the forward sweep, visiting cells in the reverse order
+          phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax);
+        }}}
+      }
+
+    } // boxes
+    level->timers.smooth += (double)(getTime()-_timeStart); // only the sweep is charged to the smooth timer; the exchange/BC calls above ran before _timeStart
+  } // s-loop
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators/apply_op.c b/Util/hpgmg/finite-volume/source/operators/apply_op.c
new file mode 100644
index 00000000..bc3a1c98
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/apply_op.c
@@ -0,0 +1,48 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+// Applies the linear operator specified in the apply_op_ijk macro to vector x_id and stores the result in Ax_id
+// This requires exchanging a ghost zone and/or enforcing a boundary condition.
+// NOTE, Ax_id and x_id must be distinct
+void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){ // a,b are not referenced explicitly below; presumably consumed by name inside apply_op_ijk() - TODO confirm
+  // exchange the boundary of x in preparation for Ax
+  exchange_boundary(level,x_id,stencil_get_shape());
+          apply_BCs(level,x_id,stencil_get_shape());
+
+  // now do Ax proper...
+  double _timeStart = getTime();
+  int block;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks)
+  for(block=0;block<level->num_my_blocks;block++){ // each entry of level->my_blocks[] names a box and a sub-region of it
+    const int box = level->my_blocks[block].read.box;
+    const int ilo = level->my_blocks[block].read.i; // read.{i,j,k} = low corner of the region...
+    const int jlo = level->my_blocks[block].read.j;
+    const int klo = level->my_blocks[block].read.k;
+    const int ihi = level->my_blocks[block].dim.i + ilo; // ...and low corner + dim.{i,j,k} = exclusive upper bound
+    const int jhi = level->my_blocks[block].dim.j + jlo;
+    const int khi = level->my_blocks[block].dim.k + klo;
+    int i,j,k;
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const int  ghosts = level->my_boxes[box].ghosts;
+    const double h2inv = 1.0/(level->h*level->h); // 1/h^2; not referenced explicitly below, presumably used by apply_op_ijk() - TODO confirm
+    const double * __restrict__ x      = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+          double * __restrict__ Ax     = level->my_boxes[box].vectors[        Ax_id] + ghosts*(1+jStride+kStride); 
+    const double * __restrict__ alpha  = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); // alpha and beta_{i,j,k} are never named below; presumably picked up by name inside apply_op_ijk() - TODO confirm
+    const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+    const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+    const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+
+    for(k=klo;k<khi;k++){
+    for(j=jlo;j<jhi;j++){
+    for(i=ilo;i<ihi;i++){
+      int ijk = i + j*jStride + k*kStride; // linear offset of cell (i,j,k) within the box
+      Ax[ijk] = apply_op_ijk(x); // x and Ax are both __restrict__, hence the requirement that Ax_id and x_id be distinct
+    }}}
+  }
+  level->timers.apply_op += (double)(getTime()-_timeStart); // only the compute loop is charged to the apply_op timer
+}
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators/blockCopy.c b/Util/hpgmg/finite-volume/source/operators/blockCopy.c
new file mode 100644
index 00000000..bd387c2c
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/blockCopy.c
@@ -0,0 +1,136 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+static inline void CopyBlock(level_type *level, int id, blockCopy_type *block){
+  // copy the dim_i x dim_j x dim_k region starting at read_i,j,k of read[] to the region starting at write_i,j,k in write[]
+  int   dim_i       = block->dim.i;
+  int   dim_j       = block->dim.j;
+  int   dim_k       = block->dim.k;
+
+  int  read_i       = block->read.i;
+  int  read_j       = block->read.j;
+  int  read_k       = block->read.k;
+  int  read_jStride = block->read.jStride; // default strides/pointer come from the block descriptor; overridden below when read.box>=0
+  int  read_kStride = block->read.kStride;
+
+  int write_i       = block->write.i;
+  int write_j       = block->write.j;
+  int write_k       = block->write.k;
+  int write_jStride = block->write.jStride; // default strides/pointer come from the block descriptor; overridden below when write.box>=0
+  int write_kStride = block->write.kStride;
+
+  const double * __restrict__  read = block->read.ptr;
+        double * __restrict__ write = block->write.ptr;
+
+  if(block->read.box >=0){ // source is a box on this level: use that box's strides and vector 'id', offset past the ghost zone
+     read_jStride = level->my_boxes[block->read.box ].jStride;
+     read_kStride = level->my_boxes[block->read.box ].kStride;
+     read = level->my_boxes[ block->read.box].vectors[id] + level->box_ghosts*(1+ read_jStride+ read_kStride);
+  }
+  if(block->write.box>=0){ // destination is a box on this level: use that box's strides and vector 'id', offset past the ghost zone
+    write_jStride = level->my_boxes[block->write.box].jStride;
+    write_kStride = level->my_boxes[block->write.box].kStride;
+    write = level->my_boxes[block->write.box].vectors[id] + level->box_ghosts*(1+write_jStride+write_kStride);
+  }
+
+
+  int i,j,k;
+  if(dim_i==1){ // single cell in i: drop the inner i loop entirely
+    for(k=0;k<dim_k;k++){
+    for(j=0;j<dim_j;j++){
+      int  read_ijk = ( read_i) + (j+ read_j)* read_jStride + (k+ read_k)* read_kStride;
+      int write_ijk = (write_i) + (j+write_j)*write_jStride + (k+write_k)*write_kStride;
+      write[write_ijk] = read[read_ijk];
+    }}
+  }else if(dim_i==2){ // two cells in i: inner loop manually unrolled
+    for(k=0;k<dim_k;k++){
+    for(j=0;j<dim_j;j++){
+      int  read_ijk = ( read_i) + (j+ read_j)* read_jStride + (k+ read_k)* read_kStride;
+      int write_ijk = (write_i) + (j+write_j)*write_jStride + (k+write_k)*write_kStride;
+      write[write_ijk+0] = read[read_ijk+0];
+      write[write_ijk+1] = read[read_ijk+1];
+    }}
+  }else if(dim_i==4){ // four cells in i: inner loop manually unrolled
+    for(k=0;k<dim_k;k++){
+    for(j=0;j<dim_j;j++){
+      int  read_ijk = ( read_i) + (j+ read_j)* read_jStride + (k+ read_k)* read_kStride;
+      int write_ijk = (write_i) + (j+write_j)*write_jStride + (k+write_k)*write_kStride;
+      write[write_ijk+0] = read[read_ijk+0];
+      write[write_ijk+1] = read[read_ijk+1];
+      write[write_ijk+2] = read[read_ijk+2];
+      write[write_ijk+3] = read[read_ijk+3];
+    }}
+  }else if(dim_j==1){ // single plane in j: drop the j loop
+    for(k=0;k<dim_k;k++){
+    for(i=0;i<dim_i;i++){
+      int  read_ijk = (i+ read_i) + ( read_j)* read_jStride + (k+ read_k)* read_kStride;
+      int write_ijk = (i+write_i) + (write_j)*write_jStride + (k+write_k)*write_kStride;
+      write[write_ijk] = read[read_ijk];
+    }}
+  }else if(dim_k==1){ // single plane in k: drop the k loop
+    for(j=0;j<dim_j;j++){
+    for(i=0;i<dim_i;i++){
+      int  read_ijk = (i+ read_i) + (j+ read_j)* read_jStride + ( read_k)* read_kStride;
+      int write_ijk = (i+write_i) + (j+write_j)*write_jStride + (write_k)*write_kStride;
+      write[write_ijk] = read[read_ijk];
+    }}
+  }else{ // general case: full triple loop, i fastest
+    for(k=0;k<dim_k;k++){
+    for(j=0;j<dim_j;j++){
+    for(i=0;i<dim_i;i++){
+      int  read_ijk = (i+ read_i) + (j+ read_j)* read_jStride + (k+ read_k)* read_kStride;
+      int write_ijk = (i+write_i) + (j+write_j)*write_jStride + (k+write_k)*write_kStride;
+      write[write_ijk] = read[read_ijk];
+    }}}
+  }
+
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+static inline void IncrementBlock(level_type *level, int id, double prescale, blockCopy_type *block){
+  // accumulate (not copy): write[] = prescale*write[] + read[] over the dim_i x dim_j x dim_k region starting at read_i,j,k of read[] and write_i,j,k of write[]
+  int   dim_i       = block->dim.i;
+  int   dim_j       = block->dim.j;
+  int   dim_k       = block->dim.k;
+
+  int  read_i       = block->read.i;
+  int  read_j       = block->read.j;
+  int  read_k       = block->read.k;
+  int  read_jStride = block->read.jStride; // default strides/pointer come from the block descriptor; overridden below when read.box>=0
+  int  read_kStride = block->read.kStride;
+
+  int write_i       = block->write.i;
+  int write_j       = block->write.j;
+  int write_k       = block->write.k;
+  int write_jStride = block->write.jStride; // default strides/pointer come from the block descriptor; overridden below when write.box>=0
+  int write_kStride = block->write.kStride;
+
+  const double * __restrict__  read = block->read.ptr;
+        double * __restrict__ write = block->write.ptr;
+
+  if(block->read.box >=0){ // source is a box on this level: use that box's strides and vector 'id', offset past the ghost zone
+     read_jStride = level->my_boxes[block->read.box ].jStride;
+     read_kStride = level->my_boxes[block->read.box ].kStride;
+     read = level->my_boxes[ block->read.box].vectors[id] + level->box_ghosts*(1+ read_jStride+ read_kStride);
+  }
+  if(block->write.box>=0){ // destination is a box on this level: use that box's strides and vector 'id', offset past the ghost zone
+    write_jStride = level->my_boxes[block->write.box].jStride;
+    write_kStride = level->my_boxes[block->write.box].kStride;
+    write = level->my_boxes[block->write.box].vectors[id] + level->box_ghosts*(1+write_jStride+write_kStride);
+  }
+
+  int i,j,k;
+  for(k=0;k<dim_k;k++){
+  for(j=0;j<dim_j;j++){
+  for(i=0;i<dim_i;i++){
+    int  read_ijk = (i+ read_i) + (j+ read_j)* read_jStride + (k+ read_k)* read_kStride;
+    int write_ijk = (i+write_i) + (j+write_j)*write_jStride + (k+write_k)*write_kStride;
+    write[write_ijk] = prescale*write[write_ijk] + read[read_ijk]; // CAREFUL !!!  you must guarantee you zero'd the MPI buffers(write[]) and destination boxes at some point to avoid 0.0*NaN or 0.0*inf
+  }}}
+
+}
+
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators/boundary_fd.c b/Util/hpgmg/finite-volume/source/operators/boundary_fd.c
new file mode 100644
index 00000000..4d94bb58
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/boundary_fd.c
@@ -0,0 +1,205 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+void apply_BCs_p1(level_type * level, int x_id, int shape){
+  // For cell-centered, we need to fill in the ghost zones to apply any BC's
+  // This code does a simple piecewise linear interpolation for homogeneous dirichlet (0 on boundary)
+  // Nominally, this is first performed across faces, then to edges, then to corners.
+  // In this implementation, these three steps are fused: each ghost cell is set to +/- the value of one interior neighbor
+  //
+  //   . . . . . . . . . .        . . . . . . . . . .
+  //   .       .       .          .       .       .
+  //   .   ?   .   ?   .          .+x(0,0).-x(0,0).
+  //   .       .       .          .       .       .
+  //   . . . . +---0---+--        . . . . +-------+--
+  //   .       |       |          .       |       |
+  //   .   ?   0 x(0,0)|          .-x(0,0)| x(0,0)|
+  //   .       |       |          .       |       |
+  //   . . . . +-------+--        . . . . +-------+--
+  //   .       |       |          .       |       |
+  //
+  //
+  if(shape>=STENCIL_MAX_SHAPES)shape=STENCIL_SHAPE_BOX;  // shape must be < STENCIL_MAX_SHAPES in order to safely index into boundary_condition.blocks[]
+  if(level->boundary_condition.type == BC_PERIODIC)return; // no BC's to apply !
+
+  const int   faces[27] = {0,0,0,0,1,0,0,0,0,  0,1,0,1,0,1,0,1,0,  0,0,0,0,1,0,0,0,0}; // lookup tables over the 27 direction codes: 6 entries flagged as faces,
+  const int   edges[27] = {0,1,0,1,0,1,0,1,0,  1,0,1,0,0,0,1,0,1,  0,1,0,1,0,1,0,1,0}; // 12 as edges,
+  const int corners[27] = {1,0,1,0,0,0,1,0,1,  0,0,0,0,0,0,0,0,0,  1,0,1,0,0,0,1,0,1}; // and 8 as corners; no code is flagged in more than one table (code 13 is in none)
+
+  int buffer;
+  double _timeStart = getTime();
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->boundary_condition.num_blocks[shape])
+  for(buffer=0;buffer<level->boundary_condition.num_blocks[shape];buffer++){
+    double scale = 1.0;
+    if(  faces[level->boundary_condition.blocks[shape][buffer].subtype])scale=-1.0; // the tables are mutually exclusive, so at most one of these three assignments fires:
+    if(  edges[level->boundary_condition.blocks[shape][buffer].subtype])scale= 1.0; // face -> -1, edge -> +1, corner -> -1
+    if(corners[level->boundary_condition.blocks[shape][buffer].subtype])scale=-1.0;
+
+    int i,j,k;
+    const int       box = level->boundary_condition.blocks[shape][buffer].read.box; 
+    const int     dim_i = level->boundary_condition.blocks[shape][buffer].dim.i;
+    const int     dim_j = level->boundary_condition.blocks[shape][buffer].dim.j;
+    const int     dim_k = level->boundary_condition.blocks[shape][buffer].dim.k;
+    const int       ilo = level->boundary_condition.blocks[shape][buffer].read.i;
+    const int       jlo = level->boundary_condition.blocks[shape][buffer].read.j;
+    const int       klo = level->boundary_condition.blocks[shape][buffer].read.k;
+    const int normal = 26-level->boundary_condition.blocks[shape][buffer].subtype; // invert the normal vector
+ 
+    // the ghost cells written and the cells read are both in vector x_id of the same box (read.box)
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    double * __restrict__  x = level->my_boxes[box].vectors[x_id] + level->my_boxes[box].ghosts*(1+jStride+kStride);
+
+    // convert normal vector into pointer offsets... (normal is decoded as a base-3 number with digits i,j,k, each shifted to -1/0/+1)
+    const int di = (((normal % 3)  )-1);
+    const int dj = (((normal % 9)/3)-1);
+    const int dk = (((normal / 9)  )-1);
+    const int stride = di + dj*jStride + dk*kStride; // offset from a ghost cell to the neighbor one step along the inverted normal
+
+    if(dim_i==1){ // region is one cell thick in i: no i loop
+      for(k=0;k<dim_k;k++){
+      for(j=0;j<dim_j;j++){
+        int ijk = (  ilo) + (j+jlo)*jStride + (k+klo)*kStride;
+        x[ijk] = scale*x[ijk+stride]; // homogeneous linear = 1pt stencil
+      }}
+    }else if(dim_j==1){ // region is one cell thick in j: no j loop
+      for(k=0;k<dim_k;k++){
+      for(i=0;i<dim_i;i++){
+        int ijk = (i+ilo) + (  jlo)*jStride + (k+klo)*kStride;
+        x[ijk] = scale*x[ijk+stride]; // homogeneous linear = 1pt stencil
+      }}
+    }else if(dim_k==1){ // region is one cell thick in k: no k loop
+      for(j=0;j<dim_j;j++){
+      for(i=0;i<dim_i;i++){
+        int ijk = (i+ilo) + (j+jlo)*jStride + (  klo)*kStride;
+        x[ijk] = scale*x[ijk+stride]; // homogeneous linear = 1pt stencil
+      }}
+    }else{ // general case: full triple loop
+      for(k=0;k<dim_k;k++){
+      for(j=0;j<dim_j;j++){
+      for(i=0;i<dim_i;i++){
+        int ijk = (i+ilo) + (j+jlo)*jStride + (k+klo)*kStride;
+        x[ijk] = scale*x[ijk+stride]; // homogeneous linear = 1pt stencil
+      }}}
+    }
+
+  }
+  level->timers.boundary_conditions += (double)(getTime()-_timeStart);
+}
+
+//------------------------------------------------------------------------------------------------------------------------------
+void apply_BCs_p2(level_type * level, int x_id, int shape){
+  // For cell-centered, we need to fill in the ghost zones to apply any BC's
+  // This code does a simple piecewise quadratic interpolation for homogeneous dirichlet (0 on boundary)
+  // Each boundary block is handled by exactly one of the face, edge or corner stencils below, selected by its normal.
+  //
+  if(shape>=STENCIL_MAX_SHAPES)shape=STENCIL_SHAPE_BOX;  // shape must be < STENCIL_MAX_SHAPES in order to safely index into boundary_condition.blocks[]
+  if(level->boundary_condition.type == BC_PERIODIC)return; // no BC's to apply !
+  if(level->box_dim<2){apply_BCs_p1(level,x_id,shape);return;} // the stencils below read cells at 1 and 2 steps along the normal; fall back to the 1pt version for 1-cell boxes
+
+  const int   faces[27] = {0,0,0,0,1,0,0,0,0,  0,1,0,1,0,1,0,1,0,  0,0,0,0,1,0,0,0,0}; // lookup tables over the 27 direction codes: 6 entries flagged as faces,
+  const int   edges[27] = {0,1,0,1,0,1,0,1,0,  1,0,1,0,0,0,1,0,1,  0,1,0,1,0,1,0,1,0}; // 12 as edges,
+  const int corners[27] = {1,0,1,0,0,0,1,0,1,  0,0,0,0,0,0,0,0,0,  1,0,1,0,0,0,1,0,1}; // and 8 as corners; no code is flagged in more than one table (code 13 is in none)
+
+  int buffer;
+  double _timeStart = getTime();
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->boundary_condition.num_blocks[shape])
+  for(buffer=0;buffer<level->boundary_condition.num_blocks[shape];buffer++){
+    int i,j,k;
+    const int       box = level->boundary_condition.blocks[shape][buffer].read.box; 
+    const int     dim_i = level->boundary_condition.blocks[shape][buffer].dim.i;
+    const int     dim_j = level->boundary_condition.blocks[shape][buffer].dim.j;
+    const int     dim_k = level->boundary_condition.blocks[shape][buffer].dim.k;
+    const int       ilo = level->boundary_condition.blocks[shape][buffer].read.i;
+    const int       jlo = level->boundary_condition.blocks[shape][buffer].read.j;
+    const int       klo = level->boundary_condition.blocks[shape][buffer].read.k;
+    const int normal = 26-level->boundary_condition.blocks[shape][buffer].subtype; // invert the normal vector
+ 
+    // the ghost cells written and the cells read are both in vector x_id of the same box (read.box)
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    double * __restrict__  x = level->my_boxes[box].vectors[x_id] + level->my_boxes[box].ghosts*(1+jStride+kStride);
+
+    // convert normal vector into pointer offsets... (NOTE: unlike apply_BCs_p1, di/dj/dk here are already multiplied by the strides)
+    const int di = (((normal % 3)  )-1)*1;
+    const int dj = (((normal % 9)/3)-1)*jStride;
+    const int dk = (((normal / 9)  )-1)*kStride;
+
+    if(faces[normal]){
+      //
+      //      /------/------/------/
+      //     /  ??  /  -2  /  1/3 /
+      //    /------/------/------/
+      //
+      const int stride = di+dj+dk; // one step along the inverted normal
+      const int stride2 = stride*2; // two steps along the inverted normal
+      for(k=0;k<dim_k;k++){
+      for(j=0;j<dim_j;j++){
+      for(i=0;i<dim_i;i++){
+        int ijk = (i+ilo) + (j+jlo)*jStride + (k+klo)*kStride;
+        x[ijk] = -2.0*x[ijk+stride] + 0.333333333333333333*x[ijk+stride2]; // 2pt stencil
+      }}}
+    }else if(edges[normal]){
+      //
+      //                 /------/------/
+      //                / -2/3 /  1/9 /
+      //               /------/------/
+      //              /   4  / -2/3 /
+      //      /------/------/------/
+      //     /  ??  /
+      //    /------/
+      //
+      int dr=-1; // placeholders; overwritten below with the two nonzero offsets
+      int ds=-1; // (for an edge normal exactly one of di,dj,dk is expected to be 0)
+      if(di==0){dr=dj;ds=dk;}
+      if(dj==0){dr=di;ds=dk;}
+      if(dk==0){dr=di;ds=dj;}
+      for(k=0;k<dim_k;k++){
+      for(j=0;j<dim_j;j++){
+      for(i=0;i<dim_i;i++){
+        // 4pt stencil...
+        int ijk = (i+ilo) + (j+jlo)*jStride + (k+klo)*kStride;
+        x[ijk] =   4.000000000000000000*x[ijk+  dr+  ds] 
+                 - 0.666666666666666667*x[ijk+2*dr+  ds]
+                 - 0.666666666666666667*x[ijk+  dr+2*ds]
+                 + 0.111111111111111111*x[ijk+2*dr+2*ds];
+      }}}
+    }else if(corners[normal]){
+      //
+      //              /------/------/
+      //             / -2/9 / 1/27 /.
+      //            /------/------/ .
+      //           /  4/3 / -2/9 /  .
+      //          /------/------/   .
+      //          .                 .
+      //          .   /------/------/
+      //          .  /  4/3 / -2/9 /.
+      //          . /------/------/ .
+      //          ./  -8  /  4/3 /  .
+      //          /------/------/   .
+      //          .                 .
+      //          .   /------/------/
+      //          .  /   0  /   0  /
+      //          . /------/------/
+      //          ./   0  /   0  /
+      //   /------/------/------/
+      //  /  ??  /
+      // /------/
+      //
+      // 8pt stencil... NOTE: only the single cell at (ilo,jlo,klo) is written; dim_i/j/k are not looped over in this branch
+      int ijk = (ilo) + (jlo)*jStride + (klo)*kStride;
+      x[ijk] =  -8.000000000000000000*x[ijk+  di+  dj+  dk] 
+                +1.333333333333333333*x[ijk+2*di+  dj+  dk] 
+                +1.333333333333333333*x[ijk+  di+2*dj+  dk] 
+                +1.333333333333333333*x[ijk+  di+  dj+2*dk] 
+                -0.222222222222222222*x[ijk+2*di+2*dj+  dk] 
+                -0.222222222222222222*x[ijk+  di+2*dj+2*dk] 
+                -0.222222222222222222*x[ijk+2*di+  dj+2*dk] 
+                +0.037037037037037037*x[ijk+2*di+2*dj+2*dk];
+    }
+
+  }
+  level->timers.boundary_conditions += (double)(getTime()-_timeStart);
+}
diff --git a/Util/hpgmg/finite-volume/source/operators/boundary_fv.c b/Util/hpgmg/finite-volume/source/operators/boundary_fv.c
new file mode 100644
index 00000000..180aba93
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/boundary_fv.c
@@ -0,0 +1,683 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+void apply_BCs_v1(level_type * level, int x_id, int shape){
+  // Fills the ghost zones of vector x_id (cell-centered data) for every block listed in level->boundary_condition.blocks[shape].
+  // This code does a simple linear interpolation for homogeneous dirichlet (0 on boundary):
+  //   each ghost cell becomes scale * (the cell one step along the inverted block normal), with scale = -1 on faces, +1 on edges, -1 on corners.
+  // Nominally, this is first performed across faces, then to edges, then to corners.  In this implementation, these three steps are fused
+  // Sketch (2D slice): left = ghost cells '?' to be filled next to x(0,0) where the boundary value is 0; right = the values written.
+  //   . . . . . . . . . .        . . . . . . . . . .
+  //   .       .       .          .       .       .
+  //   .   ?   .   ?   .          .+x(0,0).-x(0,0).
+  //   .       .       .          .       .       .
+  //   . . . . +---0---+--        . . . . +-------+--
+  //   .       |       |          .       |       |
+  //   .   ?   0 x(0,0)|          .-x(0,0)| x(0,0)|
+  //   .       |       |          .       |       |
+  //   . . . . +-------+--        . . . . +-------+--
+  //   .       |       |          .       |       |
+  // shape >= STENCIL_MAX_SHAPES is replaced by STENCIL_SHAPE_BOX.  Nothing is done when the BC type is BC_PERIODIC.
+  // The elapsed time is accumulated into level->timers.boundary_conditions.
+  if(shape>=STENCIL_MAX_SHAPES)shape=STENCIL_SHAPE_BOX;  // shape must be < STENCIL_MAX_SHAPES in order to safely index into boundary_condition.blocks[]
+  if(level->boundary_condition.type == BC_PERIODIC)return; // no BC's to apply !
+
+  const int   faces[27] = {0,0,0,0,1,0,0,0,0,  0,1,0,1,0,1,0,1,0,  0,0,0,0,1,0,0,0,0}; // lookup tables indexed by block subtype: 1 marks a face...
+  const int   edges[27] = {0,1,0,1,0,1,0,1,0,  1,0,1,0,0,0,1,0,1,  0,1,0,1,0,1,0,1,0}; // ...an edge...
+  const int corners[27] = {1,0,1,0,0,0,1,0,1,  0,0,0,0,0,0,0,0,0,  1,0,1,0,0,0,1,0,1}; // ...or a corner
+
+  int buffer;
+  double _timeStart = getTime();
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->boundary_condition.num_blocks[shape])
+  for(buffer=0;buffer<level->boundary_condition.num_blocks[shape];buffer++){
+    double scale = 1.0; // sign applied to the value copied into the ghost cell (chosen by subtype below)
+    if(  faces[level->boundary_condition.blocks[shape][buffer].subtype])scale=-1.0;
+    if(  edges[level->boundary_condition.blocks[shape][buffer].subtype])scale= 1.0;
+    if(corners[level->boundary_condition.blocks[shape][buffer].subtype])scale=-1.0;
+
+    int i,j,k;
+    const int       box = level->boundary_condition.blocks[shape][buffer].read.box; 
+    const int     dim_i = level->boundary_condition.blocks[shape][buffer].dim.i;
+    const int     dim_j = level->boundary_condition.blocks[shape][buffer].dim.j;
+    const int     dim_k = level->boundary_condition.blocks[shape][buffer].dim.k;
+    const int       ilo = level->boundary_condition.blocks[shape][buffer].read.i;
+    const int       jlo = level->boundary_condition.blocks[shape][buffer].read.j;
+    const int       klo = level->boundary_condition.blocks[shape][buffer].read.k;
+    const int normal = 26-level->boundary_condition.blocks[shape][buffer].subtype; // invert the normal vector
+ 
+    // hard code for box to box BC's 
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    double * __restrict__  x = level->my_boxes[box].vectors[x_id] + level->box_ghosts*(1+jStride+kStride); // base pointer advanced by box_ghosts cells in i, j and k
+
+    // convert normal vector into pointer offsets... (di,dj,dk are each -1, 0 or +1 for normal in 0..26)
+    const int di = (((normal % 3)  )-1);
+    const int dj = (((normal % 9)/3)-1);
+    const int dk = (((normal / 9)  )-1);
+    const int stride = di + dj*jStride + dk*kStride; // linear offset from a ghost cell to the cell it reads
+
+    if(dim_i==1){ // block is one cell wide in i: no i loop
+      for(k=0;k<dim_k;k++){
+      for(j=0;j<dim_j;j++){
+        int ijk = (  ilo) + (j+jlo)*jStride + (k+klo)*kStride;
+        x[ijk] = scale*x[ijk+stride]; // homogeneous linear = 1pt stencil
+      }}
+    }else if(dim_j==1){ // block is one cell wide in j: no j loop
+      for(k=0;k<dim_k;k++){
+      for(i=0;i<dim_i;i++){
+        int ijk = (i+ilo) + (  jlo)*jStride + (k+klo)*kStride;
+        x[ijk] = scale*x[ijk+stride]; // homogeneous linear = 1pt stencil
+      }}
+    }else if(dim_k==1){ // block is one cell wide in k: no k loop
+      for(j=0;j<dim_j;j++){
+      for(i=0;i<dim_i;i++){
+        int ijk = (i+ilo) + (j+jlo)*jStride + (  klo)*kStride;
+        x[ijk] = scale*x[ijk+stride]; // homogeneous linear = 1pt stencil
+      }}
+    }else{ // general case: full triple loop over the block
+      for(k=0;k<dim_k;k++){
+      for(j=0;j<dim_j;j++){
+      for(i=0;i<dim_i;i++){
+        int ijk = (i+ilo) + (j+jlo)*jStride + (k+klo)*kStride;
+        x[ijk] = scale*x[ijk+stride]; // homogeneous linear = 1pt stencil
+      }}}
+    }
+
+  }
+  level->timers.boundary_conditions += (double)(getTime()-_timeStart);
+}
+
+//------------------------------------------------------------------------------------------------------------------------------
+// For cell-centered/averaged, one must fill in a ghost zone in order to apply a boundary condition
+// The argument shape indicates on which regions of the domain (not the individual boxes) must the boundary condition be enforced.
+//   If shape exceeds the range of defined shapes (shape>=STENCIL_MAX_SHAPES), STENCIL_SHAPE_BOX is used: the boundary condition will be applied to all faces, edges, and corners
+// This code performs a simple quadratic volume averages extrapolation for homogeneous dirichlet (0 on boundary) on vector x_id of level
+//   Nominally, this is first performed across faces, then to edges, then to corners.  
+//   In this implementation, these three steps are fused
+// This code will apply the BC only to the first ghost zone.  Subsequent (2nd, 3rd, ...) ghost zones will be zero'd (the zeroing pass runs only when box_ghosts>1)
+// This code will drop order if one attempts to apply quadratic BC's to boxes of less than 2^3 (box_dim<2 falls back to apply_BCs_v1).  Nothing is done for BC_PERIODIC.
+void apply_BCs_v2(level_type * level, int x_id, int shape){
+  const int box_dim    = level->box_dim;
+  const int box_ghosts = level->box_ghosts;
+  if(shape>=STENCIL_MAX_SHAPES)shape=STENCIL_SHAPE_BOX;  // shape must be < STENCIL_MAX_SHAPES in order to safely index into boundary_condition.blocks[]
+  if(level->boundary_condition.type == BC_PERIODIC)return; // no BC's to apply !
+  if(level->box_dim<2){apply_BCs_v1(level,x_id,shape);return;}
+
+  const int   faces[27] = {0,0,0,0,1,0,0,0,0,  0,1,0,1,0,1,0,1,0,  0,0,0,0,1,0,0,0,0}; // lookup tables indexed by block subtype: 1 marks a face...
+  const int   edges[27] = {0,1,0,1,0,1,0,1,0,  1,0,1,0,0,0,1,0,1,  0,1,0,1,0,1,0,1,0}; // ...an edge...
+  const int corners[27] = {1,0,1,0,0,0,1,0,1,  0,0,0,0,0,0,0,0,0,  1,0,1,0,0,0,1,0,1}; // ...or a corner
+
+  int buffer;
+  double _timeStart = getTime();
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->boundary_condition.num_blocks[shape])
+  for(buffer=0;buffer<level->boundary_condition.num_blocks[shape];buffer++){
+    int i,j,k;
+    const int       box = level->boundary_condition.blocks[shape][buffer].read.box; 
+    const int     dim_i = level->boundary_condition.blocks[shape][buffer].dim.i;
+    const int     dim_j = level->boundary_condition.blocks[shape][buffer].dim.j;
+    const int     dim_k = level->boundary_condition.blocks[shape][buffer].dim.k;
+    const int       ilo = level->boundary_condition.blocks[shape][buffer].read.i;
+    const int       jlo = level->boundary_condition.blocks[shape][buffer].read.j;
+    const int       klo = level->boundary_condition.blocks[shape][buffer].read.k;
+    const int   subtype = level->boundary_condition.blocks[shape][buffer].subtype;
+  //const int    normal = 26-subtype;
+ 
+    // hard code for box to box BC's 
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const double * __restrict__ x  = level->my_boxes[box].vectors[x_id] + level->box_ghosts*(1+jStride+kStride);
+          double * __restrict__ xn = level->my_boxes[box].vectors[x_id] + level->box_ghosts*(1+jStride+kStride); // physically the same, but use different pointers for read/write
+
+    // zero out entire ghost region when not all points will be updated...
+    if(box_ghosts>1){
+    for(k=0;k<dim_k;k++){
+    for(j=0;j<dim_j;j++){
+    for(i=0;i<dim_i;i++){
+      int ijk = (i+ilo) + (j+jlo)*jStride + (k+klo)*kStride;
+      xn[ijk] = 0.0;
+    }}}}
+
+    // apply the appropriate BC subtype (face, edge, corner)...
+    if(faces[subtype]){
+      // ghost value '?' = -5/2 * (first cell inside the box) + 1/2 * (second cell inside the box)
+      //    :.......|.......:.......:
+      //    :   ?   | -5/2  :  1/2  :
+      //    :.......|.......:.......:
+      // r,s sweep the cells of the face; t is the ghost-plane index along the face normal (-1 or box_dim); dt steps one cell into the box
+      int r=-1,rStride=-1,dim_r=-1,rlo=-1;
+      int s=-1,sStride=-1,dim_s=-1,slo=-1;
+      int t=-1,tStride=-1,dt=-1;
+    
+      // the 2-point stencil can point in 6 different directions...
+      switch(subtype){
+        case  4:rlo=ilo;dim_r=dim_i;rStride=      1;slo=jlo;dim_s=dim_j;sStride=jStride;t=     -1;tStride=kStride;dt= tStride;break; // ij face, low k
+        case 10:rlo=ilo;dim_r=dim_i;rStride=      1;slo=klo;dim_s=dim_k;sStride=kStride;t=     -1;tStride=jStride;dt= tStride;break; // ik face, low j
+        case 12:rlo=jlo;dim_r=dim_j;rStride=jStride;slo=klo;dim_s=dim_k;sStride=kStride;t=     -1;tStride=      1;dt= tStride;break; // jk face, low i
+        case 14:rlo=jlo;dim_r=dim_j;rStride=jStride;slo=klo;dim_s=dim_k;sStride=kStride;t=box_dim;tStride=      1;dt=-tStride;break; // jk face, high i
+        case 16:rlo=ilo;dim_r=dim_i;rStride=      1;slo=klo;dim_s=dim_k;sStride=kStride;t=box_dim;tStride=jStride;dt=-tStride;break; // ik face, high j
+        case 22:rlo=ilo;dim_r=dim_i;rStride=      1;slo=jlo;dim_s=dim_j;sStride=jStride;t=box_dim;tStride=kStride;dt=-tStride;break; // ij face, high k
+      }
+      // FIX... optimize for rStride==1 (unit-stride)
+      for(s=0;s<dim_s;s++){
+      for(r=0;r<dim_r;r++){
+        int ijk = (r+rlo)*rStride + (s+slo)*sStride + (t)*tStride;
+        xn[ijk] = -2.5*x[ijk+dt] + 0.5*x[ijk+2*dt];
+      }}
+    }else
+    if(edges[subtype]){
+      // r runs along the edge; s and t are the two ghost indices (-1 or box_dim); ds and dt each step one cell into the box
+      //          r   +---+---+ dt
+      //          ^  /   /   /|/
+      //          | +---+---+ |
+      //          |/   /   /|/
+      //      +---+---+---+ |
+      //     /   /|   |   |/
+      //    +---+ |---+---+---> ds
+      //    |   |/   
+      //    +---+    
+      // the weights below are the products of the face weights (-5/2, 1/2) taken in s and in t
+      int r=-1,rStride=-1,dim_r=-1,rlo=-1;
+      int s=-1,sStride=-1,ds=-1;
+      int t=-1,tStride=-1,dt=-1;
+      // the 4-point stencil can point in 12 different directions...
+      switch(subtype){
+        case  1:rlo=ilo;dim_r=dim_i;rStride=      1;s=     -1;sStride=jStride;t=     -1;tStride=kStride;ds= sStride;dt= tStride;break; // i-edge,  low j,  low k
+        case  3:rlo=jlo;dim_r=dim_j;rStride=jStride;s=     -1;sStride=      1;t=     -1;tStride=kStride;ds= sStride;dt= tStride;break; // j-edge,  low i,  low k
+        case  5:rlo=jlo;dim_r=dim_j;rStride=jStride;s=box_dim;sStride=      1;t=     -1;tStride=kStride;ds=-sStride;dt= tStride;break; // j-edge, high i,  low k
+        case  7:rlo=ilo;dim_r=dim_i;rStride=      1;s=box_dim;sStride=jStride;t=     -1;tStride=kStride;ds=-sStride;dt= tStride;break; // i-edge, high j,  low k
+        case  9:rlo=klo;dim_r=dim_k;rStride=kStride;s=     -1;sStride=      1;t=     -1;tStride=jStride;ds= sStride;dt= tStride;break; // k-edge,  low i,  low j
+        case 11:rlo=klo;dim_r=dim_k;rStride=kStride;s=box_dim;sStride=      1;t=     -1;tStride=jStride;ds=-sStride;dt= tStride;break; // k-edge, high i,  low j
+        case 15:rlo=klo;dim_r=dim_k;rStride=kStride;s=     -1;sStride=      1;t=box_dim;tStride=jStride;ds= sStride;dt=-tStride;break; // k-edge,  low i, high j
+        case 17:rlo=klo;dim_r=dim_k;rStride=kStride;s=box_dim;sStride=      1;t=box_dim;tStride=jStride;ds=-sStride;dt=-tStride;break; // k-edge, high i, high j
+        case 19:rlo=ilo;dim_r=dim_i;rStride=      1;s=     -1;sStride=jStride;t=box_dim;tStride=kStride;ds= sStride;dt=-tStride;break; // i-edge,  low j, high k
+        case 21:rlo=jlo;dim_r=dim_j;rStride=jStride;s=     -1;sStride=      1;t=box_dim;tStride=kStride;ds= sStride;dt=-tStride;break; // j-edge,  low i, high k
+        case 23:rlo=jlo;dim_r=dim_j;rStride=jStride;s=box_dim;sStride=      1;t=box_dim;tStride=kStride;ds=-sStride;dt=-tStride;break; // j-edge, high i, high k
+        case 25:rlo=ilo;dim_r=dim_i;rStride=      1;s=box_dim;sStride=jStride;t=box_dim;tStride=kStride;ds=-sStride;dt=-tStride;break; // i-edge, high j, high k
+      }
+      // FIX... optimize for rStride==1 (unit-stride)
+      for(r=0;r<dim_r;r++){
+        int ijk = (r+rlo)*rStride + (s)*sStride + (t)*tStride;
+        xn[ijk] =   6.25*x[ijk+  ds+  dt] 
+                  - 1.25*x[ijk+2*ds+  dt]
+                  - 1.25*x[ijk+  ds+2*dt]
+                  + 0.25*x[ijk+2*ds+2*dt];
+      }
+    }else
+    if(corners[subtype]){
+      // a single ghost cell at (i,j,k), each index being -1 or box_dim; di, dj and dk each step one cell into the box
+      //                  +---+---+
+      //                 /   /   /|
+      //                +---+---+ |
+      //               /   /   /|/|
+      //              +---+---+ | | 
+      //              |   |   |/|/
+      //              +---+---+ |
+      //              |   |   |/
+      //          +---+---+---+
+      //         /   /|
+      //        +---+ |
+      //        |   |/
+      //        +---+
+      // the weights below are the products of the face weights (-5/2, 1/2) taken in i, j and k
+      int i=-1,di=-1;
+      int j=-1,dj=-1;
+      int k=-1,dk=-1;
+      // the 8-point stencil can point in 8 different directions...
+      switch(subtype){
+        case  0:i=     -1;j=     -1;k=     -1;di= 1;dj= jStride;dk= kStride;break; //  low i,  low j,  low k
+        case  2:i=box_dim;j=     -1;k=     -1;di=-1;dj= jStride;dk= kStride;break; // high i,  low j,  low k
+        case  6:i=     -1;j=box_dim;k=     -1;di= 1;dj=-jStride;dk= kStride;break; //  low i, high j,  low k
+        case  8:i=box_dim;j=box_dim;k=     -1;di=-1;dj=-jStride;dk= kStride;break; // high i, high j,  low k
+        case 18:i=     -1;j=     -1;k=box_dim;di= 1;dj= jStride;dk=-kStride;break; //  low i,  low j, high k
+        case 20:i=box_dim;j=     -1;k=box_dim;di=-1;dj= jStride;dk=-kStride;break; // high i,  low j, high k
+        case 24:i=     -1;j=box_dim;k=box_dim;di= 1;dj=-jStride;dk=-kStride;break; //  low i, high j, high k
+        case 26:i=box_dim;j=box_dim;k=box_dim;di=-1;dj=-jStride;dk=-kStride;break; // high i, high j, high k
+      }
+      int ijk = (i) + (j)*jStride + (k)*kStride;
+      xn[ijk] =  -15.625*x[ijk+  di+  dj+  dk] 
+                 + 3.125*x[ijk+2*di+  dj+  dk] 
+                 + 3.125*x[ijk+  di+2*dj+  dk] 
+                 + 3.125*x[ijk+  di+  dj+2*dk] 
+                 - 0.625*x[ijk+2*di+2*dj+  dk] 
+                 - 0.625*x[ijk+  di+2*dj+2*dk] 
+                 - 0.625*x[ijk+2*di+  dj+2*dk] 
+                 + 0.125*x[ijk+2*di+2*dj+2*dk];
+    }
+  }
+  level->timers.boundary_conditions += (double)(getTime()-_timeStart);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// For cell-centered/averaged, one must fill in a ghost zone in order to apply a boundary condition
+// The argument shape indicates on which regions of the domain (not the individual boxes) must the boundary condition be enforced.
+//   If shape exceeds the range of defined shapes, the boundary condition will be applied to all faces, edges, and corners
+// This code performs a simple quartic volume averages extrapolation for homogeneous dirichlet (0 on boundary)
+//   Nominally, this is first performed across faces, then to edges, then to corners.  
+//   In this implementation, these three steps are fused
+// It is considered an error to call this routine if the domain has fewer than two ghost zones
+// This code will drop order if one attempts to apply quartic BC's to boxes of less than 4^3
+void apply_BCs_v4(level_type * level, int x_id, int shape){
+  const int box_dim    = level->box_dim;
+  const int box_ghosts = level->box_ghosts;
+  if(shape>=STENCIL_MAX_SHAPES)shape=STENCIL_SHAPE_BOX;  // shape must be < STENCIL_MAX_SHAPES in order to safely index into boundary_condition.blocks[]
+  if(level->boundary_condition.type == BC_PERIODIC)return; // no BC's to apply !
+  if(box_ghosts<2){fprintf(stderr,"called quartic BC's with only 1 ghost zone!!!\n");exit(0);}
+//if(box_dim   <4){fprintf(stderr,"called quartic BC's with boxes < 4^3 \n");exit(0);}
+  if(box_dim   <4){apply_BCs_v2(level,x_id,shape);return;} // FIX... is it safe to drop order on the boundary on coarse grids ??
+
+  const int   faces[27] = {0,0,0,0,1,0,0,0,0,  0,1,0,1,0,1,0,1,0,  0,0,0,0,1,0,0,0,0};
+  const int   edges[27] = {0,1,0,1,0,1,0,1,0,  1,0,1,0,0,0,1,0,1,  0,1,0,1,0,1,0,1,0};
+  const int corners[27] = {1,0,1,0,0,0,1,0,1,  0,0,0,0,0,0,0,0,0,  1,0,1,0,0,0,1,0,1};
+
+  int buffer;
+  double _timeStart = getTime();
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->boundary_condition.num_blocks[shape])
+  for(buffer=0;buffer<level->boundary_condition.num_blocks[shape];buffer++){
+    int i,j,k;
+    const int       box = level->boundary_condition.blocks[shape][buffer].read.box; 
+    const int     dim_i = level->boundary_condition.blocks[shape][buffer].dim.i;
+    const int     dim_j = level->boundary_condition.blocks[shape][buffer].dim.j;
+    const int     dim_k = level->boundary_condition.blocks[shape][buffer].dim.k;
+    const int       ilo = level->boundary_condition.blocks[shape][buffer].read.i;
+    const int       jlo = level->boundary_condition.blocks[shape][buffer].read.j;
+    const int       klo = level->boundary_condition.blocks[shape][buffer].read.k;
+    const int   subtype = level->boundary_condition.blocks[shape][buffer].subtype;
+  //const int    normal = 26-subtype;
+ 
+    // hard code for box to box BC's 
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const double * __restrict__ x  = level->my_boxes[box].vectors[x_id] + level->box_ghosts*(1+jStride+kStride);
+          double * __restrict__ xn = level->my_boxes[box].vectors[x_id] + level->box_ghosts*(1+jStride+kStride); // physically the same, but use different pointers for read/write
+
+    double OneTwelfth = 1.0/12.0;
+
+    // zero out entire ghost region when not all points will be updated...
+    if(box_ghosts>2){
+    for(k=0;k<dim_k;k++){
+    for(j=0;j<dim_j;j++){
+    for(i=0;i<dim_i;i++){
+      int ijk = (i+ilo) + (j+jlo)*jStride + (k+klo)*kStride;
+      xn[ijk] = 0.0;
+    }}}}
+
+    // apply the appropriate BC subtype (face, edge, corner)...
+    if(faces[subtype]){
+      //
+      //    :....:....|....:....:....:....:.
+      //    : ?? : ?? | x1 : x2 : x3 : x4 :
+      //    :....:....|....:....:....:....:.
+      //
+      int r=-1,rStride=-1,dim_r=-1,rlo=-1;
+      int s=-1,sStride=-1,dim_s=-1,slo=-1;
+      int t=-1,tStride=-1,dt=-1;
+    
+      // the two 4-point stencils can point in 6 different directions...
+      switch(subtype){
+        case  4:rlo=ilo;dim_r=dim_i;rStride=      1;slo=jlo;dim_s=dim_j;sStride=jStride;t=     -1;tStride=kStride;dt= tStride;break; // ij face, low k
+        case 10:rlo=ilo;dim_r=dim_i;rStride=      1;slo=klo;dim_s=dim_k;sStride=kStride;t=     -1;tStride=jStride;dt= tStride;break; // ik face, low j
+        case 12:rlo=jlo;dim_r=dim_j;rStride=jStride;slo=klo;dim_s=dim_k;sStride=kStride;t=     -1;tStride=      1;dt= tStride;break; // jk face, low i
+        case 14:rlo=jlo;dim_r=dim_j;rStride=jStride;slo=klo;dim_s=dim_k;sStride=kStride;t=box_dim;tStride=      1;dt=-tStride;break; // jk face, high i
+        case 16:rlo=ilo;dim_r=dim_i;rStride=      1;slo=klo;dim_s=dim_k;sStride=kStride;t=box_dim;tStride=jStride;dt=-tStride;break; // ik face, high j
+        case 22:rlo=ilo;dim_r=dim_i;rStride=      1;slo=jlo;dim_s=dim_j;sStride=jStride;t=box_dim;tStride=kStride;dt=-tStride;break; // ij face, high k
+      }
+      // FIX... optimize for rStride==1 (unit-stride)
+      // FIX... optimize for dt==+/-1
+      double * __restrict__  ghost0 = (double * __restrict__)(x   ); // convince the compiler that read (box) & write (ghost zone) are disjoint
+      double * __restrict__  ghost1 = (double * __restrict__)(x-dt); // convince the compiler that read (box) & write (ghost zone) are disjoint
+      for(s=0;s<dim_s;s++){
+      for(r=0;r<dim_r;r++){
+        int ijk = (r+rlo)*rStride + (s+slo)*sStride + (t)*tStride;
+        double x1=x[ijk+dt], x2=x[ijk+2*dt], x3=x[ijk+3*dt], x4=x[ijk+4*dt];
+        #if 0 // compiler cannot disambiguate xn[ijk] and xn[ijk-dt]
+        xn[ijk   ] = OneTwelfth*(  -77.0*x1 +  43.0*x2 -  17.0*x3 +  3.0*x4 );
+        xn[ijk-dt] = OneTwelfth*( -505.0*x1 + 335.0*x2 - 145.0*x3 + 27.0*x4 );
+        #else
+        ghost0[ijk] = OneTwelfth*(  -77.0*x1 +  43.0*x2 -  17.0*x3 +  3.0*x4 );
+        ghost1[ijk] = OneTwelfth*( -505.0*x1 + 335.0*x2 - 145.0*x3 + 27.0*x4 );
+        #endif
+      }}
+    }else
+    if(edges[subtype]){
+      //
+      //                        +---+---+---+---+ dt
+      //                       /   /   /   /   /|/
+      //                      +---+---+---+---+ |
+      //                r    /   /   /   /   /|/
+      //                ^   +---+---+---+---+ |
+      //                |  /   /   /   /   /|/
+      //                | +---+---+---+---+ |
+      //                |/   /   /   /   /|/
+      //        +---+---+---+---+---+---+ |
+      //       /   /   /|   |   |   |   |/
+      //      +---+---+ |---+---+---+---+---> ds
+      //     /   /   /|/ 
+      //    +---+---+ |  
+      //    |   |   |/   
+      //    +---+---+    
+      //
+      //              ^ dt
+      //              |
+      //    :....:....|....:....:....:....:.
+      //    : f4 : n4 | 14 : 24 : 34 : 44 :
+      //    :....:....|....:....:....:....:.
+      //    : f3 : n3 | 13 : 23 : 33 : 43 :
+      //    :....:....|....:....:....:....:.
+      //    : f2 : n2 | 12 : 22 : 32 : 42 :
+      //    :....:....|....:....:....:....:.
+      //    : f1 : n1 | 11 : 21 : 31 : 41 :
+      //    ----------+---------------------> ds
+      //    : ?? : ?? |
+      //    :....:....|
+      //    : ?? : ?? |
+      //    :....:....|
+      //
+      int r=-1,rStride=-1,dim_r=-1,rlo=-1;
+      int s=-1,sStride=-1,ds=-1;
+      int t=-1,tStride=-1,dt=-1;
+      // the four 16-point stencils (symmetry allows you to view it as 12 4-point) can point in 12 different directions...
+      switch(subtype){
+        case  1:rlo=ilo;dim_r=dim_i;rStride=      1;s=     -1;sStride=jStride;t=     -1;tStride=kStride;ds= sStride;dt= tStride;break; // i-edge,  low j,  low k
+        case  3:rlo=jlo;dim_r=dim_j;rStride=jStride;s=     -1;sStride=      1;t=     -1;tStride=kStride;ds= sStride;dt= tStride;break; // j-edge,  low i,  low k
+        case  5:rlo=jlo;dim_r=dim_j;rStride=jStride;s=box_dim;sStride=      1;t=     -1;tStride=kStride;ds=-sStride;dt= tStride;break; // j-edge, high i,  low k
+        case  7:rlo=ilo;dim_r=dim_i;rStride=      1;s=box_dim;sStride=jStride;t=     -1;tStride=kStride;ds=-sStride;dt= tStride;break; // i-edge, high j,  low k
+        case  9:rlo=klo;dim_r=dim_k;rStride=kStride;s=     -1;sStride=      1;t=     -1;tStride=jStride;ds= sStride;dt= tStride;break; // k-edge,  low i,  low j
+        case 11:rlo=klo;dim_r=dim_k;rStride=kStride;s=box_dim;sStride=      1;t=     -1;tStride=jStride;ds=-sStride;dt= tStride;break; // k-edge, high i,  low j
+        case 15:rlo=klo;dim_r=dim_k;rStride=kStride;s=     -1;sStride=      1;t=box_dim;tStride=jStride;ds= sStride;dt=-tStride;break; // k-edge,  low i, high j
+        case 17:rlo=klo;dim_r=dim_k;rStride=kStride;s=box_dim;sStride=      1;t=box_dim;tStride=jStride;ds=-sStride;dt=-tStride;break; // k-edge, high i, high j
+        case 19:rlo=ilo;dim_r=dim_i;rStride=      1;s=     -1;sStride=jStride;t=box_dim;tStride=kStride;ds= sStride;dt=-tStride;break; // i-edge,  low j, high k
+        case 21:rlo=jlo;dim_r=dim_j;rStride=jStride;s=     -1;sStride=      1;t=box_dim;tStride=kStride;ds= sStride;dt=-tStride;break; // j-edge,  low i, high k
+        case 23:rlo=jlo;dim_r=dim_j;rStride=jStride;s=box_dim;sStride=      1;t=box_dim;tStride=kStride;ds=-sStride;dt=-tStride;break; // j-edge, high i, high k
+        case 25:rlo=ilo;dim_r=dim_i;rStride=      1;s=box_dim;sStride=jStride;t=box_dim;tStride=kStride;ds=-sStride;dt=-tStride;break; // i-edge, high j, high k
+      }
+      // FIX... optimize for rStride==1 (unit-stride)
+      // FIX... optimize for ds==+/-1
+      double * __restrict__  ghost00 = (double * __restrict__)(x      ); // convince the compiler that read (box) & write (ghost zone) are disjoint
+      double * __restrict__  ghost01 = (double * __restrict__)(x   -dt); // convince the compiler that read (box) & write (ghost zone) are disjoint
+      double * __restrict__  ghost10 = (double * __restrict__)(x-ds   ); // convince the compiler that read (box) & write (ghost zone) are disjoint
+      double * __restrict__  ghost11 = (double * __restrict__)(x-ds-dt); // convince the compiler that read (box) & write (ghost zone) are disjoint
+      for(r=0;r<dim_r;r++){
+        int ijk = (r+rlo)*rStride + (s)*sStride + (t)*tStride;
+        double x11 = x[ijk+  ds+  dt], x21 = x[ijk+2*ds+  dt], x31 = x[ijk+3*ds+  dt], x41 = x[ijk+4*ds+  dt];
+        double x12 = x[ijk+  ds+2*dt], x22 = x[ijk+2*ds+2*dt], x32 = x[ijk+3*ds+2*dt], x42 = x[ijk+4*ds+2*dt];
+        double x13 = x[ijk+  ds+3*dt], x23 = x[ijk+2*ds+3*dt], x33 = x[ijk+3*ds+3*dt], x43 = x[ijk+4*ds+3*dt];
+        double x14 = x[ijk+  ds+4*dt], x24 = x[ijk+2*ds+4*dt], x34 = x[ijk+3*ds+4*dt], x44 = x[ijk+4*ds+4*dt];
+            double n1 = OneTwelfth*(  -77.0*x11 +  43.0*x21 -  17.0*x31 +  3.0*x41 );
+            double n2 = OneTwelfth*(  -77.0*x12 +  43.0*x22 -  17.0*x32 +  3.0*x42 );
+            double n3 = OneTwelfth*(  -77.0*x13 +  43.0*x23 -  17.0*x33 +  3.0*x43 );
+            double n4 = OneTwelfth*(  -77.0*x14 +  43.0*x24 -  17.0*x34 +  3.0*x44 );
+            double f1 = OneTwelfth*( -505.0*x11 + 335.0*x21 - 145.0*x31 + 27.0*x41 );
+            double f2 = OneTwelfth*( -505.0*x12 + 335.0*x22 - 145.0*x32 + 27.0*x42 );
+            double f3 = OneTwelfth*( -505.0*x13 + 335.0*x23 - 145.0*x33 + 27.0*x43 );
+            double f4 = OneTwelfth*( -505.0*x14 + 335.0*x24 - 145.0*x34 + 27.0*x44 );
+        #if 0 // compiler cannot disambiguate pointers
+        xn[ijk      ] = OneTwelfth*(  -77.0*n1  +  43.0*n2  -  17.0*n3  +  3.0*n4  );
+        xn[ijk   -dt] = OneTwelfth*( -505.0*n1  + 335.0*n2  - 145.0*n3  + 27.0*n4  );
+        xn[ijk-ds   ] = OneTwelfth*(  -77.0*f1  +  43.0*f2  -  17.0*f3  +  3.0*f4  );
+        xn[ijk-ds-dt] = OneTwelfth*( -505.0*f1  + 335.0*f2  - 145.0*f3  + 27.0*f4  );
+        #else
+        ghost00[ijk] = OneTwelfth*(  -77.0*n1  +  43.0*n2  -  17.0*n3  +  3.0*n4  );
+        ghost01[ijk] = OneTwelfth*( -505.0*n1  + 335.0*n2  - 145.0*n3  + 27.0*n4  );
+        ghost10[ijk] = OneTwelfth*(  -77.0*f1  +  43.0*f2  -  17.0*f3  +  3.0*f4  );
+        ghost11[ijk] = OneTwelfth*( -505.0*f1  + 335.0*f2  - 145.0*f3  + 27.0*f4  );
+        #endif
+      }
+    }else
+    if(corners[subtype]){
+      //
+      //                        +---+---+---+---+
+      //                       /   /   /   /   /|
+      //                      +---+---+---+---+ |
+      //                     /   /   /   /   /|/|
+      //                    +---+---+---+---+ | |
+      //                   /   /   /   /   /|/|/|
+      //                  +---+---+---+---+ | | |
+      //                 /   /   /   /   /|/|/|/|
+      //                +---+---+---+---+ | | | |
+      //                |   |   |   |   |/|/|/|/
+      //                +---+---+---+---+ | | |
+      //                |   |   |   |   |/|/|/
+      //                +---+---+---+---+ | |
+      //                |   |   |   |   |/|/
+      //                +---+---+---+---+ |
+      //                |   |   |   |   |/
+      //        +---+---+---+---+---+---+
+      //       /   /   /|
+      //      +---+---+ |
+      //     /   /   /|/|
+      //    +---+---+ | |
+      //    |   |   |/|/
+      //    +---+---+ |
+      //    |   |   |/
+      //    +---+---+
+      //
+      int i=-1,di=-1;
+      int j=-1,dj=-1;
+      int k=-1,dk=-1;
+      // the eight 64-point stencils (symmetry allows you to view it as 56 4-point) can point in 8 different directions...
+      // FIX... optimize for di==+/-1
+      switch(subtype){
+        case  0:i=     -1;j=     -1;k=     -1;di= 1;dj= jStride;dk= kStride;break; //  low i,  low j,  low k
+        case  2:i=box_dim;j=     -1;k=     -1;di=-1;dj= jStride;dk= kStride;break; // high i,  low j,  low k
+        case  6:i=     -1;j=box_dim;k=     -1;di= 1;dj=-jStride;dk= kStride;break; //  low i, high j,  low k
+        case  8:i=box_dim;j=box_dim;k=     -1;di=-1;dj=-jStride;dk= kStride;break; // high i, high j,  low k
+        case 18:i=     -1;j=     -1;k=box_dim;di= 1;dj= jStride;dk=-kStride;break; //  low i,  low j, high k
+        case 20:i=box_dim;j=     -1;k=box_dim;di=-1;dj= jStride;dk=-kStride;break; // high i,  low j, high k
+        case 24:i=     -1;j=box_dim;k=box_dim;di= 1;dj=-jStride;dk=-kStride;break; //  low i, high j, high k
+        case 26:i=box_dim;j=box_dim;k=box_dim;di=-1;dj=-jStride;dk=-kStride;break; // high i, high j, high k
+      }
+      int ijk = (i) + (j)*jStride + (k)*kStride;
+      double x144 = x[ijk+  di+4*dj+4*dk];double x244 = x[ijk+2*di+4*dj+4*dk];double x344 = x[ijk+3*di+4*dj+4*dk];double x444 = x[ijk+4*di+4*dj+4*dk];
+      double x134 = x[ijk+  di+3*dj+4*dk];double x234 = x[ijk+2*di+3*dj+4*dk];double x334 = x[ijk+3*di+3*dj+4*dk];double x434 = x[ijk+4*di+3*dj+4*dk];
+      double x124 = x[ijk+  di+2*dj+4*dk];double x224 = x[ijk+2*di+2*dj+4*dk];double x324 = x[ijk+3*di+2*dj+4*dk];double x424 = x[ijk+4*di+2*dj+4*dk];
+      double x114 = x[ijk+  di+  dj+4*dk];double x214 = x[ijk+2*di+  dj+4*dk];double x314 = x[ijk+3*di+  dj+4*dk];double x414 = x[ijk+4*di+  dj+4*dk];
+
+      double x143 = x[ijk+  di+4*dj+3*dk];double x243 = x[ijk+2*di+4*dj+3*dk];double x343 = x[ijk+3*di+4*dj+3*dk];double x443 = x[ijk+4*di+4*dj+3*dk];
+      double x133 = x[ijk+  di+3*dj+3*dk];double x233 = x[ijk+2*di+3*dj+3*dk];double x333 = x[ijk+3*di+3*dj+3*dk];double x433 = x[ijk+4*di+3*dj+3*dk];
+      double x123 = x[ijk+  di+2*dj+3*dk];double x223 = x[ijk+2*di+2*dj+3*dk];double x323 = x[ijk+3*di+2*dj+3*dk];double x423 = x[ijk+4*di+2*dj+3*dk];
+      double x113 = x[ijk+  di+  dj+3*dk];double x213 = x[ijk+2*di+  dj+3*dk];double x313 = x[ijk+3*di+  dj+3*dk];double x413 = x[ijk+4*di+  dj+3*dk];
+
+      double x142 = x[ijk+  di+4*dj+2*dk];double x242 = x[ijk+2*di+4*dj+2*dk];double x342 = x[ijk+3*di+4*dj+2*dk];double x442 = x[ijk+4*di+4*dj+2*dk];
+      double x132 = x[ijk+  di+3*dj+2*dk];double x232 = x[ijk+2*di+3*dj+2*dk];double x332 = x[ijk+3*di+3*dj+2*dk];double x432 = x[ijk+4*di+3*dj+2*dk];
+      double x122 = x[ijk+  di+2*dj+2*dk];double x222 = x[ijk+2*di+2*dj+2*dk];double x322 = x[ijk+3*di+2*dj+2*dk];double x422 = x[ijk+4*di+2*dj+2*dk];
+      double x112 = x[ijk+  di+  dj+2*dk];double x212 = x[ijk+2*di+  dj+2*dk];double x312 = x[ijk+3*di+  dj+2*dk];double x412 = x[ijk+4*di+  dj+2*dk];
+
+      double x141 = x[ijk+  di+4*dj+  dk];double x241 = x[ijk+2*di+4*dj+  dk];double x341 = x[ijk+3*di+4*dj+  dk];double x441 = x[ijk+4*di+4*dj+  dk];
+      double x131 = x[ijk+  di+3*dj+  dk];double x231 = x[ijk+2*di+3*dj+  dk];double x331 = x[ijk+3*di+3*dj+  dk];double x431 = x[ijk+4*di+3*dj+  dk];
+      double x121 = x[ijk+  di+2*dj+  dk];double x221 = x[ijk+2*di+2*dj+  dk];double x321 = x[ijk+3*di+2*dj+  dk];double x421 = x[ijk+4*di+2*dj+  dk];
+      double x111 = x[ijk+  di+  dj+  dk];double x211 = x[ijk+2*di+  dj+  dk];double x311 = x[ijk+3*di+  dj+  dk];double x411 = x[ijk+4*di+  dj+  dk];
+
+      // 32 stencils in i...
+      double n11 = OneTwelfth*(  -77.0*x111 +  43.0*x211 -  17.0*x311 +  3.0*x411 );
+      double n21 = OneTwelfth*(  -77.0*x121 +  43.0*x221 -  17.0*x321 +  3.0*x421 );
+      double n31 = OneTwelfth*(  -77.0*x131 +  43.0*x231 -  17.0*x331 +  3.0*x431 );
+      double n41 = OneTwelfth*(  -77.0*x141 +  43.0*x241 -  17.0*x341 +  3.0*x441 );
+      double n12 = OneTwelfth*(  -77.0*x112 +  43.0*x212 -  17.0*x312 +  3.0*x412 );
+      double n22 = OneTwelfth*(  -77.0*x122 +  43.0*x222 -  17.0*x322 +  3.0*x422 );
+      double n32 = OneTwelfth*(  -77.0*x132 +  43.0*x232 -  17.0*x332 +  3.0*x432 );
+      double n42 = OneTwelfth*(  -77.0*x142 +  43.0*x242 -  17.0*x342 +  3.0*x442 );
+      double n13 = OneTwelfth*(  -77.0*x113 +  43.0*x213 -  17.0*x313 +  3.0*x413 );
+      double n23 = OneTwelfth*(  -77.0*x123 +  43.0*x223 -  17.0*x323 +  3.0*x423 );
+      double n33 = OneTwelfth*(  -77.0*x133 +  43.0*x233 -  17.0*x333 +  3.0*x433 );
+      double n43 = OneTwelfth*(  -77.0*x143 +  43.0*x243 -  17.0*x343 +  3.0*x443 );
+      double n14 = OneTwelfth*(  -77.0*x114 +  43.0*x214 -  17.0*x314 +  3.0*x414 );
+      double n24 = OneTwelfth*(  -77.0*x124 +  43.0*x224 -  17.0*x324 +  3.0*x424 );
+      double n34 = OneTwelfth*(  -77.0*x134 +  43.0*x234 -  17.0*x334 +  3.0*x434 );
+      double n44 = OneTwelfth*(  -77.0*x144 +  43.0*x244 -  17.0*x344 +  3.0*x444 );
+
+      double f11 = OneTwelfth*( -505.0*x111 + 335.0*x211 - 145.0*x311 +  27.0*x411 );
+      double f21 = OneTwelfth*( -505.0*x121 + 335.0*x221 - 145.0*x321 +  27.0*x421 );
+      double f31 = OneTwelfth*( -505.0*x131 + 335.0*x231 - 145.0*x331 +  27.0*x431 );
+      double f41 = OneTwelfth*( -505.0*x141 + 335.0*x241 - 145.0*x341 +  27.0*x441 );
+      double f12 = OneTwelfth*( -505.0*x112 + 335.0*x212 - 145.0*x312 +  27.0*x412 );
+      double f22 = OneTwelfth*( -505.0*x122 + 335.0*x222 - 145.0*x322 +  27.0*x422 );
+      double f32 = OneTwelfth*( -505.0*x132 + 335.0*x232 - 145.0*x332 +  27.0*x432 );
+      double f42 = OneTwelfth*( -505.0*x142 + 335.0*x242 - 145.0*x342 +  27.0*x442 );
+      double f13 = OneTwelfth*( -505.0*x113 + 335.0*x213 - 145.0*x313 +  27.0*x413 );
+      double f23 = OneTwelfth*( -505.0*x123 + 335.0*x223 - 145.0*x323 +  27.0*x423 );
+      double f33 = OneTwelfth*( -505.0*x133 + 335.0*x233 - 145.0*x333 +  27.0*x433 );
+      double f43 = OneTwelfth*( -505.0*x143 + 335.0*x243 - 145.0*x343 +  27.0*x443 );
+      double f14 = OneTwelfth*( -505.0*x114 + 335.0*x214 - 145.0*x314 +  27.0*x414 );
+      double f24 = OneTwelfth*( -505.0*x124 + 335.0*x224 - 145.0*x324 +  27.0*x424 );
+      double f34 = OneTwelfth*( -505.0*x134 + 335.0*x234 - 145.0*x334 +  27.0*x434 );
+      double f44 = OneTwelfth*( -505.0*x144 + 335.0*x244 - 145.0*x344 +  27.0*x444 );
+
+      // 16 stencils in j...
+      double nn1 = OneTwelfth*(  -77.0*n11 +  43.0*n21 -  17.0*n31 +  3.0*n41 );
+      double nn2 = OneTwelfth*(  -77.0*n12 +  43.0*n22 -  17.0*n32 +  3.0*n42 );
+      double nn3 = OneTwelfth*(  -77.0*n13 +  43.0*n23 -  17.0*n33 +  3.0*n43 );
+      double nn4 = OneTwelfth*(  -77.0*n14 +  43.0*n24 -  17.0*n34 +  3.0*n44 );
+      double nf1 = OneTwelfth*( -505.0*n11 + 335.0*n21 - 145.0*n31 + 27.0*n41 );
+      double nf2 = OneTwelfth*( -505.0*n12 + 335.0*n22 - 145.0*n32 + 27.0*n42 );
+      double nf3 = OneTwelfth*( -505.0*n13 + 335.0*n23 - 145.0*n33 + 27.0*n43 );
+      double nf4 = OneTwelfth*( -505.0*n14 + 335.0*n24 - 145.0*n34 + 27.0*n44 );
+
+      double fn1 = OneTwelfth*(  -77.0*f11 +  43.0*f21 -  17.0*f31 +  3.0*f41 );
+      double fn2 = OneTwelfth*(  -77.0*f12 +  43.0*f22 -  17.0*f32 +  3.0*f42 );
+      double fn3 = OneTwelfth*(  -77.0*f13 +  43.0*f23 -  17.0*f33 +  3.0*f43 );
+      double fn4 = OneTwelfth*(  -77.0*f14 +  43.0*f24 -  17.0*f34 +  3.0*f44 );
+      double ff1 = OneTwelfth*( -505.0*f11 + 335.0*f21 - 145.0*f31 + 27.0*f41 );
+      double ff2 = OneTwelfth*( -505.0*f12 + 335.0*f22 - 145.0*f32 + 27.0*f42 );
+      double ff3 = OneTwelfth*( -505.0*f13 + 335.0*f23 - 145.0*f33 + 27.0*f43 );
+      double ff4 = OneTwelfth*( -505.0*f14 + 335.0*f24 - 145.0*f34 + 27.0*f44 );
+
+      //  8 stencils in k...
+      double nnn = OneTwelfth*(  -77.0*nn1 +  43.0*nn2 -  17.0*nn3 +  3.0*nn4 );
+      double nnf = OneTwelfth*( -505.0*nn1 + 335.0*nn2 - 145.0*nn3 + 27.0*nn4 );
+      double nfn = OneTwelfth*(  -77.0*nf1 +  43.0*nf2 -  17.0*nf3 +  3.0*nf4 );
+      double nff = OneTwelfth*( -505.0*nf1 + 335.0*nf2 - 145.0*nf3 + 27.0*nf4 );
+      double fnn = OneTwelfth*(  -77.0*fn1 +  43.0*fn2 -  17.0*fn3 +  3.0*fn4 );
+      double fnf = OneTwelfth*( -505.0*fn1 + 335.0*fn2 - 145.0*fn3 + 27.0*fn4 );
+      double ffn = OneTwelfth*(  -77.0*ff1 +  43.0*ff2 -  17.0*ff3 +  3.0*ff4 );
+      double fff = OneTwelfth*( -505.0*ff1 + 335.0*ff2 - 145.0*ff3 + 27.0*ff4 );
+
+      // commit to the 8 ghost zones in this corner...
+      xn[ijk         ] = nnn;
+      xn[ijk      -dk] = nnf;
+      xn[ijk   -dj   ] = nfn;
+      xn[ijk   -dj-dk] = nff;
+      xn[ijk-di      ] = fnn;
+      xn[ijk-di   -dk] = fnf;
+      xn[ijk-di-dj   ] = ffn;
+      xn[ijk-di-dj-dk] = fff;
+    }
+  }
+  level->timers.boundary_conditions += (double)(getTime()-_timeStart);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// Overwrite ghost-zone entries of VECTOR_BETA_I/J/K for every block in level->boundary_condition.blocks[0] by polynomial extrapolation from neighboring entries.
+// Does nothing when the level's boundary condition type is BC_PERIODIC; elapsed time is added to level->timers.boundary_conditions.
+// Order is chosen from level->box_dim: >=5 uses five samples (quartic), ==4 uses four (cubic), 2..3 uses two (linear), <2 writes nothing.
+void extrapolate_betas(level_type * level){
+  if(level->boundary_condition.type == BC_PERIODIC)return; // no BC's to apply !
+  int shape=0; // only the block list for shape 0 is traversed
+  int buffer;
+  double _timeStart = getTime();
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->boundary_condition.num_blocks[shape])
+  for(buffer=0;buffer<level->boundary_condition.num_blocks[shape];buffer++){
+    int i,j,k;
+    const int       box = level->boundary_condition.blocks[shape][buffer].read.box; 
+    const int     dim_i = level->boundary_condition.blocks[shape][buffer].dim.i;
+    const int     dim_j = level->boundary_condition.blocks[shape][buffer].dim.j;
+    const int     dim_k = level->boundary_condition.blocks[shape][buffer].dim.k;
+    const int       ilo = level->boundary_condition.blocks[shape][buffer].read.i;
+    const int       jlo = level->boundary_condition.blocks[shape][buffer].read.j;
+    const int       klo = level->boundary_condition.blocks[shape][buffer].read.k;
+
+    // total hack/reuse of the existing boundary list...
+    //   however, whereas boundary subtype represents the normal to the domain at that point, 
+    //   one needs the box-relative (not domain-relative) normal when extending the face averaged beta's into the ghost zones
+    //   Thus, I reuse the list to tell me which areas are beyond the domain boundary, but must calculate their normals here
+          int   subtype = 13; // 13 == 1 + 3 + 9, i.e. the center of a 3x3x3 neighborhood numbered i + 3*j + 9*k
+    if(ilo <               0)subtype-=1; // block starts below the box in i
+    if(jlo <               0)subtype-=3; // block starts below the box in j
+    if(klo <               0)subtype-=9; // block starts below the box in k
+    if(ilo >= level->box_dim)subtype+=1; // block starts above the box in i
+    if(jlo >= level->box_dim)subtype+=3; // block starts above the box in j
+    if(klo >= level->box_dim)subtype+=9; // block starts above the box in k
+    const int    normal = 26-subtype; // invert the normal vector
+    // hard code for box to box BC's (pointers are advanced past the ghost zones so index 0 is the first non-ghost cell)
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    double * __restrict__  beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + level->box_ghosts*(1+jStride+kStride);
+    double * __restrict__  beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + level->box_ghosts*(1+jStride+kStride);
+    double * __restrict__  beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + level->box_ghosts*(1+jStride+kStride);
+
+    // convert normal vector into pointer offsets... (each of di,dj,dk is -1, 0, or +1 and, being inverted, points from the ghost region back toward the box)
+    const int di = (((normal % 3)  )-1);
+    const int dj = (((normal % 9)/3)-1);
+    const int dk = (((normal / 9)  )-1);
+
+    // beta_i should be extrapolated in the j- and k-directions, but not i
+    // beta_j should be extrapolated in the i- and k-directions, but not j
+    // beta_k should be extrapolated in the i- and j-directions, but not k
+    // e.g.
+    //                  .................................
+    //                 .       .       .       .       .
+    //                .       .  ???  .  ???  .       .
+    //               .       .       .       .       .
+    //              ........+-------+-------+........
+    //             .       /       /       /       .
+    //            .  ???  /<betaK>/<betaK>/  ???  .
+    //           .       /       /       /       .
+    //          ........+-------+-------+........
+    //         .       /       /       /       .
+    //        .  ???  /<betaK>/<betaK>/  ???  .
+    //       .       /       /       /       .
+    //      ........+-------+-------+........   k   j
+    //     .       .       .       .       .    ^  ^   
+    //    .       .  ???  .  ???  .       .     | /
+    //   .       .       .       .       .      |/
+    //  .................................       +-----> i
+    //
+    const int biStride =      dj*jStride + dk*kStride; // step between beta_i samples: the i-component of the normal is dropped
+    const int bjStride = di              + dk*kStride; // step between beta_j samples: the j-component of the normal is dropped
+    const int bkStride = di + dj*jStride             ; // step between beta_k samples: the k-component of the normal is dropped
+
+    // note, 
+    //   the face values normal to i should have been filled via RESTRICT_I (skip them)
+    //   the face values normal to j should have been filled via RESTRICT_J (skip them)
+    //   the face values normal to k should have been filled via RESTRICT_K (skip them)
+    if(level->box_dim>=5){
+      // quartic extrapolation... weights 5,-10,10,-5,1 on the next five samples; subtypes 12/14, 10/16, 4/22 are the pure i-, j-, k-faces (stride would be 0) and are skipped
+      for(k=0;k<dim_k;k++){
+      for(j=0;j<dim_j;j++){
+      for(i=0;i<dim_i;i++){
+        int ijk = (i+ilo) + (j+jlo)*jStride + (k+klo)*kStride;
+        if( (subtype!=14) && (subtype!=12) ){beta_i[ijk] = 5.0*beta_i[ijk+biStride] - 10.0*beta_i[ijk+2*biStride] + 10.0*beta_i[ijk+3*biStride] - 5.0*beta_i[ijk+4*biStride] + beta_i[ijk+5*biStride];}
+        if( (subtype!=16) && (subtype!=10) ){beta_j[ijk] = 5.0*beta_j[ijk+bjStride] - 10.0*beta_j[ijk+2*bjStride] + 10.0*beta_j[ijk+3*bjStride] - 5.0*beta_j[ijk+4*bjStride] + beta_j[ijk+5*bjStride];}
+        if( (subtype!=22) && (subtype!= 4) ){beta_k[ijk] = 5.0*beta_k[ijk+bkStride] - 10.0*beta_k[ijk+2*bkStride] + 10.0*beta_k[ijk+3*bkStride] - 5.0*beta_k[ijk+4*bkStride] + beta_k[ijk+5*bkStride];}
+      }}}
+    }else 
+    if(level->box_dim>=4){
+      // cubic extrapolation... weights 4,-6,4,-1 on the next four samples; same face subtypes are skipped
+      for(k=0;k<dim_k;k++){
+      for(j=0;j<dim_j;j++){
+      for(i=0;i<dim_i;i++){
+        int ijk = (i+ilo) + (j+jlo)*jStride + (k+klo)*kStride;
+        if( (subtype!=14) && (subtype!=12) ){beta_i[ijk] = 4.0*beta_i[ijk+biStride] - 6.0*beta_i[ijk+2*biStride] + 4.0*beta_i[ijk+3*biStride] - beta_i[ijk+4*biStride];}
+        if( (subtype!=16) && (subtype!=10) ){beta_j[ijk] = 4.0*beta_j[ijk+bjStride] - 6.0*beta_j[ijk+2*bjStride] + 4.0*beta_j[ijk+3*bjStride] - beta_j[ijk+4*bjStride];}
+        if( (subtype!=22) && (subtype!= 4) ){beta_k[ijk] = 4.0*beta_k[ijk+bkStride] - 6.0*beta_k[ijk+2*bkStride] + 4.0*beta_k[ijk+3*bkStride] - beta_k[ijk+4*bkStride];}
+      }}}
+    }else 
+    if(level->box_dim>=2){
+      // linear extrapolation... weights 2,-1 on the next two samples; same face subtypes are skipped
+      for(k=0;k<dim_k;k++){
+      for(j=0;j<dim_j;j++){
+      for(i=0;i<dim_i;i++){
+        int ijk = (i+ilo) + (j+jlo)*jStride + (k+klo)*kStride;
+        if( (subtype!=14) && (subtype!=12) ){beta_i[ijk] = 2.0*beta_i[ijk+biStride] - beta_i[ijk+2*biStride];}
+        if( (subtype!=16) && (subtype!=10) ){beta_j[ijk] = 2.0*beta_j[ijk+bjStride] - beta_j[ijk+2*bjStride];}
+        if( (subtype!=22) && (subtype!= 4) ){beta_k[ijk] = 2.0*beta_k[ijk+bkStride] - beta_k[ijk+2*bkStride];}
+      }}}
+    }
+  }
+  level->timers.boundary_conditions += (double)(getTime()-_timeStart);
+}
+
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators/chebyshev.c b/Util/hpgmg/finite-volume/source/operators/chebyshev.c
new file mode 100644
index 00000000..311ebf40
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/chebyshev.c
@@ -0,0 +1,99 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+// Based on Yousef Saad's Iterative Methods for Sparse Linear Systems, Algorithm 12.1, page 399
+//------------------------------------------------------------------------------------------------------------------------------
+// Chebyshev polynomial smoother: runs CHEBYSHEV_DEGREE*NUM_SMOOTHS sweeps, ping-ponging between x_id and VECTOR_TEMP; time spent in the sweeps is added to level->timers.smooth.
+//   level  - level to smooth; level->dominant_eigenvalue_of_DinvA is read to build the coefficients (rank 0 prints a warning when it is <= 0.0, execution continues)
+//   x_id   - vector holding the iterate; because the sweep count is required to be even, the last sweep writes back into x_id
+//   rhs_id - vector holding the right-hand side
+//   a,b    - not referenced by name in this function; presumably consumed by the apply_op_ijk()/Dinv_ijk() macros - confirm against the operator definition
+void smooth(level_type * level, int x_id, int rhs_id, double a, double b){
+  if((CHEBYSHEV_DEGREE*NUM_SMOOTHS)&1){
+    fprintf(stderr,"error... CHEBYSHEV_DEGREE*NUM_SMOOTHS must be even for the chebyshev smoother...\n");
+    exit(1); // configuration error: terminate with a non-zero status so launchers/scripts do not mistake this for success
+  }
+  if( (level->dominant_eigenvalue_of_DinvA<=0.0) && (level->my_rank==0) )fprintf(stderr,"dominant_eigenvalue_of_DinvA <= 0.0 !\n");
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  int s;
+  int block;
+
+  // compute the Chebyshev coefficients...
+  double beta     = 1.000*level->dominant_eigenvalue_of_DinvA;
+//double alpha    = 0.300000*beta;
+//double alpha    = 0.250000*beta;
+//double alpha    = 0.166666*beta;
+  double alpha    = 0.125000*beta; // scalar used only for the coefficients below; the per-box pointer named alpha inside the block loop shadows it
+  double theta    = 0.5*(beta+alpha);		// center of the spectral ellipse
+  double delta    = 0.5*(beta-alpha);		// major axis?
+  double sigma = theta/delta;
+  double rho_n = 1/sigma;			// rho_0
+  double chebyshev_c1[CHEBYSHEV_DEGREE];	// + c1*(x_n-x_nm1) == rho_n*rho_nm1
+  double chebyshev_c2[CHEBYSHEV_DEGREE];	// + c2*(b-Ax_n)
+  chebyshev_c1[0] = 0.0; // the x_nm1 term is multiplied by zero on the first sweep of every degree cycle
+  chebyshev_c2[0] = 1/theta;
+  for(s=1;s<CHEBYSHEV_DEGREE;s++){
+    double rho_nm1 = rho_n;
+    rho_n = 1.0/(2.0*sigma - rho_nm1);
+    chebyshev_c1[s] = rho_n*rho_nm1;
+    chebyshev_c2[s] = rho_n*2.0/delta;
+  }
+
+  for(s=0;s<CHEBYSHEV_DEGREE*NUM_SMOOTHS;s++){
+    // get ghost zone data... Chebyshev ping pongs between x_id and VECTOR_TEMP
+    if((s&1)==0){exchange_boundary(level,       x_id,stencil_get_shape());apply_BCs(level,       x_id,stencil_get_shape());}
+            else{exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());apply_BCs(level,VECTOR_TEMP,stencil_get_shape());}
+
+    // apply the smoother... Chebyshev ping pongs between x_id and VECTOR_TEMP
+    double _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks)
+    for(block=0;block<level->num_my_blocks;block++){
+      const int box = level->my_blocks[block].read.box;
+      const int ilo = level->my_blocks[block].read.i;
+      const int jlo = level->my_blocks[block].read.j;
+      const int klo = level->my_blocks[block].read.k;
+      const int ihi = level->my_blocks[block].dim.i + ilo; // exclusive upper bounds of the block
+      const int jhi = level->my_blocks[block].dim.j + jlo;
+      const int khi = level->my_blocks[block].dim.k + klo;
+      int i,j,k;
+      const int ghosts = level->box_ghosts;
+      const int jStride = level->my_boxes[box].jStride;
+      const int kStride = level->my_boxes[box].kStride;
+      const double h2inv = 1.0/(level->h*level->h);
+      const double * __restrict__ rhs      = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ alpha    = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_i   = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_j   = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_k   = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ Dinv     = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride);
+      // NOTE(review): x_nm1 and x_np1 below are two __restrict__ pointers to the same array (each element is read, then overwritten) - confirm this aliasing is intended
+            double * __restrict__ x_np1;
+      const double * __restrict__ x_n;
+      const double * __restrict__ x_nm1;
+                       if((s&1)==0){x_n    = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride);
+                                    x_nm1  = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride); 
+                                    x_np1  = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride);}
+                               else{x_n    = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride);
+                                    x_nm1  = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride); 
+                                    x_np1  = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride);}
+      const double c1 = chebyshev_c1[s%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE.
+      const double c2 = chebyshev_c2[s%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE.
+
+      for(k=klo;k<khi;k++){
+      for(j=jlo;j<jhi;j++){
+      for(i=ilo;i<ihi;i++){
+        const int ijk = i + j*jStride + k*kStride;
+        // According to Saad... but his was missing a Dinv[ijk] == D^{-1} !!!
+        //  x_{n+1} = x_{n} + rho_{n} [ rho_{n-1}(x_{n} - x_{n-1}) + (2/delta)(b-Ax_{n}) ]
+        //  x_temp[ijk] = x_n[ijk] + c1*(x_n[ijk]-x_temp[ijk]) + c2*Dinv[ijk]*(rhs[ijk]-Ax_n);
+        const double Ax_n   = apply_op_ijk(x_n); // macro defined elsewhere; presumably reads the locals declared above (alpha, beta_*, h2inv, strides, ijk) - confirm
+        const double lambda =     Dinv_ijk();    // macro defined elsewhere; presumably the D^{-1} entry at ijk - confirm
+        x_np1[ijk] = x_n[ijk] + c1*(x_n[ijk]-x_nm1[ijk]) + c2*lambda*(rhs[ijk]-Ax_n);
+      }}}
+    } // box-loop
+    level->timers.smooth += (double)(getTime()-_timeStart);
+  } // s-loop
+}
diff --git a/Util/hpgmg/finite-volume/source/operators/exchange_boundary.c b/Util/hpgmg/finite-volume/source/operators/exchange_boundary.c
new file mode 100644
index 00000000..d2884739
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/exchange_boundary.c
@@ -0,0 +1,117 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+// perform a (intra-level) ghost zone exchange on vector id
+//  NOTE exchange_boundary() only exchanges the boundary.  
+//  It will not enforce any boundary conditions
+//  BC's are either the responsibility of a separate function or should be fused into the stencil
+// The argument shape indicates which of faces, edges, and corners on each box must be exchanged
+//  If the specified shape exceeds the range of defined shapes, the code will default to STENCIL_SHAPE_BOX (i.e. exchange faces, edges, and corners)
+//   level - level whose level->exchange_ghosts[shape] lists (buffers, ranks, sizes, requests, copy blocks) drive the exchange
+//   id    - vector id passed to every CopyBlock() call
+//   shape - index into level->exchange_ghosts[]; values >= STENCIL_MAX_SHAPES are replaced by STENCIL_SHAPE_BOX (negative values are not checked)
+//   Elapsed times are accumulated into level->timers.ghostZone_recv/pack/send/local/wait/unpack and ghostZone_total.
+void exchange_boundary(level_type * level, int id, int shape){
+  double _timeCommunicationStart = getTime(); // start of the whole exchange, used for ghostZone_total
+  double _timeStart,_timeEnd;                 // reused to time each individual phase
+
+  if(shape>=STENCIL_MAX_SHAPES)shape=STENCIL_SHAPE_BOX;  // shape must be < STENCIL_MAX_SHAPES in order to safely index into exchange_ghosts[]
+  int my_tag = (level->tag<<4) | shape; // message tag: level tag shifted left by 4 bits OR'd with shape (assumes shape fits in 4 bits - TODO confirm)
+  int buffer=0;
+  int n; // only used by the MPI loops below
+
+  #ifdef USE_MPI
+  int nMessages = level->exchange_ghosts[shape].num_recvs + level->exchange_ghosts[shape].num_sends;
+  MPI_Request *recv_requests = level->exchange_ghosts[shape].requests; // receive requests occupy the first num_recvs slots
+  MPI_Request *send_requests = level->exchange_ghosts[shape].requests + level->exchange_ghosts[shape].num_recvs; // send requests follow in the same array
+
+  // loop through packed list of MPI receives and prepost Irecv's...
+  if(level->exchange_ghosts[shape].num_recvs>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level->exchange_ghosts[shape].num_recvs;n++){
+      MPI_Irecv(level->exchange_ghosts[shape].recv_buffers[n],
+                level->exchange_ghosts[shape].recv_sizes[n],
+                MPI_DOUBLE,
+                level->exchange_ghosts[shape].recv_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &recv_requests[n]
+      );
+    }
+    _timeEnd = getTime();
+    level->timers.ghostZone_recv += (_timeEnd-_timeStart);
+  }
+
+  // pack MPI send buffers... (block list 0)
+  if(level->exchange_ghosts[shape].num_blocks[0]){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->exchange_ghosts[shape].num_blocks[0])
+    for(buffer=0;buffer<level->exchange_ghosts[shape].num_blocks[0];buffer++){
+      CopyBlock(level,id,&level->exchange_ghosts[shape].blocks[0][buffer]);
+    }
+    _timeEnd = getTime();
+    level->timers.ghostZone_pack += (_timeEnd-_timeStart);
+  }
+
+ 
+  // loop through MPI send buffers and post Isend's...
+  if(level->exchange_ghosts[shape].num_sends>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level->exchange_ghosts[shape].num_sends;n++){
+      MPI_Isend(level->exchange_ghosts[shape].send_buffers[n],
+                level->exchange_ghosts[shape].send_sizes[n],
+                MPI_DOUBLE,
+                level->exchange_ghosts[shape].send_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &send_requests[n]
+      ); 
+    }
+    _timeEnd = getTime();
+    level->timers.ghostZone_send += (_timeEnd-_timeStart);
+  }
+  #endif // USE_MPI (receive/pack/send phases)
+
+  // exchange locally... try and hide within Isend latency... (block list 1; this phase is compiled with or without USE_MPI)
+  if(level->exchange_ghosts[shape].num_blocks[1]){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->exchange_ghosts[shape].num_blocks[1])
+    for(buffer=0;buffer<level->exchange_ghosts[shape].num_blocks[1];buffer++){
+      CopyBlock(level,id,&level->exchange_ghosts[shape].blocks[1][buffer]);
+    }
+    _timeEnd = getTime();
+    level->timers.ghostZone_local += (_timeEnd-_timeStart);
+  }
+
+  // wait for MPI to finish... (one MPI_Waitall over both the receive and the send requests)
+  #ifdef USE_MPI 
+  if(nMessages){
+    _timeStart = getTime();
+    MPI_Waitall(nMessages,level->exchange_ghosts[shape].requests,level->exchange_ghosts[shape].status);
+    _timeEnd = getTime();
+    level->timers.ghostZone_wait += (_timeEnd-_timeStart);
+  }
+
+  // unpack MPI receive buffers (block list 2)
+  if(level->exchange_ghosts[shape].num_blocks[2]){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level,buffer,level->exchange_ghosts[shape].num_blocks[2])
+    for(buffer=0;buffer<level->exchange_ghosts[shape].num_blocks[2];buffer++){
+      CopyBlock(level,id,&level->exchange_ghosts[shape].blocks[2][buffer]);
+    }
+    _timeEnd = getTime();
+    level->timers.ghostZone_unpack += (_timeEnd-_timeStart);
+  }
+  #endif // USE_MPI (wait/unpack phases)
+
+ 
+  level->timers.ghostZone_total += (double)(getTime()-_timeCommunicationStart);
+}
diff --git a/Util/hpgmg/finite-volume/source/operators/gsrb.c b/Util/hpgmg/finite-volume/source/operators/gsrb.c
new file mode 100644
index 00000000..aad48371
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/gsrb.c
@@ -0,0 +1,136 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#if   defined(GSRB_FP) // variant selection: the first defined macro of GSRB_FP, GSRB_STRIDE2, GSRB_BRANCH wins
+  #warning Overriding default GSRB implementation and using pre-computed 1.0/0.0 FP array for Red-Black to facilitate vectorization...
+#elif defined(GSRB_STRIDE2)
+  #if defined(GSRB_OOP) // GSRB_OOP additionally selects the out-of-place form
+  #warning Overriding default GSRB implementation and using out-of-place and stride-2 accesses to minimize the number of flops
+  #else // in-place stride-2
+  #warning Overriding default GSRB implementation and using stride-2 accesses to minimize the number of flops
+  #endif // GSRB_OOP
+#elif defined(GSRB_BRANCH)
+  #if defined(GSRB_OOP) // GSRB_OOP additionally selects the out-of-place form
+  #warning Overriding default GSRB implementation and using out-of-place implementation with an if-then-else on loop indices...
+  #else // in-place branch
+  #warning Overriding default GSRB implementation and using if-then-else on loop indices...
+  #endif // GSRB_OOP
+#else // none of the three variants was requested on the command line
+#define GSRB_STRIDE2 // default implementation
+#endif // GSRB variant selection
+//------------------------------------------------------------------------------------------------------------------------------
+// Gauss-Seidel red-black smoother: runs 2*NUM_SMOOTHS sweeps; every sweep first refreshes ghost zones and boundary conditions, then updates the cells of one color.
+//   level  - level whose blocks (level->my_blocks) are swept; time spent in the sweeps is added to level->timers.smooth
+//   x_id   - vector being smoothed (with GSRB_OOP the sweeps alternate between reading x_id/writing VECTOR_TEMP and the reverse)
+//   rhs_id - vector holding the right-hand side
+//   a,b    - not referenced by name in this function; presumably consumed by the apply_op_ijk()/Dinv_ijk() macros - confirm against the operator definition
+void smooth(level_type * level, int x_id, int rhs_id, double a, double b){
+  int block,s;
+  for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps per GSRB smooth
+
+    // exchange the ghost zone...
+    #ifdef GSRB_OOP // out-of-place GSRB ping pongs between x and VECTOR_TEMP
+    if((s&1)==0){exchange_boundary(level,       x_id,stencil_get_shape());apply_BCs(level,       x_id,stencil_get_shape());}
+            else{exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());apply_BCs(level,VECTOR_TEMP,stencil_get_shape());}
+    #else // in-place GSRB only operates on x
+                 exchange_boundary(level,       x_id,stencil_get_shape());apply_BCs(level,        x_id,stencil_get_shape());
+    #endif
+
+    // apply the smoother...
+    double _timeStart = getTime();
+
+    // loop over all block/tiles this process owns...
+    PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks)
+    for(block=0;block<level->num_my_blocks;block++){
+      const int box = level->my_blocks[block].read.box;
+      const int ilo = level->my_blocks[block].read.i;
+      const int jlo = level->my_blocks[block].read.j;
+      const int klo = level->my_blocks[block].read.k;
+      const int ihi = level->my_blocks[block].dim.i + ilo; // exclusive upper bounds of the block
+      const int jhi = level->my_blocks[block].dim.j + jlo;
+      const int khi = level->my_blocks[block].dim.k + klo;
+
+      int i,j,k;
+      const double h2inv = 1.0/(level->h*level->h);
+      const int ghosts =  level->box_ghosts;
+      const int jStride = level->my_boxes[box].jStride;
+      const int kStride = level->my_boxes[box].kStride;
+      const int color000 = (level->my_boxes[box].low.i^level->my_boxes[box].low.j^level->my_boxes[box].low.k^s)&1;  // is element 000 red or black on *THIS* sweep
+
+      const double * __restrict__ rhs      = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ alpha    = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_i   = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_j   = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_k   = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ Dinv     = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride);
+      #ifdef GSRB_OOP
+      const double * __restrict__ x_n;
+            double * __restrict__ x_np1;
+                     if((s&1)==0){x_n      = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride);
+                                  x_np1    = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride);}
+                             else{x_n      = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride);
+                                  x_np1    = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride);}
+      #else
+      const double * __restrict__ x_n      = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+            double * __restrict__ x_np1    = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+      #endif
+      // NOTE(review): in the in-place (#else) case x_n and x_np1 are two __restrict__ pointers to the same x_id array - confirm this aliasing is intended
+      #if defined(GSRB_FP)
+      for(k=klo;k<khi;k++){const double * __restrict__ RedBlack = level->RedBlack_FP + ghosts*(1+jStride) + kStride*((k^color000)&0x1); // mask plane chosen by the parity of k^color000
+      for(j=jlo;j<jhi;j++){
+      for(i=ilo;i<ihi;i++){
+            int ij  = i + j*jStride;
+            int ijk = i + j*jStride + k*kStride;
+            double Ax     = apply_op_ijk(x_n);
+            double lambda =     Dinv_ijk();
+            x_np1[ijk] = x_n[ijk] + RedBlack[ij]*1.15*lambda*(rhs[ijk]-Ax); // every cell is written; RedBlack[ij] scales the correction (1.0/0.0 per the GSRB_FP #warning text)
+            //x_np1[ijk] = ((i^j^k^color000)&1) ? x_n[ijk] : x_n[ijk] + lambda*(rhs[ijk]-Ax);
+      }}}
+
+      #elif defined(GSRB_STRIDE2)
+      for(k=klo;k<khi;k++){
+      for(j=jlo;j<jhi;j++){
+        #ifdef GSRB_OOP
+        // out-of-place must copy old value...
+        for(i=ilo;i<ihi;i++){
+          int ijk = i + j*jStride + k*kStride; 
+          x_np1[ijk] = x_n[ijk];
+        }
+        #endif
+        for(i=ilo+((ilo^j^k^color000)&1);i<ihi;i+=2){ // stride-2 GSRB
+          int ijk = i + j*jStride + k*kStride; 
+          double Ax     = apply_op_ijk(x_n);
+          double lambda =     Dinv_ijk();
+          x_np1[ijk] = x_n[ijk] + 1.15*lambda*(rhs[ijk]-Ax); // 1.15 scales the correction (looks like an over-relaxation weight - confirm)
+        }
+      }}
+
+      #elif defined(GSRB_BRANCH)
+      for(k=klo;k<khi;k++){
+      for(j=jlo;j<jhi;j++){
+      for(i=ilo;i<ihi;i++){
+        int ijk = i + j*jStride + k*kStride;
+        if((i^j^k^color000^1)&1){ // looks very clean when [0] is i,j,k=0,0,0 
+          double Ax     = apply_op_ijk(x_n);
+          double lambda =     Dinv_ijk();
+          x_np1[ijk] = x_n[ijk] + 1.15*lambda*(rhs[ijk]-Ax); // same 1.15 weight as the other variants
+        #ifdef GSRB_OOP
+        }else{
+          x_np1[ijk] = x_n[ijk]; // copy old value when sweep color != cell color
+        #endif
+        }
+      }}}
+
+      #else
+      #error no GSRB implementation was specified
+      #endif
+
+    } // boxes
+    level->timers.smooth += (double)(getTime()-_timeStart);
+  } // s-loop
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators/interpolation_p0.c b/Util/hpgmg/finite-volume/source/operators/interpolation_p0.c
new file mode 100644
index 00000000..51f7bfd0
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/interpolation_p0.c
@@ -0,0 +1,159 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+static inline void interpolation_p0_block(level_type *level_f, int id_f, double prescale_f, level_type *level_c, int id_c, blockCopy_type *block){
+  // Piecewise-constant prolongation of one block: each coarse cell of the read side is replicated into the 2x2x2 fine cells of the write side that it covers.
+  // The update is write[] = prescale_f*write[] + read[], so prescale_f==0.0 overwrites the destination and prescale_f==1.0 accumulates into it.
+  // CAREFUL !!!  you must guarantee you zero'd the MPI buffers(write[]) and destination boxes at some point to avoid 0.0*NaN or 0.0*inf
+
+  // the fine (destination) region is twice the coarse block extent in every dimension
+  const int fine_ni = block->dim.i<<1;
+  const int fine_nj = block->dim.j<<1;
+  const int fine_nk = block->dim.k<<1;
+
+  // start from the offsets, strides and pointers recorded in the block itself...
+  const int src_i = block->read.i,  src_j = block->read.j,  src_k = block->read.k;
+  const int dst_i = block->write.i, dst_j = block->write.j, dst_k = block->write.k;
+  int src_jStride = block->read.jStride,  src_kStride = block->read.kStride;
+  int dst_jStride = block->write.jStride, dst_kStride = block->write.kStride;
+  double * __restrict__ src = block->read.ptr;
+  double * __restrict__ dst = block->write.ptr;
+
+  // ...but a side with box>=0 refers to a box of its level: use that box's vector (offset past its ghost zone) and that box's strides instead
+  if(block->read.box>=0){
+    src_jStride = level_c->my_boxes[block->read.box].jStride;
+    src_kStride = level_c->my_boxes[block->read.box].kStride;
+    src = level_c->my_boxes[block->read.box].vectors[id_c] + level_c->my_boxes[block->read.box].ghosts*(1+level_c->my_boxes[block->read.box].jStride+level_c->my_boxes[block->read.box].kStride);
+  }
+  if(block->write.box>=0){
+    dst_jStride = level_f->my_boxes[block->write.box].jStride;
+    dst_kStride = level_f->my_boxes[block->write.box].kStride;
+    dst = level_f->my_boxes[block->write.box].vectors[id_f] + level_f->my_boxes[block->write.box].ghosts*(1+level_f->my_boxes[block->write.box].jStride+level_f->my_boxes[block->write.box].kStride);
+  }
+
+  // fine cell (fi,fj,fk) takes its value from coarse cell (fi>>1,fj>>1,fk>>1); the j/k part of each index is computed once per pencil
+  int fi,fj,fk;
+  for(fk=0;fk<fine_nk;fk++){
+  for(fj=0;fj<fine_nj;fj++){
+    const int dst_row = (((fj   )+dst_j)*dst_jStride) + (((fk   )+dst_k)*dst_kStride);
+    const int src_row = (((fj>>1)+src_j)*src_jStride) + (((fk>>1)+src_k)*src_kStride);
+  for(fi=0;fi<fine_ni;fi++){
+    const int dst_ijk = ((fi   )+dst_i) + dst_row;
+    const int src_ijk = ((fi>>1)+src_i) + src_row;
+    dst[dst_ijk] = prescale_f*dst[dst_ijk] + src[src_ijk];
+  }}}
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// perform a (inter-level) piecewise constant interpolation of vector id_c on the coarse level (level_c) into vector id_f on the fine level (level_f), scaling the existing fine values by prescale_f
+void interpolation_p0(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){
+  double _timeCommunicationStart = getTime(); // start of the interval added to timers.interpolation_total on exit
+  double _timeStart,_timeEnd;                 // bracket each individual phase below
+  int my_tag = (level_f->tag<<4) | 0x6;       // MPI message tag: the fine level's tag shifted up 4 bits, with 0x6 in the low 4 bits
+  int buffer=0;                               // index over blockCopy entries (also handed to PRAGMA_THREAD_ACROSS_BLOCKS)
+  int n;                                      // index over MPI messages
+
+  // Phases: (1) prepost Irecv's, (2) pack send buffers from blocks[0], (3) post Isend's, (4) interpolate blocks[1] locally, (5) Waitall, (6) unpack blocks[2].  Without USE_MPI only (4) is compiled.
+  #ifdef USE_MPI
+  // by convention, level_f allocates a combined array of requests for both level_f recvs and level_c sends...
+  int nMessages = level_c->interpolation.num_sends + level_f->interpolation.num_recvs;             // total requests handed to MPI_Waitall below
+  MPI_Request *recv_requests = level_f->interpolation.requests;                                    // receives occupy the first num_recvs slots
+  MPI_Request *send_requests = level_f->interpolation.requests + level_f->interpolation.num_recvs; // sends follow immediately after the receives
+
+
+  // loop through packed list of MPI receives and prepost Irecv's...
+  if(level_f->interpolation.num_recvs>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level_f->interpolation.num_recvs;n++){
+      MPI_Irecv(level_f->interpolation.recv_buffers[n],
+                level_f->interpolation.recv_sizes[n],
+                MPI_DOUBLE,
+                level_f->interpolation.recv_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &recv_requests[n]
+      );
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_recv += (_timeEnd-_timeStart);
+  }
+
+
+  // pack MPI send buffers... (blocks[0] of the coarse level; the interpolation itself happens here, on the sending side)
+  if(level_c->interpolation.num_blocks[0]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[0])
+    for(buffer=0;buffer<level_c->interpolation.num_blocks[0];buffer++){
+      // !!! prescale==0 because you don't want to increment the MPI buffer
+      interpolation_p0_block(level_f,id_f,0.0,level_c,id_c,&level_c->interpolation.blocks[0][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_pack += (_timeEnd-_timeStart);
+  }
+
+
+  // loop through MPI send buffers and post Isend's...
+  if(level_c->interpolation.num_sends>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level_c->interpolation.num_sends;n++){
+      MPI_Isend(level_c->interpolation.send_buffers[n],
+                level_c->interpolation.send_sizes[n],
+                MPI_DOUBLE,
+                level_c->interpolation.send_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &send_requests[n]
+      );
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_send += (_timeEnd-_timeStart);
+  }
+  #endif
+
+
+  // perform local interpolation... try and hide within Isend latency... (blocks[1] of the coarse level, applied with the caller's prescale_f)
+  if(level_c->interpolation.num_blocks[1]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[1])
+    for(buffer=0;buffer<level_c->interpolation.num_blocks[1];buffer++){
+      interpolation_p0_block(level_f,id_f,prescale_f,level_c,id_c,&level_c->interpolation.blocks[1][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_local += (_timeEnd-_timeStart);
+  }
+
+
+  // wait for MPI to finish... (both the receives and the sends posted above)
+  #ifdef USE_MPI 
+  if(nMessages>0){
+    _timeStart = getTime();
+    MPI_Waitall(nMessages,level_f->interpolation.requests,level_f->interpolation.status);
+    _timeEnd = getTime();
+    level_f->timers.interpolation_wait += (_timeEnd-_timeStart);
+  }
+
+
+  // unpack MPI receive buffers (blocks[2] of the fine level); prescale_f is handed to IncrementBlock, which is defined elsewhere
+  if(level_f->interpolation.num_blocks[2]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->interpolation.num_blocks[2])
+    for(buffer=0;buffer<level_f->interpolation.num_blocks[2];buffer++){
+      IncrementBlock(level_f,id_f,prescale_f,&level_f->interpolation.blocks[2][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_unpack += (_timeEnd-_timeStart);
+  }
+  #endif 
+ 
+ 
+  level_f->timers.interpolation_total += (double)(getTime()-_timeCommunicationStart);
+}
diff --git a/Util/hpgmg/finite-volume/source/operators/interpolation_p1.c b/Util/hpgmg/finite-volume/source/operators/interpolation_p1.c
new file mode 100644
index 00000000..9a05232b
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/interpolation_p1.c
@@ -0,0 +1,179 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+static inline void interpolation_p1_block(level_type *level_f, int id_f, double prescale_f, level_type *level_c, int id_c, blockCopy_type *block){
+  // Piecewise-linear prolongation of one block: each fine cell of the write side blends the coarse cell that contains it with the neighboring coarse cells on the side it sits closest to.
+  // In each dimension the containing cell gets weight 3/4 and the neighbor 1/4, hence the 3D weights 27/64, 9/64, 3/64 and 1/64 used below.
+  // The update is write[] = prescale_f*write[] + interpolant.  NOTE, BC's must have been previously applied, as the stencil reaches one coarse cell outside the block.
+  // CAREFUL !!!  you must guarantee you zero'd the MPI buffers(write[]) and destination boxes at some point to avoid 0.0*NaN or 0.0*inf
+
+  // the fine (destination) region is twice the coarse block extent in every dimension
+  const int fine_ni = block->dim.i<<1;
+  const int fine_nj = block->dim.j<<1;
+  const int fine_nk = block->dim.k<<1;
+
+  // start from the offsets, strides and pointers recorded in the block itself...
+  const int src_i = block->read.i,  src_j = block->read.j,  src_k = block->read.k;
+  const int dst_i = block->write.i, dst_j = block->write.j, dst_k = block->write.k;
+  int src_jStride = block->read.jStride,  src_kStride = block->read.kStride;
+  int dst_jStride = block->write.jStride, dst_kStride = block->write.kStride;
+  double * __restrict__ src = block->read.ptr;
+  double * __restrict__ dst = block->write.ptr;
+
+  // ...but a side with box>=0 refers to a box of its level: use that box's vector (offset past its ghost zone) and that box's strides instead
+  if(block->read.box>=0){
+    src_jStride = level_c->my_boxes[block->read.box].jStride;
+    src_kStride = level_c->my_boxes[block->read.box].kStride;
+    src = level_c->my_boxes[block->read.box].vectors[id_c] + level_c->my_boxes[block->read.box].ghosts*(1+level_c->my_boxes[block->read.box].jStride+level_c->my_boxes[block->read.box].kStride);
+  }
+  if(block->write.box>=0){
+    dst_jStride = level_f->my_boxes[block->write.box].jStride;
+    dst_kStride = level_f->my_boxes[block->write.box].kStride;
+    dst = level_f->my_boxes[block->write.box].vectors[id_f] + level_f->my_boxes[block->write.box].ghosts*(1+level_f->my_boxes[block->write.box].jStride+level_f->my_boxes[block->write.box].kStride);
+  }
+
+  // even fine points look backwards (towards the previous coarse cell) while odd fine points look forward (towards the next one)
+  //
+  // |   o   |   o   |
+  // +---+---+---+---+
+  // |   | x | x |   |
+  //
+  int fi,fj,fk;
+  for(fk=0;fk<fine_nk;fk++){const int dk = (fk&0x1) ? src_kStride : -src_kStride;
+  for(fj=0;fj<fine_nj;fj++){const int dj = (fj&0x1) ? src_jStride : -src_jStride;
+    const int dst_row = (((fj   )+dst_j)*dst_jStride) + (((fk   )+dst_k)*dst_kStride);
+    const int src_row = (((fj>>1)+src_j)*src_jStride) + (((fk>>1)+src_k)*src_kStride);
+  for(fi=0;fi<fine_ni;fi++){const int di = (fi&0x1) ? 1 : -1;
+    const int dst_ijk = ((fi   )+dst_i) + dst_row;
+    const int src_ijk = ((fi>>1)+src_i) + src_row;
+    dst[dst_ijk] = prescale_f*dst[dst_ijk] +
+        0.421875*src[src_ijk         ] +
+        0.140625*src[src_ijk      +dk] +
+        0.140625*src[src_ijk   +dj   ] +
+        0.046875*src[src_ijk   +dj+dk] +
+        0.140625*src[src_ijk+di      ] +
+        0.046875*src[src_ijk+di   +dk] +
+        0.046875*src[src_ijk+di+dj   ] +
+        0.015625*src[src_ijk+di+dj+dk];
+  }}}
+
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// perform a (inter-level) piecewise linear interpolation of vector id_c on the coarse level (level_c) into vector id_f on the fine level (level_f), scaling the existing fine values by prescale_f
+void interpolation_p1(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){
+  exchange_boundary(level_c,id_c,STENCIL_SHAPE_BOX); // the linear stencil reads neighbors of each coarse cell; presumably this fills the coarse ghost zones - confirm against exchange_boundary
+       apply_BCs_p1(level_c,id_c,STENCIL_SHAPE_BOX); // both calls run before the timer below starts, so they are not counted in interpolation_total
+
+  double _timeCommunicationStart = getTime(); // start of the interval added to timers.interpolation_total on exit
+  double _timeStart,_timeEnd;                 // bracket each individual phase below
+  int buffer=0;                               // index over blockCopy entries (also handed to PRAGMA_THREAD_ACROSS_BLOCKS)
+  int n;                                      // index over MPI messages
+  int my_tag = (level_f->tag<<4) | 0x7;       // MPI message tag: the fine level's tag shifted up 4 bits, with 0x7 in the low 4 bits
+
+  // Phases: (1) prepost Irecv's, (2) pack send buffers from blocks[0], (3) post Isend's, (4) interpolate blocks[1] locally, (5) Waitall, (6) unpack blocks[2].  Without USE_MPI only (4) is compiled.
+  #ifdef USE_MPI
+  // by convention, level_f allocates a combined array of requests for both level_f recvs and level_c sends...
+  int nMessages = level_c->interpolation.num_sends + level_f->interpolation.num_recvs;             // total requests handed to MPI_Waitall below
+  MPI_Request *recv_requests = level_f->interpolation.requests;                                    // receives occupy the first num_recvs slots
+  MPI_Request *send_requests = level_f->interpolation.requests + level_f->interpolation.num_recvs; // sends follow immediately after the receives
+
+
+  // loop through packed list of MPI receives and prepost Irecv's...
+  if(level_f->interpolation.num_recvs>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level_f->interpolation.num_recvs;n++){
+      MPI_Irecv(level_f->interpolation.recv_buffers[n],
+                level_f->interpolation.recv_sizes[n],
+                MPI_DOUBLE,
+                level_f->interpolation.recv_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &recv_requests[n]
+      );
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_recv += (_timeEnd-_timeStart);
+  }
+
+
+  // pack MPI send buffers... (blocks[0] of the coarse level; the interpolation itself happens here, on the sending side)
+  if(level_c->interpolation.num_blocks[0]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[0])
+    for(buffer=0;buffer<level_c->interpolation.num_blocks[0];buffer++){
+      // !!! prescale==0 because you don't want to increment the MPI buffer
+      interpolation_p1_block(level_f,id_f,0.0,level_c,id_c,&level_c->interpolation.blocks[0][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_pack += (_timeEnd-_timeStart);
+  }
+
+
+  // loop through MPI send buffers and post Isend's...
+  if(level_c->interpolation.num_sends>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level_c->interpolation.num_sends;n++){
+      MPI_Isend(level_c->interpolation.send_buffers[n],
+                level_c->interpolation.send_sizes[n],
+                MPI_DOUBLE,
+                level_c->interpolation.send_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &send_requests[n]
+      );
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_send += (_timeEnd-_timeStart);
+  }
+  #endif
+
+
+  // perform local interpolation... try and hide within Isend latency... (blocks[1] of the coarse level, applied with the caller's prescale_f)
+  if(level_c->interpolation.num_blocks[1]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[1])
+    for(buffer=0;buffer<level_c->interpolation.num_blocks[1];buffer++){
+      interpolation_p1_block(level_f,id_f,prescale_f,level_c,id_c,&level_c->interpolation.blocks[1][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_local += (_timeEnd-_timeStart);
+  }
+
+
+  // wait for MPI to finish... (both the receives and the sends posted above)
+  #ifdef USE_MPI 
+  if(nMessages>0){
+    _timeStart = getTime();
+    MPI_Waitall(nMessages,level_f->interpolation.requests,level_f->interpolation.status);
+    _timeEnd = getTime();
+    level_f->timers.interpolation_wait += (_timeEnd-_timeStart);
+  }
+
+
+  // unpack MPI receive buffers (blocks[2] of the fine level); prescale_f is handed to IncrementBlock, which is defined elsewhere
+  if(level_f->interpolation.num_blocks[2]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->interpolation.num_blocks[2])
+    for(buffer=0;buffer<level_f->interpolation.num_blocks[2];buffer++){
+      IncrementBlock(level_f,id_f,prescale_f,&level_f->interpolation.blocks[2][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_unpack += (_timeEnd-_timeStart);
+  }
+  #endif 
+ 
+ 
+  level_f->timers.interpolation_total += (double)(getTime()-_timeCommunicationStart);
+}
diff --git a/Util/hpgmg/finite-volume/source/operators/interpolation_p2.c b/Util/hpgmg/finite-volume/source/operators/interpolation_p2.c
new file mode 100644
index 00000000..6ad1fa62
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/interpolation_p2.c
@@ -0,0 +1,338 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+static inline void interpolation_p2_block(level_type *level_f, int id_f, double prescale_f, level_type *level_c, int id_c, blockCopy_type *block){
+  // interpolate 3D array from read_i,j,k of read[] to write_i,j,k in write[] using a 3x3x3 coarse stencil built from the 1D weights {-3,30,5}/32; the result is added to prescale_f*write[]
+  int write_dim_i   = block->dim.i<<1; // calculate the dimensions of the resultant fine block
+  int write_dim_j   = block->dim.j<<1;
+  int write_dim_k   = block->dim.k<<1;
+
+  int  read_i       = block->read.i;
+  int  read_j       = block->read.j;
+  int  read_k       = block->read.k;
+  int  read_jStride = block->read.jStride;
+  int  read_kStride = block->read.kStride;
+
+  int write_i       = block->write.i;
+  int write_j       = block->write.j;
+  int write_k       = block->write.k;
+  int write_jStride = block->write.jStride;
+  int write_kStride = block->write.kStride;
+
+  const double * __restrict__  read = block->read.ptr;
+        double * __restrict__ write = block->write.ptr;
+  // when a side refers to a box (box>=0), replace the block's pointer/strides with that box's strides and its vector, offset by box_ghosts cells in each of i, j and k
+  if(block->read.box >=0){
+     read_jStride = level_c->my_boxes[block->read.box ].jStride;
+     read_kStride = level_c->my_boxes[block->read.box ].kStride;
+     read = level_c->my_boxes[ block->read.box].vectors[id_c] + level_c->box_ghosts*(1+ read_jStride+ read_kStride);
+  }
+  if(block->write.box>=0){
+    write_jStride = level_f->my_boxes[block->write.box].jStride;
+    write_kStride = level_f->my_boxes[block->write.box].kStride;
+    write = level_f->my_boxes[block->write.box].vectors[id_f] + level_f->box_ghosts*(1+write_jStride+write_kStride);
+  }
+ 
+  // two implementations follow: USE_NAIVE_INTERP evaluates the full 27-point stencil once per fine cell; the default produces the 2x2x2 fine cells of each coarse cell via 1D passes in i, j, then k
+  #ifdef USE_NAIVE_INTERP
+  int i,j,k;
+  double OneOver32Cubed = 1.0/32768.0; // 1/(32*32*32): the integer coefficients below are products of three 1D weights from {-3,30,5}
+  for(k=0;k<write_dim_k;k++){int delta_k=-read_kStride;if(k&0x1)delta_k=read_kStride;
+  for(j=0;j<write_dim_j;j++){int delta_j=-read_jStride;if(j&0x1)delta_j=read_jStride;
+  for(i=0;i<write_dim_i;i++){int delta_i=           -1;if(i&0x1)delta_i=           1; // i.e. even points look backwards while odd points look forward
+    int write_ijk = ((i   )+write_i) + (((j   )+write_j)*write_jStride) + (((k   )+write_k)*write_kStride);
+    int  read_ijk = ((i>>1)+ read_i) + (((j>>1)+ read_j)* read_jStride) + (((k>>1)+ read_k)* read_kStride);
+    //
+    // | -3/32 | 30/32 |  5/32 |
+    // |---+---|---+---|---+---|
+    // |   |   |   | x |   |   |
+    //
+    write[write_ijk] = prescale_f*write[write_ijk] +
+                       OneOver32Cubed*(
+                         -27.0*read[read_ijk-delta_i-delta_j-delta_k] +
+                         270.0*read[read_ijk        -delta_j-delta_k] +
+                          45.0*read[read_ijk+delta_i-delta_j-delta_k] +
+                         270.0*read[read_ijk-delta_i        -delta_k] +
+                       -2700.0*read[read_ijk                -delta_k] +
+                        -450.0*read[read_ijk+delta_i        -delta_k] +
+                          45.0*read[read_ijk-delta_i+delta_j-delta_k] +
+                        -450.0*read[read_ijk        +delta_j-delta_k] +
+                         -75.0*read[read_ijk+delta_i+delta_j-delta_k] +
+
+                         270.0*read[read_ijk-delta_i-delta_j        ] +
+                       -2700.0*read[read_ijk        -delta_j        ] +
+                        -450.0*read[read_ijk+delta_i-delta_j        ] +
+                       -2700.0*read[read_ijk-delta_i                ] +
+                       27000.0*read[read_ijk                        ] +
+                        4500.0*read[read_ijk+delta_i                ] +
+                        -450.0*read[read_ijk-delta_i+delta_j        ] +
+                        4500.0*read[read_ijk        +delta_j        ] +
+                         750.0*read[read_ijk+delta_i+delta_j        ] +
+                       
+                          45.0*read[read_ijk-delta_i-delta_j+delta_k] +
+                        -450.0*read[read_ijk        -delta_j+delta_k] +
+                         -75.0*read[read_ijk+delta_i-delta_j+delta_k] +
+                        -450.0*read[read_ijk-delta_i        +delta_k] +
+                        4500.0*read[read_ijk                +delta_k] +
+                         750.0*read[read_ijk+delta_i        +delta_k] +
+                         -75.0*read[read_ijk-delta_i+delta_j+delta_k] +
+                         750.0*read[read_ijk        +delta_j+delta_k] +
+                         125.0*read[read_ijk+delta_i+delta_j+delta_k] 
+                       );
+
+  }}}
+  #else
+  int i,j,k;    // fine (write) indices, advanced two at a time
+  int ii,jj,kk; // matching coarse (read) indices, advanced one at a time
+  double w0 =  5.0/32.0; // weight of the coarse neighbor on the same side as the fine cell
+  double w1 = 30.0/32.0; // weight of the coarse cell containing the fine cell
+  double w2 = -3.0/32.0; // weight of the coarse neighbor on the opposite side
+  for(k=0,kk=0;k<write_dim_k;k+=2,kk++){
+  for(j=0,jj=0;j<write_dim_j;j+=2,jj++){
+  // compiler cannot infer/speculate write[ijk+write_jStride] is disjoint from write[ijk], so create a unique restrict pointers for each nonliteral offset...
+  double * __restrict__ write00 = write + write_i + (write_j+j+0)*write_jStride + (write_k+k+0)*write_kStride; // pencil at (j  ,k  )
+  double * __restrict__ write10 = write + write_i + (write_j+j+1)*write_jStride + (write_k+k+0)*write_kStride; // pencil at (j+1,k  )
+  double * __restrict__ write01 = write + write_i + (write_j+j+0)*write_jStride + (write_k+k+1)*write_kStride; // pencil at (j  ,k+1)
+  double * __restrict__ write11 = write + write_i + (write_j+j+1)*write_jStride + (write_k+k+1)*write_kStride; // pencil at (j+1,k+1)
+  for(i=0,ii=0;i<write_dim_i;i+=2,ii++){
+    int write_ijk = ( i+write_i) + ( j+write_j)*write_jStride + ( k+write_k)*write_kStride; // only referenced by the disabled '#if 0' store path below
+    int  read_ijk = (ii+ read_i) + (jj+ read_j)* read_jStride + (kk+ read_k)* read_kStride;
+    //
+    // |  5/32 | 30/32 | -3/32 | coarse grid
+    // |---+---|---+---|---+---|
+    // |   |   | ? |   |   |   | fine grid
+    //
+
+    // grab all coarse grid points... (cXYZ: X,Y,Z in {0,1,2} select offsets {-1,0,+1} in i,j,k respectively, so c111 is the center)
+    const double c000=read[read_ijk-1-read_jStride-read_kStride], c100=read[read_ijk  -read_jStride-read_kStride], c200=read[read_ijk+1-read_jStride-read_kStride];
+    const double c010=read[read_ijk-1             -read_kStride], c110=read[read_ijk               -read_kStride], c210=read[read_ijk+1             -read_kStride];
+    const double c020=read[read_ijk-1+read_jStride-read_kStride], c120=read[read_ijk  +read_jStride-read_kStride], c220=read[read_ijk+1+read_jStride-read_kStride];
+    const double c001=read[read_ijk-1-read_jStride             ], c101=read[read_ijk  -read_jStride             ], c201=read[read_ijk+1-read_jStride             ];
+    const double c011=read[read_ijk-1                          ], c111=read[read_ijk                            ], c211=read[read_ijk+1                          ];
+    const double c021=read[read_ijk-1+read_jStride             ], c121=read[read_ijk  +read_jStride             ], c221=read[read_ijk+1+read_jStride             ];
+    const double c002=read[read_ijk-1-read_jStride+read_kStride], c102=read[read_ijk  -read_jStride+read_kStride], c202=read[read_ijk+1-read_jStride+read_kStride];
+    const double c012=read[read_ijk-1             +read_kStride], c112=read[read_ijk               +read_kStride], c212=read[read_ijk+1             +read_kStride];
+    const double c022=read[read_ijk-1+read_jStride+read_kStride], c122=read[read_ijk  +read_jStride+read_kStride], c222=read[read_ijk+1+read_jStride+read_kStride];
+
+    // interpolate in i to create fine i / coarse jk points... (fXcYZ: X in {0,1} is the lower/upper fine cell in i; Y,Z remain coarse j,k offsets)
+    //
+    // +-------+-------+-------+      :.......+---+---+.......:
+    // |       |       |       |      :       |   |   |       :
+    // |   c   |   c   |   c   |      :       | f | f |       :
+    // |       |       |       |      :       |   |   |       :
+    // +-------+-------+-------+      :.......+---+---+.......:
+    // |       |       |       |      :       |   |   |       :
+    // |   c   |   c   |   c   |  ->  :       | f | f |       :
+    // |       |       |       |      :       |   |   |       :
+    // +-------+-------+-------+      :.......+---+---+.......:
+    // |       |       |       |      :       |   |   |       :
+    // |   c   |   c   |   c   |      :       | f | f |       :
+    // |       |       |       |      :       |   |   |       :
+    // +-------+-------+-------+      :.......+---+---+.......:
+    //
+    const double f0c00 = ( w1*c100 + w0*c000 + w2*c200 );
+    const double f1c00 = ( w1*c100 + w2*c000 + w0*c200 );
+    const double f0c10 = ( w1*c110 + w0*c010 + w2*c210 );
+    const double f1c10 = ( w1*c110 + w2*c010 + w0*c210 );
+    const double f0c20 = ( w1*c120 + w0*c020 + w2*c220 );
+    const double f1c20 = ( w1*c120 + w2*c020 + w0*c220 );
+
+    const double f0c01 = ( w1*c101 + w0*c001 + w2*c201 );
+    const double f1c01 = ( w1*c101 + w2*c001 + w0*c201 );
+    const double f0c11 = ( w1*c111 + w0*c011 + w2*c211 );
+    const double f1c11 = ( w1*c111 + w2*c011 + w0*c211 );
+    const double f0c21 = ( w1*c121 + w0*c021 + w2*c221 );
+    const double f1c21 = ( w1*c121 + w2*c021 + w0*c221 );
+
+    const double f0c02 = ( w1*c102 + w0*c002 + w2*c202 );
+    const double f1c02 = ( w1*c102 + w2*c002 + w0*c202 );
+    const double f0c12 = ( w1*c112 + w0*c012 + w2*c212 );
+    const double f1c12 = ( w1*c112 + w2*c012 + w0*c212 );
+    const double f0c22 = ( w1*c122 + w0*c022 + w2*c222 );
+    const double f1c22 = ( w1*c122 + w2*c022 + w0*c222 );
+
+    // interpolate in j to create fine ij / coarse k points... (fXYcZ: X,Y in {0,1} are fine i,j; Z remains the coarse k offset)
+    //
+    // :.......+---+---+.......:      :.......:.......:.......:
+    // :       |   |   |       :      :       :       :       :
+    // :       |   |   |       :      :       :       :       :
+    // :       |   |   |       :      :       :       :       :
+    // :.......+---+---+.......:      :.......+---+---+.......:
+    // :       |   |   |       :      :       |   |   |       :
+    // :       |   |   |       :  ->  :       +---+---+       :
+    // :       |   |   |       :      :       |   |   |       :
+    // :.......+---+---+.......:      :.......+---+---+.......:
+    // :       |   |   |       :      :       :       :       :
+    // :       |   |   |       :      :       :       :       :
+    // :       |   |   |       :      :       :       :       :
+    // :.......+---+---+.......:      :.......:.......:.......:
+    //
+    const double f00c0 = ( w1*f0c10 + w0*f0c00 + w2*f0c20 );
+    const double f10c0 = ( w1*f1c10 + w0*f1c00 + w2*f1c20 );
+    const double f01c0 = ( w1*f0c10 + w2*f0c00 + w0*f0c20 );
+    const double f11c0 = ( w1*f1c10 + w2*f1c00 + w0*f1c20 );
+
+    const double f00c1 = ( w1*f0c11 + w0*f0c01 + w2*f0c21 );
+    const double f10c1 = ( w1*f1c11 + w0*f1c01 + w2*f1c21 );
+    const double f01c1 = ( w1*f0c11 + w2*f0c01 + w0*f0c21 );
+    const double f11c1 = ( w1*f1c11 + w2*f1c01 + w0*f1c21 );
+
+    const double f00c2 = ( w1*f0c12 + w0*f0c02 + w2*f0c22 );
+    const double f10c2 = ( w1*f1c12 + w0*f1c02 + w2*f1c22 );
+    const double f01c2 = ( w1*f0c12 + w2*f0c02 + w0*f0c22 );
+    const double f11c2 = ( w1*f1c12 + w2*f1c02 + w0*f1c22 );
+
+    // interpolate in k to create fine ijk points... (fXYZ: the fine cell at offsets X,Y,Z in {0,1} within the 2x2x2 group)
+    const double f000 = ( w1*f00c1 + w0*f00c0 + w2*f00c2 );
+    const double f100 = ( w1*f10c1 + w0*f10c0 + w2*f10c2 );
+    const double f010 = ( w1*f01c1 + w0*f01c0 + w2*f01c2 );
+    const double f110 = ( w1*f11c1 + w0*f11c0 + w2*f11c2 );
+    const double f001 = ( w1*f00c1 + w2*f00c0 + w0*f00c2 );
+    const double f101 = ( w1*f10c1 + w2*f10c0 + w0*f10c2 );
+    const double f011 = ( w1*f01c1 + w2*f01c0 + w0*f01c2 );
+    const double f111 = ( w1*f11c1 + w2*f11c0 + w0*f11c2 );
+
+    // commit to memory...
+    #if 0 // compiler cannot infer/speculate write[ijk+write_jStride] is disjoint from write[ijk], and thus cannot vectorize...
+    write[write_ijk                              ] = prescale_f*write[write_ijk                              ] + f000;
+    write[write_ijk+1                            ] = prescale_f*write[write_ijk+1                            ] + f100;
+    write[write_ijk  +write_jStride              ] = prescale_f*write[write_ijk  +write_jStride              ] + f010;
+    write[write_ijk+1+write_jStride              ] = prescale_f*write[write_ijk+1+write_jStride              ] + f110;
+    write[write_ijk                +write_kStride] = prescale_f*write[write_ijk                +write_kStride] + f001;
+    write[write_ijk+1              +write_kStride] = prescale_f*write[write_ijk+1              +write_kStride] + f101;
+    write[write_ijk  +write_jStride+write_kStride] = prescale_f*write[write_ijk  +write_jStride+write_kStride] + f011;
+    write[write_ijk+1+write_jStride+write_kStride] = prescale_f*write[write_ijk+1+write_jStride+write_kStride] + f111;
+    #else // use a unique restrict pointer for each pencil...
+    write00[i  ] = prescale_f*write00[i  ] + f000;
+    write00[i+1] = prescale_f*write00[i+1] + f100;
+    write10[i  ] = prescale_f*write10[i  ] + f010;
+    write10[i+1] = prescale_f*write10[i+1] + f110;
+    write01[i  ] = prescale_f*write01[i  ] + f001;
+    write01[i+1] = prescale_f*write01[i+1] + f101;
+    write11[i  ] = prescale_f*write11[i  ] + f011;
+    write11[i+1] = prescale_f*write11[i+1] + f111;
+    #endif
+
+  }}}
+  #endif
+
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// perform a (inter-level) piecewise quadratic interpolation of vector id_c on the coarse level (level_c) into vector id_f on the fine level (level_f), scaling the existing fine values by prescale_f
+void interpolation_p2(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){
+    exchange_boundary(level_c,id_c,STENCIL_SHAPE_BOX); // the 3x3x3 stencil reads neighbors of each coarse cell; presumably this fills the coarse ghost zones - confirm against exchange_boundary
+         apply_BCs_p2(level_c,id_c,STENCIL_SHAPE_BOX); // both calls run before the timer below starts, so they are not counted in interpolation_total
+
+  double _timeCommunicationStart = getTime(); // start of the interval added to timers.interpolation_total on exit
+  double _timeStart,_timeEnd;                 // bracket each individual phase below
+  int buffer=0;                               // index over blockCopy entries (also handed to PRAGMA_THREAD_ACROSS_BLOCKS)
+  int n;                                      // index over MPI messages
+  int my_tag = (level_f->tag<<4) | 0x7;       // MPI message tag: the fine level's tag shifted up 4 bits, with 0x7 in the low 4 bits; NOTE(review): interpolation_p1 uses the same 0x7
+
+  // Phases: (1) prepost Irecv's, (2) pack send buffers from blocks[0], (3) post Isend's, (4) interpolate blocks[1] locally, (5) Waitall, (6) unpack blocks[2].  Without USE_MPI only (4) is compiled.
+  #ifdef USE_MPI
+  // by convention, level_f allocates a combined array of requests for both level_f recvs and level_c sends...
+  int nMessages = level_c->interpolation.num_sends + level_f->interpolation.num_recvs;             // total requests handed to MPI_Waitall below
+  MPI_Request *recv_requests = level_f->interpolation.requests;                                    // receives occupy the first num_recvs slots
+  MPI_Request *send_requests = level_f->interpolation.requests + level_f->interpolation.num_recvs; // sends follow immediately after the receives
+
+
+  // loop through packed list of MPI receives and prepost Irecv's...
+  if(level_f->interpolation.num_recvs>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level_f->interpolation.num_recvs;n++){
+      MPI_Irecv(level_f->interpolation.recv_buffers[n],
+                level_f->interpolation.recv_sizes[n],
+                MPI_DOUBLE,
+                level_f->interpolation.recv_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &recv_requests[n]
+      );
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_recv += (_timeEnd-_timeStart);
+  }
+
+
+  // pack MPI send buffers... (blocks[0] of the coarse level; the interpolation itself happens here, on the sending side)
+  if(level_c->interpolation.num_blocks[0]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[0])
+    for(buffer=0;buffer<level_c->interpolation.num_blocks[0];buffer++){
+      // !!! prescale==0 because you don't want to increment the MPI buffer
+      interpolation_p2_block(level_f,id_f,0.0,level_c,id_c,&level_c->interpolation.blocks[0][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_pack += (_timeEnd-_timeStart);
+  }
+
+
+  // loop through MPI send buffers and post Isend's...
+  if(level_c->interpolation.num_sends>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level_c->interpolation.num_sends;n++){
+      MPI_Isend(level_c->interpolation.send_buffers[n],
+                level_c->interpolation.send_sizes[n],
+                MPI_DOUBLE,
+                level_c->interpolation.send_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &send_requests[n]
+      );
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_send += (_timeEnd-_timeStart);
+  }
+  #endif
+
+
+  // perform local interpolation... try and hide within Isend latency... (blocks[1] of the coarse level, applied with the caller's prescale_f)
+  if(level_c->interpolation.num_blocks[1]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[1])
+    for(buffer=0;buffer<level_c->interpolation.num_blocks[1];buffer++){
+      interpolation_p2_block(level_f,id_f,prescale_f,level_c,id_c,&level_c->interpolation.blocks[1][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_local += (_timeEnd-_timeStart);
+  }
+
+
+  // wait for MPI to finish... (both the receives and the sends posted above)
+  #ifdef USE_MPI 
+  if(nMessages>0){
+    _timeStart = getTime();
+    MPI_Waitall(nMessages,level_f->interpolation.requests,level_f->interpolation.status);
+    _timeEnd = getTime();
+    level_f->timers.interpolation_wait += (_timeEnd-_timeStart);
+  }
+
+
+  // unpack MPI receive buffers (blocks[2] of the fine level); prescale_f is handed to IncrementBlock, which is defined elsewhere
+  if(level_f->interpolation.num_blocks[2]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->interpolation.num_blocks[2])
+    for(buffer=0;buffer<level_f->interpolation.num_blocks[2];buffer++){
+      IncrementBlock(level_f,id_f,prescale_f,&level_f->interpolation.blocks[2][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_unpack += (_timeEnd-_timeStart);
+  }
+  #endif 
+ 
+ 
+  level_f->timers.interpolation_total += (double)(getTime()-_timeCommunicationStart);
+}
diff --git a/Util/hpgmg/finite-volume/source/operators/interpolation_v2.c b/Util/hpgmg/finite-volume/source/operators/interpolation_v2.c
new file mode 100644
index 00000000..9052d9c2
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/interpolation_v2.c
@@ -0,0 +1,320 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+static inline void interpolation_v2_block(level_type *level_f, int id_f, double prescale_f, level_type *level_c, int id_c, blockCopy_type *block){
+  // interpolate 3D array from read_i,j,k of read[] to write_i,j,k in write[] using volume averaged quadratic prolongation, i.e. write[] = prescale_f*write[] + P*read[] where P is the tensor product of the 1D stencil (1/8, 1, -1/8)
+  int write_dim_i   = block->dim.i<<1; // calculate the dimensions of the resultant fine block (block->dim counts coarse cells; each one produces two fine cells per dimension)
+  int write_dim_j   = block->dim.j<<1;
+  int write_dim_k   = block->dim.k<<1;
+  // origin and strides of the coarse (read) region as recorded in the block descriptor...
+  int  read_i       = block->read.i;
+  int  read_j       = block->read.j;
+  int  read_k       = block->read.k;
+  int  read_jStride = block->read.jStride;
+  int  read_kStride = block->read.kStride;
+  // origin and strides of the fine (write) region as recorded in the block descriptor...
+  int write_i       = block->write.i;
+  int write_j       = block->write.j;
+  int write_k       = block->write.k;
+  int write_jStride = block->write.jStride;
+  int write_kStride = block->write.kStride;
+  // start from the raw pointers stored in the descriptor; they are replaced below whenever a box index is supplied...
+  const double * __restrict__  read = block->read.ptr;
+        double * __restrict__ write = block->write.ptr;
+  // a non-negative box index selects one of the level's boxes: use that box's strides and vector id_c/id_f, offset by box_ghosts in i, j, and k
+  if(block->read.box >=0){
+     read_jStride = level_c->my_boxes[block->read.box ].jStride;
+     read_kStride = level_c->my_boxes[block->read.box ].kStride;
+     read = level_c->my_boxes[ block->read.box].vectors[id_c] + level_c->box_ghosts*(1+ read_jStride+ read_kStride);
+  }
+  if(block->write.box>=0){
+    write_jStride = level_f->my_boxes[block->write.box].jStride;
+    write_kStride = level_f->my_boxes[block->write.box].kStride;
+    write = level_f->my_boxes[block->write.box].vectors[id_f] + level_f->box_ghosts*(1+write_jStride+write_kStride);
+  }
+
+
+  #ifdef USE_NAIVE_INTERP
+  // naive 27pt per fine grid cell... the sign of c1 is flipped for odd fine indices (the second fine cell inside a coarse cell)
+  int i,j,k;
+  double c1 = 1.0/8.0;
+  for(k=0;k<write_dim_k;k++){double c1k=c1;if(k&0x1){c1k=-c1;}
+  for(j=0;j<write_dim_j;j++){double c1j=c1;if(j&0x1){c1j=-c1;}
+  for(i=0;i<write_dim_i;i++){double c1i=c1;if(i&0x1){c1i=-c1;}
+    int write_ijk = ((i   )+write_i) + (((j   )+write_j)*write_jStride) + (((k   )+write_k)*write_kStride);
+    int  read_ijk = ((i>>1)+ read_i) + (((j>>1)+ read_j)* read_jStride) + (((k>>1)+ read_k)* read_kStride);
+    // 1D weights on the three coarse cells for an even fine index (the +/-1/8 weights swap sign for an odd index)...
+    // |  1/8  |  1.0  | -1/8  | coarse grid
+    // |---+---|---+---|---+---|
+    // |   |   |???|   |   |   | fine grid
+    //
+    write[write_ijk] = prescale_f*write[write_ijk] +
+                       + c1k*( + c1j*( c1i*read[read_ijk-1-read_jStride-read_kStride] + read[read_ijk-read_jStride-read_kStride] - c1i*read[read_ijk+1-read_jStride-read_kStride] )
+                               +     ( c1i*read[read_ijk-1             -read_kStride] + read[read_ijk             -read_kStride] - c1i*read[read_ijk+1             -read_kStride] )
+                               - c1j*( c1i*read[read_ijk-1+read_jStride-read_kStride] + read[read_ijk+read_jStride-read_kStride] - c1i*read[read_ijk+1+read_jStride-read_kStride] ) )
+                       +     ( + c1j*( c1i*read[read_ijk-1-read_jStride             ] + read[read_ijk-read_jStride             ] - c1i*read[read_ijk+1-read_jStride             ] )
+                               +     ( c1i*read[read_ijk-1                          ] + read[read_ijk                          ] - c1i*read[read_ijk+1                          ] )
+                               - c1j*( c1i*read[read_ijk-1+read_jStride             ] + read[read_ijk+read_jStride             ] - c1i*read[read_ijk+1+read_jStride             ] ) )
+                       - c1k*( + c1j*( c1i*read[read_ijk-1-read_jStride+read_kStride] + read[read_ijk-read_jStride+read_kStride] - c1i*read[read_ijk+1-read_jStride+read_kStride] )
+                               +     ( c1i*read[read_ijk-1             +read_kStride] + read[read_ijk             +read_kStride] - c1i*read[read_ijk+1             +read_kStride] )
+                               - c1j*( c1i*read[read_ijk-1+read_jStride+read_kStride] + read[read_ijk+read_jStride+read_kStride] - c1i*read[read_ijk+1+read_jStride+read_kStride] ) );
+  }}}
+  #else
+  int i,j,k;
+  int ii,jj,kk;
+  double c1 = 1.0/8.0;
+  for(k=0,kk=0;k<write_dim_k;k+=2,kk++){
+  for(j=0,jj=0;j<write_dim_j;j+=2,jj++){
+  // compiler cannot infer/speculate write[ijk+write_jStride] is disjoint from write[ijk], so create a unique restrict pointer for each nonliteral offset... writeJK is the fine pencil at (j+J,k+K)
+  double * __restrict__ write00 = write + write_i + (write_j+j+0)*write_jStride + (write_k+k+0)*write_kStride;
+  double * __restrict__ write10 = write + write_i + (write_j+j+1)*write_jStride + (write_k+k+0)*write_kStride;
+  double * __restrict__ write01 = write + write_i + (write_j+j+0)*write_jStride + (write_k+k+1)*write_kStride;
+  double * __restrict__ write11 = write + write_i + (write_j+j+1)*write_jStride + (write_k+k+1)*write_kStride;
+  for(i=0,ii=0;i<write_dim_i;i+=2,ii++){
+    int write_ijk = ( i+write_i) + ( j+write_j)*write_jStride + ( k+write_k)*write_kStride; // only referenced by the disabled (#if 0) store path below
+    int  read_ijk = (ii+ read_i) + (jj+ read_j)* read_jStride + (kk+ read_k)* read_kStride;
+    // each iteration consumes one coarse cell (plus its 26 neighbors) and produces the 2x2x2 fine cells that cover it...
+    // |  1/8  |  1.0  | -1/8  | coarse grid
+    // |---+---|---+---|---+---|
+    // |   |   |???|   |   |   | fine grid
+    //
+
+    // grab all 27 coarse grid points... cIJK is the coarse value at offset (I-1,J-1,K-1) from read_ijk, so one neighbor in every direction must be readable
+    const double c000=read[read_ijk-1-read_jStride-read_kStride], c100=read[read_ijk  -read_jStride-read_kStride], c200=read[read_ijk+1-read_jStride-read_kStride];
+    const double c010=read[read_ijk-1             -read_kStride], c110=read[read_ijk               -read_kStride], c210=read[read_ijk+1             -read_kStride];
+    const double c020=read[read_ijk-1+read_jStride-read_kStride], c120=read[read_ijk  +read_jStride-read_kStride], c220=read[read_ijk+1+read_jStride-read_kStride];
+    const double c001=read[read_ijk-1-read_jStride             ], c101=read[read_ijk  -read_jStride             ], c201=read[read_ijk+1-read_jStride             ];
+    const double c011=read[read_ijk-1                          ], c111=read[read_ijk                            ], c211=read[read_ijk+1                          ];
+    const double c021=read[read_ijk-1+read_jStride             ], c121=read[read_ijk  +read_jStride             ], c221=read[read_ijk+1+read_jStride             ];
+    const double c002=read[read_ijk-1-read_jStride+read_kStride], c102=read[read_ijk  -read_jStride+read_kStride], c202=read[read_ijk+1-read_jStride+read_kStride];
+    const double c012=read[read_ijk-1             +read_kStride], c112=read[read_ijk               +read_kStride], c212=read[read_ijk+1             +read_kStride];
+    const double c022=read[read_ijk-1+read_jStride+read_kStride], c122=read[read_ijk  +read_jStride+read_kStride], c222=read[read_ijk+1+read_jStride+read_kStride];
+
+    // interpolate in i to create fine i / coarse jk points... fAcJK is fine i-half A (0 or 1) on the coarse row at j,k offsets (J-1,K-1)
+    //
+    // +-------+-------+-------+      :.......+---+---+.......:
+    // |       |       |       |      :       |   |   |       :
+    // |   c   |   c   |   c   |      :       | f | f |       :
+    // |       |       |       |      :       |   |   |       :
+    // +-------+-------+-------+      :.......+---+---+.......:
+    // |       |       |       |      :       |   |   |       :
+    // |   c   |   c   |   c   |  ->  :       | f | f |       :
+    // |       |       |       |      :       |   |   |       :
+    // +-------+-------+-------+      :.......+---+---+.......:
+    // |       |       |       |      :       |   |   |       :
+    // |   c   |   c   |   c   |      :       | f | f |       :
+    // |       |       |       |      :       |   |   |       :
+    // +-------+-------+-------+      :.......+---+---+.......:
+    //
+    const double f0c00 = ( c100 + c1*(c000-c200) ); // same as original 3pt stencil... f0c00 = ( c1*c000 + c100 - c1*c200 );
+    const double f1c00 = ( c100 - c1*(c000-c200) );
+    const double f0c10 = ( c110 + c1*(c010-c210) );
+    const double f1c10 = ( c110 - c1*(c010-c210) );
+    const double f0c20 = ( c120 + c1*(c020-c220) );
+    const double f1c20 = ( c120 - c1*(c020-c220) );
+
+    const double f0c01 = ( c101 + c1*(c001-c201) );
+    const double f1c01 = ( c101 - c1*(c001-c201) );
+    const double f0c11 = ( c111 + c1*(c011-c211) );
+    const double f1c11 = ( c111 - c1*(c011-c211) );
+    const double f0c21 = ( c121 + c1*(c021-c221) );
+    const double f1c21 = ( c121 - c1*(c021-c221) );
+
+    const double f0c02 = ( c102 + c1*(c002-c202) );
+    const double f1c02 = ( c102 - c1*(c002-c202) );
+    const double f0c12 = ( c112 + c1*(c012-c212) );
+    const double f1c12 = ( c112 - c1*(c012-c212) );
+    const double f0c22 = ( c122 + c1*(c022-c222) );
+    const double f1c22 = ( c122 - c1*(c022-c222) );
+
+    // interpolate in j to create fine ij / coarse k points... fABcK is fine i-half A and fine j-half B on the coarse plane at k offset K-1
+    //
+    // :.......+---+---+.......:      :.......:.......:.......:
+    // :       |   |   |       :      :       :       :       :
+    // :       |   |   |       :      :       :       :       :
+    // :       |   |   |       :      :       :       :       :
+    // :.......+---+---+.......:      :.......+---+---+.......:
+    // :       |   |   |       :      :       |   |   |       :
+    // :       |   |   |       :  ->  :       +---+---+       :
+    // :       |   |   |       :      :       |   |   |       :
+    // :.......+---+---+.......:      :.......+---+---+.......:
+    // :       |   |   |       :      :       :       :       :
+    // :       |   |   |       :      :       :       :       :
+    // :       |   |   |       :      :       :       :       :
+    // :.......+---+---+.......:      :.......:.......:.......:
+    //
+    const double f00c0 = ( f0c10 + c1*(f0c00-f0c20) );
+    const double f10c0 = ( f1c10 + c1*(f1c00-f1c20) );
+    const double f01c0 = ( f0c10 - c1*(f0c00-f0c20) );
+    const double f11c0 = ( f1c10 - c1*(f1c00-f1c20) );
+
+    const double f00c1 = ( f0c11 + c1*(f0c01-f0c21) );
+    const double f10c1 = ( f1c11 + c1*(f1c01-f1c21) );
+    const double f01c1 = ( f0c11 - c1*(f0c01-f0c21) );
+    const double f11c1 = ( f1c11 - c1*(f1c01-f1c21) );
+
+    const double f00c2 = ( f0c12 + c1*(f0c02-f0c22) );
+    const double f10c2 = ( f1c12 + c1*(f1c02-f1c22) );
+    const double f01c2 = ( f0c12 - c1*(f0c02-f0c22) );
+    const double f11c2 = ( f1c12 - c1*(f1c02-f1c22) );
+
+    // interpolate in k to create fine ijk points... fABC is the fine cell at (i+A,j+B,k+C)
+    const double f000 = ( f00c1 + c1*(f00c0-f00c2) );
+    const double f100 = ( f10c1 + c1*(f10c0-f10c2) );
+    const double f010 = ( f01c1 + c1*(f01c0-f01c2) );
+    const double f110 = ( f11c1 + c1*(f11c0-f11c2) );
+    const double f001 = ( f00c1 - c1*(f00c0-f00c2) );
+    const double f101 = ( f10c1 - c1*(f10c0-f10c2) );
+    const double f011 = ( f01c1 - c1*(f01c0-f01c2) );
+    const double f111 = ( f11c1 - c1*(f11c0-f11c2) );
+
+    // commit to memory... every store is write = prescale_f*write + interpolated value
+    #if 0 // compiler cannot infer/speculate write[ijk+write_jStride] is disjoint from write[ijk], and thus cannot vectorize...
+    write[write_ijk                              ] = prescale_f*write[write_ijk                              ] + f000;
+    write[write_ijk+1                            ] = prescale_f*write[write_ijk+1                            ] + f100;
+    write[write_ijk  +write_jStride              ] = prescale_f*write[write_ijk  +write_jStride              ] + f010;
+    write[write_ijk+1+write_jStride              ] = prescale_f*write[write_ijk+1+write_jStride              ] + f110;
+    write[write_ijk                +write_kStride] = prescale_f*write[write_ijk                +write_kStride] + f001;
+    write[write_ijk+1              +write_kStride] = prescale_f*write[write_ijk+1              +write_kStride] + f101;
+    write[write_ijk  +write_jStride+write_kStride] = prescale_f*write[write_ijk  +write_jStride+write_kStride] + f011;
+    write[write_ijk+1+write_jStride+write_kStride] = prescale_f*write[write_ijk+1+write_jStride+write_kStride] + f111;
+    #else // use a unique restrict pointer for each pencil...
+    write00[i  ] = prescale_f*write00[i  ] + f000;
+    write00[i+1] = prescale_f*write00[i+1] + f100;
+    write10[i  ] = prescale_f*write10[i  ] + f010;
+    write10[i+1] = prescale_f*write10[i+1] + f110;
+    write01[i  ] = prescale_f*write01[i  ] + f001;
+    write01[i+1] = prescale_f*write01[i+1] + f101;
+    write11[i  ] = prescale_f*write11[i  ] + f011;
+    write11[i+1] = prescale_f*write11[i+1] + f111;
+    #endif
+
+  }}}
+  #endif
+
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// Performs an (inter-level) volumetric quadratic interpolation of vector id_c on the coarse level (level_c) and increments prescale_f*vector id_f on the fine level (level_f) by the result
+// i.e. id_f = prescale_f*id_f + P*id_c
+// prescale_f is nominally 1.0 or 0.0
+// quadratic interpolation requires a full ghost zone exchange and boundary condition on id_c; both are applied at the top of this routine
+// This is a rather bulk synchronous implementation which packs all MPI buffers before initiating any sends.
+// Similarly, it waits for all remote data before copying any into local boxes.
+// It does however attempt to overlap local interpolation with MPI.  Without USE_MPI only the local blocks are interpolated.  Per-phase times are accumulated into level_f->timers.
+void interpolation_v2(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){
+    exchange_boundary(level_c,id_c,STENCIL_SHAPE_BOX); // NOTE(review): presumably fills the edge/corner ghost cells the 27pt stencil reads - confirm against exchange_boundary
+         apply_BCs_v2(level_c,id_c,STENCIL_SHAPE_BOX);
+  // the interpolation_total timer starts here, i.e. after the ghost zone exchange and boundary conditions above
+  double _timeCommunicationStart = getTime();
+  double _timeStart,_timeEnd;
+  int buffer=0;
+  int n;
+  int my_tag = (level_f->tag<<4) | 0x7; // MPI tag used by both the Irecv's and Isend's below: level_f's tag shifted left 4 bits with 0x7 in the low bits
+
+
+  #ifdef USE_MPI
+  // by convention, level_f allocates a combined array of requests for both level_f recvs and level_c sends...
+  int nMessages = level_c->interpolation.num_sends + level_f->interpolation.num_recvs;
+  MPI_Request *recv_requests = level_f->interpolation.requests; // receive requests occupy the first num_recvs entries
+  MPI_Request *send_requests = level_f->interpolation.requests + level_f->interpolation.num_recvs; // send requests follow immediately after
+
+
+  // loop through packed list of MPI receives and prepost Irecv's...
+  if(level_f->interpolation.num_recvs>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level_f->interpolation.num_recvs;n++){
+      MPI_Irecv(level_f->interpolation.recv_buffers[n],
+                level_f->interpolation.recv_sizes[n],
+                MPI_DOUBLE,
+                level_f->interpolation.recv_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &recv_requests[n]
+      );
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_recv += (_timeEnd-_timeStart);
+  }
+
+
+  // pack MPI send buffers... the coarse data is interpolated while packing (blocks[0] of level_c), so the buffers carry fine values
+  if(level_c->interpolation.num_blocks[0]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[0])
+    for(buffer=0;buffer<level_c->interpolation.num_blocks[0];buffer++){
+      // !!! prescale==0 because you don't want to increment the MPI buffer (NOTE(review): the block routine still evaluates 0.0*buffer, which is NaN if the buffer holds NaN/Inf - presumably the buffers are initialized at allocation; confirm)
+      interpolation_v2_block(level_f,id_f,0.0,level_c,id_c,&level_c->interpolation.blocks[0][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_pack += (_timeEnd-_timeStart);
+  }
+
+
+  // loop through MPI send buffers and post Isend's...
+  if(level_c->interpolation.num_sends>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level_c->interpolation.num_sends;n++){
+      MPI_Isend(level_c->interpolation.send_buffers[n],
+                level_c->interpolation.send_sizes[n],
+                MPI_DOUBLE,
+                level_c->interpolation.send_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &send_requests[n]
+      );
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_send += (_timeEnd-_timeStart);
+  }
+  #endif
+
+
+  // perform local interpolation (blocks[1] of level_c, using the caller's prescale_f)... try and hide within Isend latency...
+  if(level_c->interpolation.num_blocks[1]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[1])
+    for(buffer=0;buffer<level_c->interpolation.num_blocks[1];buffer++){
+      interpolation_v2_block(level_f,id_f,prescale_f,level_c,id_c,&level_c->interpolation.blocks[1][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_local += (_timeEnd-_timeStart);
+  }
+
+
+  // wait for MPI to finish... a single Waitall covers both the receives and the sends in the combined request array
+  #ifdef USE_MPI 
+  if(nMessages>0){
+    _timeStart = getTime();
+    MPI_Waitall(nMessages,level_f->interpolation.requests,level_f->interpolation.status);
+    _timeEnd = getTime();
+    level_f->timers.interpolation_wait += (_timeEnd-_timeStart);
+  }
+
+
+  // unpack MPI receive buffers (blocks[2] of level_f) via IncrementBlock with prescale_f... presumably write = prescale_f*write + buffer; confirm against IncrementBlock
+  if(level_f->interpolation.num_blocks[2]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->interpolation.num_blocks[2])
+    for(buffer=0;buffer<level_f->interpolation.num_blocks[2];buffer++){
+      IncrementBlock(level_f,id_f,prescale_f,&level_f->interpolation.blocks[2][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_unpack += (_timeEnd-_timeStart);
+  }
+  #endif 
+ 
+ 
+  level_f->timers.interpolation_total += (double)(getTime()-_timeCommunicationStart);
+}
diff --git a/Util/hpgmg/finite-volume/source/operators/interpolation_v4.c b/Util/hpgmg/finite-volume/source/operators/interpolation_v4.c
new file mode 100644
index 00000000..8a0d6d89
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/interpolation_v4.c
@@ -0,0 +1,386 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+static inline void interpolation_v4_block(level_type *level_f, int id_f, double prescale_f, level_type *level_c, int id_c, blockCopy_type *block){
+  // interpolate 3D array from read_i,j,k of read[] to write_i,j,k in write[] using volume averaged quartic prolongation
+  int write_dim_i   = block->dim.i<<1; // calculate the dimensions of the resultant fine block
+  int write_dim_j   = block->dim.j<<1;
+  int write_dim_k   = block->dim.k<<1;
+
+  int  read_i       = block->read.i;
+  int  read_j       = block->read.j;
+  int  read_k       = block->read.k;
+  int  read_jStride = block->read.jStride;
+  int  read_kStride = block->read.kStride;
+
+  int write_i       = block->write.i;
+  int write_j       = block->write.j;
+  int write_k       = block->write.k;
+  int write_jStride = block->write.jStride;
+  int write_kStride = block->write.kStride;
+
+  const double * __restrict__  read = block->read.ptr;
+        double * __restrict__ write = block->write.ptr;
+
+  if(block->read.box >=0){
+     read_jStride = level_c->my_boxes[block->read.box ].jStride;
+     read_kStride = level_c->my_boxes[block->read.box ].kStride;
+     read = level_c->my_boxes[ block->read.box].vectors[id_c] + level_c->box_ghosts*(1+ read_jStride+ read_kStride);
+  }
+  if(block->write.box>=0){
+    write_jStride = level_f->my_boxes[block->write.box].jStride;
+    write_kStride = level_f->my_boxes[block->write.box].kStride;
+    write = level_f->my_boxes[block->write.box].vectors[id_f] + level_f->box_ghosts*(1+write_jStride+write_kStride);
+  }
+ 
+
+  #ifdef USE_NAIVE_INTERP
+  // naive 125pt per fine grid cell
+  int i,j,k;
+  double c2 = -3.0/128.0;
+  double c1 = 22.0/128.0;
+  int dj  =   read_jStride;
+  int dk  =   read_kStride;
+  int dj2 = 2*read_jStride;
+  int dk2 = 2*read_kStride;
+  for(k=0;k<write_dim_k;k++){double sk1=c1,sk2=c2;if(k&0x1){sk1=-c1;sk2=-c2;}
+  for(j=0;j<write_dim_j;j++){double sj1=c1,sj2=c2;if(j&0x1){sj1=-c1;sj2=-c2;}
+  for(i=0;i<write_dim_i;i++){double si1=c1,si2=c2;if(i&0x1){si1=-c1;si2=-c2;}
+    int write_ijk = ((i   )+write_i) + (((j   )+write_j)*write_jStride) + (((k   )+write_k)*write_kStride);
+    int  read_ijk = ((i>>1)+ read_i) + (((j>>1)+ read_j)* read_jStride) + (((k>>1)+ read_k)* read_kStride);
+    //
+    // |   -3/128  |  +22/128  |    1.0    |  -22/128  |   +3/128  | coarse grid
+    // |-----+-----|-----+-----|-----+-----|-----+-----|-----+-----|
+    // |     |     |     |     |?????|     |     |     |     |     | fine grid
+    //
+    write[write_ijk] = prescale_f*write[write_ijk] +
+                       + sk2*( + sj2*( si2*read[read_ijk-2-dj2-dk2] + si1*read[read_ijk-1-dj2-dk2] + read[read_ijk-dj2-dk2] - si1*read[read_ijk+1-dj2-dk2] - si2*read[read_ijk+2-dj2-dk2] )
+                               + sj1*( si2*read[read_ijk-2-dj -dk2] + si1*read[read_ijk-1-dj -dk2] + read[read_ijk-dj -dk2] - si1*read[read_ijk+1-dj -dk2] - si2*read[read_ijk+2-dj -dk2] )
+                               +     ( si2*read[read_ijk-2    -dk2] + si1*read[read_ijk-1    -dk2] + read[read_ijk    -dk2] - si1*read[read_ijk+1    -dk2] - si2*read[read_ijk+2    -dk2] )
+                               - sj1*( si2*read[read_ijk-2+dj -dk2] + si1*read[read_ijk-1+dj -dk2] + read[read_ijk+dj -dk2] - si1*read[read_ijk+1+dj -dk2] - si2*read[read_ijk+2+dj -dk2] )
+                               - sj2*( si2*read[read_ijk-2+dj2-dk2] + si1*read[read_ijk-1+dj2-dk2] + read[read_ijk+dj2-dk2] - si1*read[read_ijk+1+dj2-dk2] - si2*read[read_ijk+2+dj2-dk2] ) )
+                       + sk1*( + sj2*( si2*read[read_ijk-2-dj2-dk ] + si1*read[read_ijk-1-dj2-dk ] + read[read_ijk-dj2-dk ] - si1*read[read_ijk+1-dj2-dk ] - si2*read[read_ijk+2-dj2-dk ] )
+                               + sj1*( si2*read[read_ijk-2-dj -dk ] + si1*read[read_ijk-1-dj -dk ] + read[read_ijk-dj -dk ] - si1*read[read_ijk+1-dj -dk ] - si2*read[read_ijk+2-dj -dk ] )
+                               +     ( si2*read[read_ijk-2    -dk ] + si1*read[read_ijk-1    -dk ] + read[read_ijk    -dk ] - si1*read[read_ijk+1    -dk ] - si2*read[read_ijk+2    -dk ] )
+                               - sj1*( si2*read[read_ijk-2+dj -dk ] + si1*read[read_ijk-1+dj -dk ] + read[read_ijk+dj -dk ] - si1*read[read_ijk+1+dj -dk ] - si2*read[read_ijk+2+dj -dk ] )
+                               - sj2*( si2*read[read_ijk-2+dj2-dk ] + si1*read[read_ijk-1+dj2-dk ] + read[read_ijk+dj2-dk ] - si1*read[read_ijk+1+dj2-dk ] - si2*read[read_ijk+2+dj2-dk ] ) )
+                       +     ( + sj2*( si2*read[read_ijk-2-dj2    ] + si1*read[read_ijk-1-dj2    ] + read[read_ijk-dj2    ] - si1*read[read_ijk+1-dj2    ] - si2*read[read_ijk+2-dj2    ] )
+                               + sj1*( si2*read[read_ijk-2-dj     ] + si1*read[read_ijk-1-dj     ] + read[read_ijk-dj     ] - si1*read[read_ijk+1-dj     ] - si2*read[read_ijk+2-dj     ] )
+                               +     ( si2*read[read_ijk-2        ] + si1*read[read_ijk-1        ] + read[read_ijk        ] - si1*read[read_ijk+1        ] - si2*read[read_ijk+2        ] )
+                               - sj1*( si2*read[read_ijk-2+dj     ] + si1*read[read_ijk-1+dj     ] + read[read_ijk+dj     ] - si1*read[read_ijk+1+dj     ] - si2*read[read_ijk+2+dj     ] )
+                               - sj2*( si2*read[read_ijk-2+dj2    ] + si1*read[read_ijk-1+dj2    ] + read[read_ijk+dj2    ] - si1*read[read_ijk+1+dj2    ] - si2*read[read_ijk+2+dj2    ] ) )
+                       - sk1*( + sj2*( si2*read[read_ijk-2-dj2+dk ] + si1*read[read_ijk-1-dj2+dk ] + read[read_ijk-dj2+dk ] - si1*read[read_ijk+1-dj2+dk ] - si2*read[read_ijk+2-dj2+dk ] )
+                               + sj1*( si2*read[read_ijk-2-dj +dk ] + si1*read[read_ijk-1-dj +dk ] + read[read_ijk-dj +dk ] - si1*read[read_ijk+1-dj +dk ] - si2*read[read_ijk+2-dj +dk ] )
+                               +     ( si2*read[read_ijk-2    +dk ] + si1*read[read_ijk-1    +dk ] + read[read_ijk    +dk ] - si1*read[read_ijk+1    +dk ] - si2*read[read_ijk+2    +dk ] )
+                               - sj1*( si2*read[read_ijk-2+dj +dk ] + si1*read[read_ijk-1+dj +dk ] + read[read_ijk+dj +dk ] - si1*read[read_ijk+1+dj +dk ] - si2*read[read_ijk+2+dj +dk ] )
+                               - sj2*( si2*read[read_ijk-2+dj2+dk ] + si1*read[read_ijk-1+dj2+dk ] + read[read_ijk+dj2+dk ] - si1*read[read_ijk+1+dj2+dk ] - si2*read[read_ijk+2+dj2+dk ] ) )
+                       - sk2*( + sj2*( si2*read[read_ijk-2-dj2+dk2] + si1*read[read_ijk-1-dj2+dk2] + read[read_ijk-dj2+dk2] - si1*read[read_ijk+1-dj2+dk2] - si2*read[read_ijk+2-dj2+dk2] )
+                               + sj1*( si2*read[read_ijk-2-dj +dk2] + si1*read[read_ijk-1-dj +dk2] + read[read_ijk-dj +dk2] - si1*read[read_ijk+1-dj +dk2] - si2*read[read_ijk+2-dj +dk2] )
+                               +     ( si2*read[read_ijk-2    +dk2] + si1*read[read_ijk-1    +dk2] + read[read_ijk    +dk2] - si1*read[read_ijk+1    +dk2] - si2*read[read_ijk+2    +dk2] )
+                               - sj1*( si2*read[read_ijk-2+dj +dk2] + si1*read[read_ijk-1+dj +dk2] + read[read_ijk+dj +dk2] - si1*read[read_ijk+1+dj +dk2] - si2*read[read_ijk+2+dj +dk2] )
+                               - sj2*( si2*read[read_ijk-2+dj2+dk2] + si1*read[read_ijk-1+dj2+dk2] + read[read_ijk+dj2+dk2] - si1*read[read_ijk+1+dj2+dk2] - si2*read[read_ijk+2+dj2+dk2] ) );
+  }}}
+  #else
+  // exploit tensor product symmetry and perform 8 fine grid interpolations at a time...
+  //   50 x 5pt for i
+  //   20 x 5pt for j 
+  //    8 x 5pt for k
+  // ----------------
+  //   78 x 5pt for 8 cells (vs 8x125pt = 200x5pt in naive)
+  int i,j,k;
+  int ii,jj,kk;
+  double c2 = -3.0/128.0;
+  double c1 = 22.0/128.0;
+  int dj  =   read_jStride;
+  int dk  =   read_kStride;
+  int dj2 = 2*read_jStride;
+  int dk2 = 2*read_kStride;
+  for(k=0,kk=0;k<write_dim_k;k+=2,kk++){
+  for(j=0,jj=0;j<write_dim_j;j+=2,jj++){
+  // compiler cannot infer/speculate write[ijk+write_jStride] is disjoint from write[ijk], so create a unique restrict pointers for each nonliteral offset...
+  double * __restrict__ write00 = write + write_i + (write_j+j+0)*write_jStride + (write_k+k+0)*write_kStride;
+  double * __restrict__ write10 = write + write_i + (write_j+j+1)*write_jStride + (write_k+k+0)*write_kStride;
+  double * __restrict__ write01 = write + write_i + (write_j+j+0)*write_jStride + (write_k+k+1)*write_kStride;
+  double * __restrict__ write11 = write + write_i + (write_j+j+1)*write_jStride + (write_k+k+1)*write_kStride;
+  for(i=0,ii=0;i<write_dim_i;i+=2,ii++){
+    int write_ijk = ( i+write_i) + ( j+write_j)*write_jStride + ( k+write_k)*write_kStride;
+    int  read_ijk = (ii+ read_i) + (jj+ read_j)* read_jStride + (kk+ read_k)* read_kStride;
+    //
+    // |   -3/128  |  +22/128  |    1.0    |  -22/128  |   +3/128  | coarse grid
+    // |-----+-----|-----+-----|-----+-----|-----+-----|-----+-----|
+    // |     |     |     |     |?????|     |     |     |     |     | fine grid
+    //
+
+    // grab all coarse grid points...
+    const double c000=read[read_ijk-2-dj2-dk2], c100=read[read_ijk-1-dj2-dk2], c200=read[read_ijk-dj2-dk2], c300=read[read_ijk+1-dj2-dk2], c400=read[read_ijk+2-dj2-dk2];
+    const double c010=read[read_ijk-2-dj -dk2], c110=read[read_ijk-1-dj -dk2], c210=read[read_ijk-dj -dk2], c310=read[read_ijk+1-dj -dk2], c410=read[read_ijk+2-dj -dk2];
+    const double c020=read[read_ijk-2    -dk2], c120=read[read_ijk-1    -dk2], c220=read[read_ijk    -dk2], c320=read[read_ijk+1    -dk2], c420=read[read_ijk+2    -dk2];
+    const double c030=read[read_ijk-2+dj -dk2], c130=read[read_ijk-1+dj -dk2], c230=read[read_ijk+dj -dk2], c330=read[read_ijk+1+dj -dk2], c430=read[read_ijk+2+dj -dk2];
+    const double c040=read[read_ijk-2+dj2-dk2], c140=read[read_ijk-1+dj2-dk2], c240=read[read_ijk+dj2-dk2], c340=read[read_ijk+1+dj2-dk2], c440=read[read_ijk+2+dj2-dk2];
+
+    const double c001=read[read_ijk-2-dj2-dk ], c101=read[read_ijk-1-dj2-dk ], c201=read[read_ijk-dj2-dk ], c301=read[read_ijk+1-dj2-dk ], c401=read[read_ijk+2-dj2-dk ];
+    const double c011=read[read_ijk-2-dj -dk ], c111=read[read_ijk-1-dj -dk ], c211=read[read_ijk-dj -dk ], c311=read[read_ijk+1-dj -dk ], c411=read[read_ijk+2-dj -dk ];
+    const double c021=read[read_ijk-2    -dk ], c121=read[read_ijk-1    -dk ], c221=read[read_ijk    -dk ], c321=read[read_ijk+1    -dk ], c421=read[read_ijk+2    -dk ];
+    const double c031=read[read_ijk-2+dj -dk ], c131=read[read_ijk-1+dj -dk ], c231=read[read_ijk+dj -dk ], c331=read[read_ijk+1+dj -dk ], c431=read[read_ijk+2+dj -dk ];
+    const double c041=read[read_ijk-2+dj2-dk ], c141=read[read_ijk-1+dj2-dk ], c241=read[read_ijk+dj2-dk ], c341=read[read_ijk+1+dj2-dk ], c441=read[read_ijk+2+dj2-dk ];
+
+    const double c002=read[read_ijk-2-dj2    ], c102=read[read_ijk-1-dj2    ], c202=read[read_ijk-dj2    ], c302=read[read_ijk+1-dj2    ], c402=read[read_ijk+2-dj2    ];
+    const double c012=read[read_ijk-2-dj     ], c112=read[read_ijk-1-dj     ], c212=read[read_ijk-dj     ], c312=read[read_ijk+1-dj     ], c412=read[read_ijk+2-dj     ];
+    const double c022=read[read_ijk-2        ], c122=read[read_ijk-1        ], c222=read[read_ijk        ], c322=read[read_ijk+1        ], c422=read[read_ijk+2        ];
+    const double c032=read[read_ijk-2+dj     ], c132=read[read_ijk-1+dj     ], c232=read[read_ijk+dj     ], c332=read[read_ijk+1+dj     ], c432=read[read_ijk+2+dj     ];
+    const double c042=read[read_ijk-2+dj2    ], c142=read[read_ijk-1+dj2    ], c242=read[read_ijk+dj2    ], c342=read[read_ijk+1+dj2    ], c442=read[read_ijk+2+dj2    ];
+
+    const double c003=read[read_ijk-2-dj2+dk ], c103=read[read_ijk-1-dj2+dk ], c203=read[read_ijk-dj2+dk ], c303=read[read_ijk+1-dj2+dk ], c403=read[read_ijk+2-dj2+dk ];
+    const double c013=read[read_ijk-2-dj +dk ], c113=read[read_ijk-1-dj +dk ], c213=read[read_ijk-dj +dk ], c313=read[read_ijk+1-dj +dk ], c413=read[read_ijk+2-dj +dk ];
+    const double c023=read[read_ijk-2    +dk ], c123=read[read_ijk-1    +dk ], c223=read[read_ijk    +dk ], c323=read[read_ijk+1    +dk ], c423=read[read_ijk+2    +dk ];
+    const double c033=read[read_ijk-2+dj +dk ], c133=read[read_ijk-1+dj +dk ], c233=read[read_ijk+dj +dk ], c333=read[read_ijk+1+dj +dk ], c433=read[read_ijk+2+dj +dk ];
+    const double c043=read[read_ijk-2+dj2+dk ], c143=read[read_ijk-1+dj2+dk ], c243=read[read_ijk+dj2+dk ], c343=read[read_ijk+1+dj2+dk ], c443=read[read_ijk+2+dj2+dk ];
+
+    const double c004=read[read_ijk-2-dj2+dk2], c104=read[read_ijk-1-dj2+dk2], c204=read[read_ijk-dj2+dk2], c304=read[read_ijk+1-dj2+dk2], c404=read[read_ijk+2-dj2+dk2];
+    const double c014=read[read_ijk-2-dj +dk2], c114=read[read_ijk-1-dj +dk2], c214=read[read_ijk-dj +dk2], c314=read[read_ijk+1-dj +dk2], c414=read[read_ijk+2-dj +dk2];
+    const double c024=read[read_ijk-2    +dk2], c124=read[read_ijk-1    +dk2], c224=read[read_ijk    +dk2], c324=read[read_ijk+1    +dk2], c424=read[read_ijk+2    +dk2];
+    const double c034=read[read_ijk-2+dj +dk2], c134=read[read_ijk-1+dj +dk2], c234=read[read_ijk+dj +dk2], c334=read[read_ijk+1+dj +dk2], c434=read[read_ijk+2+dj +dk2];
+    const double c044=read[read_ijk-2+dj2+dk2], c144=read[read_ijk-1+dj2+dk2], c244=read[read_ijk+dj2+dk2], c344=read[read_ijk+1+dj2+dk2], c444=read[read_ijk+2+dj2+dk2];
+
+    // interpolate in i to create fine i / coarse jk points...
+    const double f0c00 = ( c200 + c1*(c100-c300) + c2*(c000-c400) ); // same as original 5pt stencil...  f0c00 = ( c2*c000 + c1*c100 + c200 - c1*c300 - c2*c400 )
+    const double f1c00 = ( c200 - c1*(c100-c300) - c2*(c000-c400) );
+    const double f0c10 = ( c210 + c1*(c110-c310) + c2*(c010-c410) );
+    const double f1c10 = ( c210 - c1*(c110-c310) - c2*(c010-c410) );
+    const double f0c20 = ( c220 + c1*(c120-c320) + c2*(c020-c420) );
+    const double f1c20 = ( c220 - c1*(c120-c320) - c2*(c020-c420) );
+    const double f0c30 = ( c230 + c1*(c130-c330) + c2*(c030-c430) );
+    const double f1c30 = ( c230 - c1*(c130-c330) - c2*(c030-c430) );
+    const double f0c40 = ( c240 + c1*(c140-c340) + c2*(c040-c440) );
+    const double f1c40 = ( c240 - c1*(c140-c340) - c2*(c040-c440) );
+
+    const double f0c01 = ( c201 + c1*(c101-c301) + c2*(c001-c401) );
+    const double f1c01 = ( c201 - c1*(c101-c301) - c2*(c001-c401) );
+    const double f0c11 = ( c211 + c1*(c111-c311) + c2*(c011-c411) );
+    const double f1c11 = ( c211 - c1*(c111-c311) - c2*(c011-c411) );
+    const double f0c21 = ( c221 + c1*(c121-c321) + c2*(c021-c421) );
+    const double f1c21 = ( c221 - c1*(c121-c321) - c2*(c021-c421) );
+    const double f0c31 = ( c231 + c1*(c131-c331) + c2*(c031-c431) );
+    const double f1c31 = ( c231 - c1*(c131-c331) - c2*(c031-c431) );
+    const double f0c41 = ( c241 + c1*(c141-c341) + c2*(c041-c441) );
+    const double f1c41 = ( c241 - c1*(c141-c341) - c2*(c041-c441) );
+
+    const double f0c02 = ( c202 + c1*(c102-c302) + c2*(c002-c402) );
+    const double f1c02 = ( c202 - c1*(c102-c302) - c2*(c002-c402) );
+    const double f0c12 = ( c212 + c1*(c112-c312) + c2*(c012-c412) );
+    const double f1c12 = ( c212 - c1*(c112-c312) - c2*(c012-c412) );
+    const double f0c22 = ( c222 + c1*(c122-c322) + c2*(c022-c422) );
+    const double f1c22 = ( c222 - c1*(c122-c322) - c2*(c022-c422) );
+    const double f0c32 = ( c232 + c1*(c132-c332) + c2*(c032-c432) );
+    const double f1c32 = ( c232 - c1*(c132-c332) - c2*(c032-c432) );
+    const double f0c42 = ( c242 + c1*(c142-c342) + c2*(c042-c442) );
+    const double f1c42 = ( c242 - c1*(c142-c342) - c2*(c042-c442) );
+
+    const double f0c03 = ( c203 + c1*(c103-c303) + c2*(c003-c403) );
+    const double f1c03 = ( c203 - c1*(c103-c303) - c2*(c003-c403) );
+    const double f0c13 = ( c213 + c1*(c113-c313) + c2*(c013-c413) );
+    const double f1c13 = ( c213 - c1*(c113-c313) - c2*(c013-c413) );
+    const double f0c23 = ( c223 + c1*(c123-c323) + c2*(c023-c423) );
+    const double f1c23 = ( c223 - c1*(c123-c323) - c2*(c023-c423) );
+    const double f0c33 = ( c233 + c1*(c133-c333) + c2*(c033-c433) );
+    const double f1c33 = ( c233 - c1*(c133-c333) - c2*(c033-c433) );
+    const double f0c43 = ( c243 + c1*(c143-c343) + c2*(c043-c443) );
+    const double f1c43 = ( c243 - c1*(c143-c343) - c2*(c043-c443) );
+
+    const double f0c04 = ( c204 + c1*(c104-c304) + c2*(c004-c404) );
+    const double f1c04 = ( c204 - c1*(c104-c304) - c2*(c004-c404) );
+    const double f0c14 = ( c214 + c1*(c114-c314) + c2*(c014-c414) );
+    const double f1c14 = ( c214 - c1*(c114-c314) - c2*(c014-c414) );
+    const double f0c24 = ( c224 + c1*(c124-c324) + c2*(c024-c424) );
+    const double f1c24 = ( c224 - c1*(c124-c324) - c2*(c024-c424) );
+    const double f0c34 = ( c234 + c1*(c134-c334) + c2*(c034-c434) );
+    const double f1c34 = ( c234 - c1*(c134-c334) - c2*(c034-c434) );
+    const double f0c44 = ( c244 + c1*(c144-c344) + c2*(c044-c444) );
+    const double f1c44 = ( c244 - c1*(c144-c344) - c2*(c044-c444) );
+
+    // interpolate in j to create fine ij / coarse k points...
+    const double f00c0 = (f0c20 + c1*(f0c10-f0c30) + c2*(f0c00-f0c40) );
+    const double f10c0 = (f1c20 + c1*(f1c10-f1c30) + c2*(f1c00-f1c40) );
+    const double f01c0 = (f0c20 - c1*(f0c10-f0c30) - c2*(f0c00-f0c40) );
+    const double f11c0 = (f1c20 - c1*(f1c10-f1c30) - c2*(f1c00-f1c40) );
+
+    const double f00c1 = (f0c21 + c1*(f0c11-f0c31) + c2*(f0c01-f0c41) );
+    const double f10c1 = (f1c21 + c1*(f1c11-f1c31) + c2*(f1c01-f1c41) );
+    const double f01c1 = (f0c21 - c1*(f0c11-f0c31) - c2*(f0c01-f0c41) );
+    const double f11c1 = (f1c21 - c1*(f1c11-f1c31) - c2*(f1c01-f1c41) );
+
+    const double f00c2 = (f0c22 + c1*(f0c12-f0c32) + c2*(f0c02-f0c42) );
+    const double f10c2 = (f1c22 + c1*(f1c12-f1c32) + c2*(f1c02-f1c42) );
+    const double f01c2 = (f0c22 - c1*(f0c12-f0c32) - c2*(f0c02-f0c42) );
+    const double f11c2 = (f1c22 - c1*(f1c12-f1c32) - c2*(f1c02-f1c42) );
+
+    const double f00c3 = (f0c23 + c1*(f0c13-f0c33) + c2*(f0c03-f0c43) );
+    const double f10c3 = (f1c23 + c1*(f1c13-f1c33) + c2*(f1c03-f1c43) );
+    const double f01c3 = (f0c23 - c1*(f0c13-f0c33) - c2*(f0c03-f0c43) );
+    const double f11c3 = (f1c23 - c1*(f1c13-f1c33) - c2*(f1c03-f1c43) );
+
+    const double f00c4 = (f0c24 + c1*(f0c14-f0c34) + c2*(f0c04-f0c44) );
+    const double f10c4 = (f1c24 + c1*(f1c14-f1c34) + c2*(f1c04-f1c44) );
+    const double f01c4 = (f0c24 - c1*(f0c14-f0c34) - c2*(f0c04-f0c44) );
+    const double f11c4 = (f1c24 - c1*(f1c14-f1c34) - c2*(f1c04-f1c44) );
+
+    // interpolate in k to create fine ijk points...
+    const double f000 = (f00c2 + c1*(f00c1-f00c3) + c2*(f00c0-f00c4) );
+    const double f100 = (f10c2 + c1*(f10c1-f10c3) + c2*(f10c0-f10c4) );
+    const double f010 = (f01c2 + c1*(f01c1-f01c3) + c2*(f01c0-f01c4) );
+    const double f110 = (f11c2 + c1*(f11c1-f11c3) + c2*(f11c0-f11c4) );
+    const double f001 = (f00c2 - c1*(f00c1-f00c3) - c2*(f00c0-f00c4) );
+    const double f101 = (f10c2 - c1*(f10c1-f10c3) - c2*(f10c0-f10c4) );
+    const double f011 = (f01c2 - c1*(f01c1-f01c3) - c2*(f01c0-f01c4) );
+    const double f111 = (f11c2 - c1*(f11c1-f11c3) - c2*(f11c0-f11c4) );
+
+    // commit to memory...
+    #if 0 // compiler cannot infer/speculate write[ijk+write_jStride] is disjoint from write[ijk], and thus cannot vectorize...
+    write[write_ijk                              ] = prescale_f*write[write_ijk                              ] + f000;
+    write[write_ijk+1                            ] = prescale_f*write[write_ijk+1                            ] + f100;
+    write[write_ijk  +write_jStride              ] = prescale_f*write[write_ijk  +write_jStride              ] + f010;
+    write[write_ijk+1+write_jStride              ] = prescale_f*write[write_ijk+1+write_jStride              ] + f110;
+    write[write_ijk                +write_kStride] = prescale_f*write[write_ijk                +write_kStride] + f001;
+    write[write_ijk+1              +write_kStride] = prescale_f*write[write_ijk+1              +write_kStride] + f101;
+    write[write_ijk  +write_jStride+write_kStride] = prescale_f*write[write_ijk  +write_jStride+write_kStride] + f011;
+    write[write_ijk+1+write_jStride+write_kStride] = prescale_f*write[write_ijk+1+write_jStride+write_kStride] + f111;
+    #else // use a unique restrict pointer for each pencil...
+    write00[i  ] = prescale_f*write00[i  ] + f000;
+    write00[i+1] = prescale_f*write00[i+1] + f100;
+    write10[i  ] = prescale_f*write10[i  ] + f010;
+    write10[i+1] = prescale_f*write10[i+1] + f110;
+    write01[i  ] = prescale_f*write01[i  ] + f001;
+    write01[i+1] = prescale_f*write01[i+1] + f101;
+    write11[i  ] = prescale_f*write11[i  ] + f011;
+    write11[i+1] = prescale_f*write11[i+1] + f111;
+    #endif
+
+  }}}
+  #endif
+
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// performs an (inter-level) volumetric quartic interpolation of vector id_c on the coarse level and adds the result to prescale_f*(vector id_f) on the fine level
+// i.e. id_f = prescale_f*id_f + P*id_c
+// prescale_f is nominally 1.0 or 0.0
+// quartic interpolation requires a full ghost zone exchange and boundary condition
+// This is a rather bulk synchronous implementation which packs all MPI buffers before initiating any sends
+// Similarly, it waits for all remote data before copying any into local boxes.
+// It does however attempt to overlap local interpolation with MPI
+void interpolation_v4(level_type * level_f, int id_f, double prescale_f, level_type *level_c, int id_c){
+    exchange_boundary(level_c,id_c,STENCIL_SHAPE_BOX); // ghost zone exchange on coarse vector id_c (box-shaped stencil)
+         apply_BCs_v4(level_c,id_c,STENCIL_SHAPE_BOX); // boundary condition on coarse vector id_c
+  // note, the clock for timers.interpolation_total starts here, i.e. after the exchange/BC calls above
+  double _timeCommunicationStart = getTime();
+  double _timeStart,_timeEnd;
+  int buffer=0; // index into the block lists (passed by name to PRAGMA_THREAD_ACROSS_BLOCKS)
+  int n;        // index into the MPI message lists
+  int my_tag = (level_f->tag<<4) | 0x7; // MPI tag = fine level's tag shifted left 4 bits, low bits set to 0x7 (presumably an id for this operation; confirm against the other communication routines)
+
+  // the USE_MPI sections are compiled out of a non-MPI build; of the three block lists only blocks[1] (local) is then processed
+  #ifdef USE_MPI
+  // by convention, level_f allocates a combined array of requests for both level_f recvs and level_c sends...
+  int nMessages = level_c->interpolation.num_sends + level_f->interpolation.num_recvs;
+  MPI_Request *recv_requests = level_f->interpolation.requests;                                    // receives use the first num_recvs slots
+  MPI_Request *send_requests = level_f->interpolation.requests + level_f->interpolation.num_recvs; // sends use the slots that follow
+
+
+  // loop through packed list of MPI receives and prepost Irecv's...
+  if(level_f->interpolation.num_recvs>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level_f->interpolation.num_recvs;n++){
+      MPI_Irecv(level_f->interpolation.recv_buffers[n],
+                level_f->interpolation.recv_sizes[n],
+                MPI_DOUBLE,
+                level_f->interpolation.recv_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &recv_requests[n]
+      );
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_recv += (_timeEnd-_timeStart);
+  }
+
+
+  // pack MPI send buffers... the coarse level's blocks[0] list is run through interpolation_v4_block, so the interpolation happens while packing
+  if(level_c->interpolation.num_blocks[0]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[0])
+    for(buffer=0;buffer<level_c->interpolation.num_blocks[0];buffer++){
+      // !!! prescale==0 because you don't want to increment the MPI buffer
+      interpolation_v4_block(level_f,id_f,0.0,level_c,id_c,&level_c->interpolation.blocks[0][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_pack += (_timeEnd-_timeStart); // note, all interpolation timers are accumulated on level_f, including the coarse-side pack/send
+  }
+
+
+  // loop through MPI send buffers and post Isend's...
+  if(level_c->interpolation.num_sends>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level_c->interpolation.num_sends;n++){
+      MPI_Isend(level_c->interpolation.send_buffers[n],
+                level_c->interpolation.send_sizes[n],
+                MPI_DOUBLE,
+                level_c->interpolation.send_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &send_requests[n]
+      );
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_send += (_timeEnd-_timeStart);
+  }
+  #endif
+
+
+  // perform local interpolation (coarse level's blocks[1] list, using the caller's prescale_f)... try and hide within Isend latency...
+  if(level_c->interpolation.num_blocks[1]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->interpolation.num_blocks[1])
+    for(buffer=0;buffer<level_c->interpolation.num_blocks[1];buffer++){
+      interpolation_v4_block(level_f,id_f,prescale_f,level_c,id_c,&level_c->interpolation.blocks[1][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_local += (_timeEnd-_timeStart);
+  }
+
+
+  // wait for MPI to finish... a single Waitall covers the combined request array (receives followed by sends)
+  #ifdef USE_MPI 
+  if(nMessages>0){
+    _timeStart = getTime();
+    MPI_Waitall(nMessages,level_f->interpolation.requests,level_f->interpolation.status);
+    _timeEnd = getTime();
+    level_f->timers.interpolation_wait += (_timeEnd-_timeStart);
+  }
+
+
+  // unpack MPI receive buffers (fine level's blocks[2] list); prescale_f is handed to IncrementBlock here (presumably id_f = prescale_f*id_f + received value; IncrementBlock is defined elsewhere, confirm)
+  if(level_f->interpolation.num_blocks[2]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->interpolation.num_blocks[2])
+    for(buffer=0;buffer<level_f->interpolation.num_blocks[2];buffer++){
+      IncrementBlock(level_f,id_f,prescale_f,&level_f->interpolation.blocks[2][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.interpolation_unpack += (_timeEnd-_timeStart);
+  }
+  #endif 
+ 
+ 
+  level_f->timers.interpolation_total += (double)(getTime()-_timeCommunicationStart);
+}
diff --git a/Util/hpgmg/finite-volume/source/operators/jacobi.c b/Util/hpgmg/finite-volume/source/operators/jacobi.c
new file mode 100644
index 00000000..30efce4a
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/jacobi.c
@@ -0,0 +1,73 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdint.h>
+//------------------------------------------------------------------------------------------------------------------------------
+void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ // NUM_SMOOTHS Jacobi sweeps on vector x_id: x_np1 = x_n + weight*lambda*(rhs - A x_n)
+  if(NUM_SMOOTHS&1){ // an odd sweep count would leave the last iterate in VECTOR_TEMP instead of x_id (see the ping-pong below)
+    fprintf(stderr,"error - NUM_SMOOTHS must be even...\n");
+    exit(1); // fatal configuration error: terminate with a non-zero (failure) status so scripts/launchers can detect it
+  }
+  // damping factor applied to the correction: 1.0 when built with USE_L1JACOBI, 2/3 otherwise
+  #ifdef USE_L1JACOBI
+  double weight = 1.0;
+  #else
+  double weight = 2.0/3.0;
+  #endif
+  // s = sweep counter, block = index into level->my_blocks
+  int block,s;
+  for(s=0;s<NUM_SMOOTHS;s++){
+    // exchange ghost zone data... Jacobi ping pongs between x_id and VECTOR_TEMP
+    if((s&1)==0){exchange_boundary(level,       x_id,stencil_get_shape());apply_BCs(level,       x_id,stencil_get_shape());}
+            else{exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());apply_BCs(level,VECTOR_TEMP,stencil_get_shape());}
+
+    // apply the smoother... Jacobi ping pongs between x_id and VECTOR_TEMP
+    double _timeStart = getTime();
+    // note, timers.smooth only covers the sweep itself; the exchange/BC calls above happen before the clock starts
+    PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks)
+    for(block=0;block<level->num_my_blocks;block++){
+      const int box = level->my_blocks[block].read.box;
+      const int ilo = level->my_blocks[block].read.i;
+      const int jlo = level->my_blocks[block].read.j;
+      const int klo = level->my_blocks[block].read.k;
+      const int ihi = level->my_blocks[block].dim.i + ilo;
+      const int jhi = level->my_blocks[block].dim.j + jlo;
+      const int khi = level->my_blocks[block].dim.k + klo;
+      int i,j,k;
+      const int ghosts = level->box_ghosts;
+      const int jStride = level->my_boxes[box].jStride;
+      const int kStride = level->my_boxes[box].kStride;
+      const double h2inv = 1.0/(level->h*level->h);
+      const double * __restrict__ rhs    = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ alpha  = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+      #ifdef USE_L1JACOBI
+      const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride);
+      #else
+      const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride);
+      #endif
+        const double * __restrict__ x_n;   // iterate read by this sweep
+              double * __restrict__ x_np1; // iterate written by this sweep
+                      if((s&1)==0){x_n   = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride);
+                                   x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride);}
+                              else{x_n   = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride);
+                                   x_np1 = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride);}
+      // NOTE(review): a, b, h2inv, alpha and beta_* are not referenced by name below; presumably the apply_op_ijk macro expands to code that uses them - confirm before renaming/removing
+      for(k=klo;k<khi;k++){
+      for(j=jlo;j<jhi;j++){
+      for(i=ilo;i<ihi;i++){
+        int ijk = i + j*jStride + k*kStride;
+        double Ax_n = apply_op_ijk(x_n);
+        x_np1[ijk] = x_n[ijk] + weight*lambda[ijk]*(rhs[ijk]-Ax_n);
+      }}}
+
+    } // block-loop
+    level->timers.smooth += (double)(getTime()-_timeStart);
+  } // s-loop
+}
+
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators/misc.c b/Util/hpgmg/finite-volume/source/operators/misc.c
new file mode 100644
index 00000000..a90b12a7
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/misc.c
@@ -0,0 +1,508 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+void zero_vector(level_type * level, int id_a){
+  // Stores 0.0 into every element of vector id_a, ghost zones INCLUDED.
+  const double t_begin = getTime();
+  int blk;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,blk,level->num_my_blocks)
+  for(blk=0;blk<level->num_my_blocks;blk++){
+    const int box_id  = level->my_blocks[blk].read.box;
+    const int jStride = level->my_boxes[box_id].jStride;
+    const int kStride = level->my_boxes[box_id].kStride;
+    const int halo    = level->my_boxes[box_id].ghosts;
+    const int box_dim = level->my_boxes[box_id].dim;
+    int ii,jj,kk;
+
+    // half-open block extent [lo,hi) in box coordinates; any side that reaches the box edge is widened by the ghost depth
+    int i_lo = level->my_blocks[blk].read.i;
+    int j_lo = level->my_blocks[blk].read.j;
+    int k_lo = level->my_blocks[blk].read.k;
+    int i_hi = level->my_blocks[blk].dim.i + i_lo;
+    int j_hi = level->my_blocks[blk].dim.j + j_lo;
+    int k_hi = level->my_blocks[blk].dim.k + k_lo;
+    if(i_lo<=0      ){i_lo-=halo;}
+    if(j_lo<=0      ){j_lo-=halo;}
+    if(k_lo<=0      ){k_lo-=halo;}
+    if(i_hi>=box_dim){i_hi+=halo;}
+    if(j_hi>=box_dim){j_hi+=halo;}
+    if(k_hi>=box_dim){k_hi+=halo;}
+
+    // the pointer is shifted past the ghost layers, so negative indices address ghost cells
+    double * __restrict__ cells = level->my_boxes[box_id].vectors[id_a] + halo*(1+jStride+kStride);
+
+    for(kk=k_lo;kk<k_hi;kk++){
+    for(jj=j_lo;jj<j_hi;jj++){
+      const int jk = jj*jStride + kk*kStride;
+      for(ii=i_lo;ii<i_hi;ii++){cells[ii + jk] = 0.0;}
+    }}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void init_vector(level_type * level, int id_a, double scalar){
+  // Fills the interior of vector id_a with 'scalar' and writes 0.0 into its ghost zones.
+  const double t_begin = getTime();
+  int blk;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,blk,level->num_my_blocks)
+  for(blk=0;blk<level->num_my_blocks;blk++){
+    const int box_id  = level->my_blocks[blk].read.box;
+    const int jStride = level->my_boxes[box_id].jStride;
+    const int kStride = level->my_boxes[box_id].kStride;
+    const int halo    = level->my_boxes[box_id].ghosts;
+    const int box_dim = level->my_boxes[box_id].dim;
+    int ii,jj,kk;
+
+    // half-open block extent [lo,hi) in box coordinates; any side that reaches the box edge is widened by the ghost depth
+    int i_lo = level->my_blocks[blk].read.i;
+    int j_lo = level->my_blocks[blk].read.j;
+    int k_lo = level->my_blocks[blk].read.k;
+    int i_hi = level->my_blocks[blk].dim.i + i_lo;
+    int j_hi = level->my_blocks[blk].dim.j + j_lo;
+    int k_hi = level->my_blocks[blk].dim.k + k_lo;
+    if(i_lo<=0      ){i_lo-=halo;}
+    if(j_lo<=0      ){j_lo-=halo;}
+    if(k_lo<=0      ){k_lo-=halo;}
+    if(i_hi>=box_dim){i_hi+=halo;}
+    if(j_hi>=box_dim){j_hi+=halo;}
+    if(k_hi>=box_dim){k_hi+=halo;}
+
+    // the pointer is shifted past the ghost layers, so negative indices address ghost cells
+    double * __restrict__ cells = level->my_boxes[box_id].vectors[id_a] + halo*(1+jStride+kStride);
+
+    for(kk=k_lo;kk<k_hi;kk++){
+    for(jj=j_lo;jj<j_hi;jj++){
+      const int jk          = jj*jStride + kk*kStride;
+      const int jk_interior = (jj>=0) && (kk>=0) && (jj<box_dim) && (kk<box_dim); // a cell is interior only if all three coordinates lie in [0,box_dim)
+      for(ii=i_lo;ii<i_hi;ii++){cells[ii + jk] = (jk_interior && (ii>=0) && (ii<box_dim)) ? scalar : 0.0;}
+    }}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// add vectors id_a (scaled by scale_a) and id_b (scaled by scale_b) and store the result in vector id_c
+// i.e. c[] = scale_a*a[] + scale_b*b[]
+// note, only non ghost zone values are included in this calculation
+void add_vectors(level_type * level, int id_c, double scale_a, int id_a, double scale_b, int id_b){
+  // c[] = scale_a*a[] + scale_b*b[], evaluated over the extent of each block in level->my_blocks
+  const double t_begin = getTime();
+  int blk;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,blk,level->num_my_blocks)
+  for(blk=0;blk<level->num_my_blocks;blk++){
+    const int box_id  = level->my_blocks[blk].read.box;
+    const int jStride = level->my_boxes[box_id].jStride;
+    const int kStride = level->my_boxes[box_id].kStride;
+    const int halo    = level->my_boxes[box_id].ghosts;
+    const int i_lo = level->my_blocks[blk].read.i;
+    const int j_lo = level->my_blocks[blk].read.j;
+    const int k_lo = level->my_blocks[blk].read.k;
+    const int i_hi = level->my_blocks[blk].dim.i + i_lo;
+    const int j_hi = level->my_blocks[blk].dim.j + j_lo;
+    const int k_hi = level->my_blocks[blk].dim.k + k_lo;
+    int ii,jj,kk;
+    // every pointer is shifted past the ghost layers of its vector
+          double * __restrict__ dst   = level->my_boxes[box_id].vectors[id_c] + halo*(1+jStride+kStride);
+    const double * __restrict__ src_a = level->my_boxes[box_id].vectors[id_a] + halo*(1+jStride+kStride);
+    const double * __restrict__ src_b = level->my_boxes[box_id].vectors[id_b] + halo*(1+jStride+kStride);
+
+    for(kk=k_lo;kk<k_hi;kk++){
+    for(jj=j_lo;jj<j_hi;jj++){
+      const int jk = jj*jStride + kk*kStride;
+      for(ii=i_lo;ii<i_hi;ii++){dst[ii + jk] = scale_a*src_a[ii + jk] + scale_b*src_b[ii + jk];}
+    }}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// multiply each element of vector id_a by vector id_b and scale, and place the result in vector id_c
+// i.e. c[]=scale*a[]*b[]
+// note, only non ghost zone values are included in this calculation
+void mul_vectors(level_type * level, int id_c, double scale, int id_a, int id_b){
+  // c[] = scale*a[]*b[] (element-wise product), evaluated over the extent of each block in level->my_blocks
+  const double t_begin = getTime();
+  int blk;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,blk,level->num_my_blocks)
+  for(blk=0;blk<level->num_my_blocks;blk++){
+    const int box_id  = level->my_blocks[blk].read.box;
+    const int jStride = level->my_boxes[box_id].jStride;
+    const int kStride = level->my_boxes[box_id].kStride;
+    const int halo    = level->my_boxes[box_id].ghosts;
+    const int i_lo = level->my_blocks[blk].read.i;
+    const int j_lo = level->my_blocks[blk].read.j;
+    const int k_lo = level->my_blocks[blk].read.k;
+    const int i_hi = level->my_blocks[blk].dim.i + i_lo;
+    const int j_hi = level->my_blocks[blk].dim.j + j_lo;
+    const int k_hi = level->my_blocks[blk].dim.k + k_lo;
+    int ii,jj,kk;
+    // every pointer is shifted past the ghost layers of its vector
+          double * __restrict__ dst   = level->my_boxes[box_id].vectors[id_c] + halo*(1+jStride+kStride);
+    const double * __restrict__ src_a = level->my_boxes[box_id].vectors[id_a] + halo*(1+jStride+kStride);
+    const double * __restrict__ src_b = level->my_boxes[box_id].vectors[id_b] + halo*(1+jStride+kStride);
+
+    for(kk=k_lo;kk<k_hi;kk++){
+    for(jj=j_lo;jj<j_hi;jj++){
+      const int jk = jj*jStride + kk*kStride;
+      for(ii=i_lo;ii<i_hi;ii++){dst[ii + jk] = scale*src_a[ii + jk]*src_b[ii + jk];}
+    }}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// invert each element of vector id_a, scale by scale_a, and place the result in vector id_c
+// i.e. c[]=scale_a/a[]
+// note, only non ghost zone values are included in this calculation
+void invert_vector(level_type * level, int id_c, double scale_a, int id_a){
+  // c[] = scale_a/a[] (element-wise), evaluated over the extent of each block in level->my_blocks; no guard against a[]==0
+  const double t_begin = getTime();
+  int blk;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,blk,level->num_my_blocks)
+  for(blk=0;blk<level->num_my_blocks;blk++){
+    const int box_id  = level->my_blocks[blk].read.box;
+    const int jStride = level->my_boxes[box_id].jStride;
+    const int kStride = level->my_boxes[box_id].kStride;
+    const int halo    = level->my_boxes[box_id].ghosts;
+    const int i_lo = level->my_blocks[blk].read.i;
+    const int j_lo = level->my_blocks[blk].read.j;
+    const int k_lo = level->my_blocks[blk].read.k;
+    const int i_hi = level->my_blocks[blk].dim.i + i_lo;
+    const int j_hi = level->my_blocks[blk].dim.j + j_lo;
+    const int k_hi = level->my_blocks[blk].dim.k + k_lo;
+    int ii,jj,kk;
+    // both pointers are shifted past the ghost layers of their vector
+          double * __restrict__ dst = level->my_boxes[box_id].vectors[id_c] + halo*(1+jStride+kStride);
+    const double * __restrict__ src = level->my_boxes[box_id].vectors[id_a] + halo*(1+jStride+kStride);
+
+    for(kk=k_lo;kk<k_hi;kk++){
+    for(jj=j_lo;jj<j_hi;jj++){
+      const int jk = jj*jStride + kk*kStride;
+      for(ii=i_lo;ii<i_hi;ii++){dst[ii + jk] = scale_a/src[ii + jk];}
+    }}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// scale vector id_a by scale_a and place the result in vector id_c
+// i.e. c[]=scale_a*a[]
+// note, only non ghost zone values are included in this calculation
+void scale_vector(level_type * level, int id_c, double scale_a, int id_a){
+  // c[] = scale_a*a[], evaluated over the extent of each block in level->my_blocks
+  const double t_begin = getTime();
+  int blk;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,blk,level->num_my_blocks)
+  for(blk=0;blk<level->num_my_blocks;blk++){
+    const int box_id  = level->my_blocks[blk].read.box;
+    const int jStride = level->my_boxes[box_id].jStride;
+    const int kStride = level->my_boxes[box_id].kStride;
+    const int halo    = level->my_boxes[box_id].ghosts;
+    const int i_lo = level->my_blocks[blk].read.i;
+    const int j_lo = level->my_blocks[blk].read.j;
+    const int k_lo = level->my_blocks[blk].read.k;
+    const int i_hi = level->my_blocks[blk].dim.i + i_lo;
+    const int j_hi = level->my_blocks[blk].dim.j + j_lo;
+    const int k_hi = level->my_blocks[blk].dim.k + k_lo;
+    int ii,jj,kk;
+    // both pointers are shifted past the ghost layers of their vector
+          double * __restrict__ dst = level->my_boxes[box_id].vectors[id_c] + halo*(1+jStride+kStride);
+    const double * __restrict__ src = level->my_boxes[box_id].vectors[id_a] + halo*(1+jStride+kStride);
+
+    for(kk=k_lo;kk<k_hi;kk++){
+    for(jj=j_lo;jj<j_hi;jj++){
+      const int jk = jj*jStride + kk*kStride;
+      for(ii=i_lo;ii<i_hi;ii++){dst[ii + jk] = scale_a*src[ii + jk];}
+    }}
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// return the dot product of vectors id_a and id_b
+// note, only non ghost zone values are included in this calculation
+double dot(level_type * level, int id_a, int id_b){
+  // Returns sum(a[]*b[]) over the extent of every block in level->my_blocks.
+  // Each block accumulates its own partial sum (k outermost, i innermost) which is then added to the level total.
+  const double t_begin = getTime();
+  int blk;
+  double total = 0.0;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,blk,level->num_my_blocks,total)
+  for(blk=0;blk<level->num_my_blocks;blk++){
+    const int box_id  = level->my_blocks[blk].read.box;
+    const int jStride = level->my_boxes[box_id].jStride;
+    const int kStride = level->my_boxes[box_id].kStride;
+    const int halo    = level->my_boxes[box_id].ghosts;
+    const int i_lo = level->my_blocks[blk].read.i;
+    const int j_lo = level->my_blocks[blk].read.j;
+    const int k_lo = level->my_blocks[blk].read.k;
+    const int i_hi = level->my_blocks[blk].dim.i + i_lo;
+    const int j_hi = level->my_blocks[blk].dim.j + j_lo;
+    const int k_hi = level->my_blocks[blk].dim.k + k_lo;
+    int ii,jj,kk;
+    const double * __restrict__ va = level->my_boxes[box_id].vectors[id_a] + halo*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+    const double * __restrict__ vb = level->my_boxes[box_id].vectors[id_b] + halo*(1+jStride+kStride);
+    double partial = 0.0;
+
+    for(kk=k_lo;kk<k_hi;kk++){
+    for(jj=j_lo;jj<j_hi;jj++){
+      const int jk = jj*jStride + kk*kStride;
+      for(ii=i_lo;ii<i_hi;ii++){partial += va[ii + jk]*vb[ii + jk];}
+    }}
+    total += partial;
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+  // with MPI, sum the local totals over all ranks of level->MPI_COMM_ALLREDUCE (timed separately as a collective)
+  #ifdef USE_MPI
+  double t_reduce_begin = getTime();
+  double local_total = total;
+  MPI_Allreduce(&local_total,&total,1,MPI_DOUBLE,MPI_SUM,level->MPI_COMM_ALLREDUCE);
+  double t_reduce_end = getTime();
+  level->timers.collectives   += (double)(t_reduce_end-t_reduce_begin);
+  #endif
+
+  return total;
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// return the max (infinity) norm of the vector id_a.
+// note, only non ghost zone values are included in this calculation
+double norm(level_type * level, int id_a){ // max (infinity) norm: largest fabs() value found
+  // the scan covers the extent of every block in level->my_blocks; with MPI the result is the maximum over all ranks
+  const double t_begin = getTime();
+  int blk;
+  double level_max = 0.0;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,blk,level->num_my_blocks,level_max)
+  for(blk=0;blk<level->num_my_blocks;blk++){
+    const int box_id  = level->my_blocks[blk].read.box;
+    const int jStride = level->my_boxes[box_id].jStride;
+    const int kStride = level->my_boxes[box_id].kStride;
+    const int halo    = level->my_boxes[box_id].ghosts;
+    const int i_lo = level->my_blocks[blk].read.i;
+    const int j_lo = level->my_blocks[blk].read.j;
+    const int k_lo = level->my_blocks[blk].read.k;
+    const int i_hi = level->my_blocks[blk].dim.i + i_lo;
+    const int j_hi = level->my_blocks[blk].dim.j + j_lo;
+    const int k_hi = level->my_blocks[blk].dim.k + k_lo;
+    int ii,jj,kk;
+    const double * __restrict__ cells = level->my_boxes[box_id].vectors[id_a] + halo*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+    double block_max = 0.0;
+
+    for(kk=k_lo;kk<k_hi;kk++){
+    for(jj=j_lo;jj<j_hi;jj++){
+      const int jk = jj*jStride + kk*kStride;
+      for(ii=i_lo;ii<i_hi;ii++){
+        const double magnitude = fabs(cells[ii + jk]);
+        block_max = (magnitude>block_max) ? magnitude : block_max; // a NaN compares false and is therefore skipped
+    }}}
+
+    if(block_max>level_max){level_max = block_max;}
+  } // block list
+  level->timers.blas1 += (double)(getTime()-t_begin);
+
+  #ifdef USE_MPI
+  double t_reduce_begin = getTime();
+  double local_max = level_max;
+  MPI_Allreduce(&local_max,&level_max,1,MPI_DOUBLE,MPI_MAX,level->MPI_COMM_ALLREDUCE);
+  double t_reduce_end = getTime();
+  level->timers.collectives   += (double)(t_reduce_end-t_reduce_begin);
+  #endif
+  return level_max;
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// return the mean (arithmetic average value) of vector id_a
+// essentially, this is the (signed) sum of all elements scaled by the inverse of the total (global) number of cells - note that no absolute values are taken, so it is not an l1 norm
+// note, only non ghost zone values are included in this calculation
+double mean(level_type * level, int id_a){
+  // Returns sum(a[]) / (level->dim.i*level->dim.j*level->dim.k); the values are summed as-is (signed).
+  // Each block accumulates its own partial sum (k outermost, i innermost) which is then added to the level total.
+  const double t_begin = getTime();
+  int blk;
+  double total = 0.0;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,blk,level->num_my_blocks,total)
+  for(blk=0;blk<level->num_my_blocks;blk++){
+    const int box_id  = level->my_blocks[blk].read.box;
+    const int jStride = level->my_boxes[box_id].jStride;
+    const int kStride = level->my_boxes[box_id].kStride;
+    const int halo    = level->my_boxes[box_id].ghosts;
+    const int i_lo = level->my_blocks[blk].read.i;
+    const int j_lo = level->my_blocks[blk].read.j;
+    const int k_lo = level->my_blocks[blk].read.k;
+    const int i_hi = level->my_blocks[blk].dim.i + i_lo;
+    const int j_hi = level->my_blocks[blk].dim.j + j_lo;
+    const int k_hi = level->my_blocks[blk].dim.k + k_lo;
+    int ii,jj,kk;
+    const double * __restrict__ cells = level->my_boxes[box_id].vectors[id_a] + halo*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+    double partial = 0.0;
+
+    for(kk=k_lo;kk<k_hi;kk++){
+    for(jj=j_lo;jj<j_hi;jj++){
+      const int jk = jj*jStride + kk*kStride;
+      for(ii=i_lo;ii<i_hi;ii++){partial += cells[ii + jk];}
+    }}
+    total += partial;
+  }
+  level->timers.blas1 += (double)(getTime()-t_begin);
+  const double cell_count = (double)level->dim.i*(double)level->dim.j*(double)level->dim.k;
+  // with MPI, sum the local totals over all ranks of level->MPI_COMM_ALLREDUCE before dividing
+  #ifdef USE_MPI
+  double t_reduce_begin = getTime();
+  double local_total = total;
+  MPI_Allreduce(&local_total,&total,1,MPI_DOUBLE,MPI_SUM,level->MPI_COMM_ALLREDUCE);
+  double t_reduce_end = getTime();
+  level->timers.collectives   += (double)(t_reduce_end-t_reduce_begin);
+  #endif
+
+  const double average = total / cell_count;
+  return average;
+}
+
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// shift_vector: id_c = id_a + shift_a, i.e. add the scalar shift_a to every element of vector id_a and store
+// the result in vector id_c.  Only non ghost zone values are read or written.
+void shift_vector(level_type * level, int id_c, int id_a, double shift_a){
+  double _timeStart = getTime();
+  int block;
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks)
+  for(block=0;block<level->num_my_blocks;block++){
+    const int box = level->my_blocks[block].read.box;
+    const int  i0 = level->my_blocks[block].read.i;
+    const int  j0 = level->my_blocks[block].read.j;
+    const int  k0 = level->my_blocks[block].read.k;
+    const int  i1 = level->my_blocks[block].dim.i + i0;
+    const int  j1 = level->my_blocks[block].dim.j + j0;
+    const int  k1 = level->my_blocks[block].dim.k + k0;
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const int  ghosts = level->my_boxes[box].ghosts;
+    const int  origin = ghosts*(1+jStride+kStride); // offset of the first non ghost zone point
+    double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + origin;
+    double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + origin;
+    int i,j,k;
+
+    for(k=k0;k<k1;k++){
+    for(j=j0;j<j1;j++){
+      const int row = j*jStride + k*kStride; // start of the current run of i values
+      for(i=i0;i<i1;i++){
+        grid_c[i+row] = grid_a[i+row] + shift_a;
+      }
+    }}
+  }
+  level->timers.blas1 += (double)(getTime()-_timeStart);
+}
+
+//------------------------------------------------------------------------------------------------------------------------------
+// error: max (infinity) norm of the difference between vectors id_a and id_b (non ghost zone values only).
+// Side effect: VECTOR_TEMP is overwritten with id_a - id_b.
+// The h^3-normalized L2 alternative sat after the return and could never execute; it is kept here for reference:
+//   h3 = level->h^3;  return( sqrt( dot(level,VECTOR_TEMP,VECTOR_TEMP)*h3 ) );
+double error(level_type * level, int id_a, int id_b){
+  add_vectors(level,VECTOR_TEMP,1.0,id_a,-1.0,id_b); // VECTOR_TEMP = id_a - id_b
+  return( norm(level,VECTOR_TEMP) );                 // max norm of error function
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// color_vector: fill the non ghost zone cells of vector id_a with 1's and 0's.
+// A cell is set to 1.0 only when, in each of the three dimensions, (cell index + box low index + color) is an
+// exact multiple of colors_in_each_dim; every other cell is set to 0.0.
+//   - (icolor,jcolor,kcolor) select the 'active' color in i, j and k
+//   - colors_in_each_dim==1 (don't color) sets all cells to 1.0
+//   - e.g. colors_in_each_dim=3, icolor=1 marks the cells whose (i + box low i) is 2, 5, 8, ...
+// The elapsed time is charged to level->timers.blas1.
+void color_vector(level_type * level, int id_a, int colors_in_each_dim, int icolor, int jcolor, int kcolor){
+  double _timeStart = getTime();
+  int block;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks)
+  for(block=0;block<level->num_my_blocks;block++){
+    const int box = level->my_blocks[block].read.box;
+    const int  i0 = level->my_blocks[block].read.i;
+    const int  j0 = level->my_blocks[block].read.j;
+    const int  k0 = level->my_blocks[block].read.k;
+    const int  i1 = level->my_blocks[block].dim.i + i0;
+    const int  j1 = level->my_blocks[block].dim.j + j0;
+    const int  k1 = level->my_blocks[block].dim.k + k0;
+    const int lowi = level->my_boxes[box].low.i;
+    const int lowj = level->my_boxes[box].low.j;
+    const int lowk = level->my_boxes[box].low.k;
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const int  ghosts = level->my_boxes[box].ghosts;
+    double * __restrict__ grid = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // [0] = first non ghost zone point
+    int i,j,k;
+
+    // each factor is 1.0 on the active color and 0.0 otherwise
+    for(k=k0;k<k1;k++){
+      const double sk = ( ((k+lowk+kcolor)%colors_in_each_dim) == 0 ) ? 1.0 : 0.0;
+      for(j=j0;j<j1;j++){
+        const double sj = ( ((j+lowj+jcolor)%colors_in_each_dim) == 0 ) ? 1.0 : 0.0;
+        for(i=i0;i<i1;i++){
+          const double si = ( ((i+lowi+icolor)%colors_in_each_dim) == 0 ) ? 1.0 : 0.0;
+          grid[i + j*jStride + k*kStride] = si*sj*sk;
+        }
+      }
+    }
+  }
+  level->timers.blas1 += (double)(getTime()-_timeStart);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// random_vector: initialize each non ghost zone element of vector id_a with a "random" value.
+// For simplicity, random is defined as -1.0 or +1.0, chosen from the parity of the cell's box-local (i,j,k)
+// indices.  Only the low bit of i^j^k^0x1 is used; unmasked, the xor is an arbitrary integer, not just 0 or 1.
+void random_vector(level_type * level, int id_a){
+  double _timeStart = getTime();
+  int block;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks)
+  for(block=0;block<level->num_my_blocks;block++){
+    const int box = level->my_blocks[block].read.box;
+    const int ilo = level->my_blocks[block].read.i;
+    const int jlo = level->my_blocks[block].read.j;
+    const int klo = level->my_blocks[block].read.k;
+    const int ihi = level->my_blocks[block].dim.i + ilo;
+    const int jhi = level->my_blocks[block].dim.j + jlo;
+    const int khi = level->my_blocks[block].dim.k + klo;
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const int  ghosts = level->my_boxes[box].ghosts;
+    double * __restrict__ grid = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+    int i,j,k;
+
+    for(k=klo;k<khi;k++){
+    for(j=jlo;j<jhi;j++){
+    for(i=ilo;i<ihi;i++){
+      int ijk = i + j*jStride + k*kStride;
+      grid[ijk] = -1.000 + 2.0*((i^j^k^0x1)&0x1); // low bit only -> exactly -1.0 or +1.0
+    }}}
+  }
+  level->timers.blas1 += (double)(getTime()-_timeStart);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators/problem.fv.c b/Util/hpgmg/finite-volume/source/operators/problem.fv.c
new file mode 100644
index 00000000..e6ea7481
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/problem.fv.c
@@ -0,0 +1,139 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#ifndef M_PI
+#define M_PI 3.14159265358979323846 // in case math.h doesn't define it
+#endif
+// evaluateBeta: coefficient B(x,y,z) = 1 + b*sin(a*x)*sin(a*y)*sin(a*z) with b=0.25 and a=2*pi (one period on [0,1]^3).
+// h is the cell spacing; each nonzero add_Bxx/add_Byy/add_Bzz flag adds (h*h/24) times the matching second
+// derivative, a 4th order correction to approximate the conversion of cell-centered values to cell-averaged.
+double evaluateBeta(double x, double y, double z, double h, int add_Bxx, int add_Byy, int add_Bzz){
+  const double amplitude = 0.25;
+  const double omega     = 2.0*M_PI;
+  const double sx = sin(omega*x);
+  const double sy = sin(omega*y);
+  const double sz = sin(omega*z);
+
+  double B = 1.0 + amplitude*sx*sy*sz;
+  const double d2    = -omega*omega*amplitude*sx*sy*sz; // Bxx, Byy and Bzz are all this same expression
+  const double scale = h*h/24.0;
+  if(add_Bxx)B+=scale*d2;
+  if(add_Byy)B+=scale*d2;
+  if(add_Bzz)B+=scale*d2;
+  return(B);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// evaluateF: right-hand side F(x,y,z).  The active (#else) branch is sin(a*x)^p * sin(a*y)^p * sin(a*z)^p with
+// a=2*pi and p=7; the '#if 0' branch is a disabled alternative.  h is the cell spacing; each nonzero
+// add_Fxx/add_Fyy/add_Fzz flag adds (h*h/24) times the matching second derivative of F.
+double evaluateF(double x, double y, double z, double h, int add_Fxx, int add_Fyy, int add_Fzz){
+  #if 0 // harder problem... not sure I manually differentiated this right...
+  // 8 'poles', one per octant
+  double    cx = 0.75;
+  double    cy = 0.75;
+  double    cz = 0.75,sign = 1.0;
+  if(x<0.5){cx = 0.25;sign*=-1.0;}
+  if(y<0.5){cy = 0.25;sign*=-1.0;}
+  if(z<0.5){cz = 0.25;sign*=-1.0;}
+
+  double r0  = 0.1;
+  double a   = M_PI/2/r0;
+  double r   =  pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) ,  0.5); // euclidean distance
+  double rx  =  pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -0.5)*(x-cx); // dr/dx
+  double ry  =  pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -0.5)*(y-cy);
+  double rz  =  pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -0.5)*(z-cz);
+  double rxx = -pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -1.5)*(x-cx)*(x-cx) + pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -0.5); // d2r/dx2
+  double ryy = -pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -1.5)*(y-cy)*(y-cy) + pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -0.5);
+  double rzz = -pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -1.5)*(z-cz)*(z-cz) + pow( (x-cx)*(x-cx) + (y-cy)*(y-cy) + (z-cz)*(z-cz) , -0.5);
+
+  double p   = 6.0;
+  double F   = sign*(        pow(cos(a*r),p  )    );
+  double Fx  = sign*(   -a*p*pow(cos(a*r),p-1)*sin(a*r)*rx );
+  double Fy  = sign*(   -a*p*pow(cos(a*r),p-1)*sin(a*r)*ry );
+  double Fz  = sign*(   -a*p*pow(cos(a*r),p-1)*sin(a*r)*rz );
+  double Fxx = sign*( -a*a*p*pow(cos(a*r),p  )*rx*rx  +  a*a*p*(p-1)*pow(cos(a*r),p-2)*pow(sin(a*r),2)*rx*rx  -  a*p*pow(cos(a*r),p-1)*sin(a*r)*rxx );
+  double Fyy = sign*( -a*a*p*pow(cos(a*r),p  )*ry*ry  +  a*a*p*(p-1)*pow(cos(a*r),p-2)*pow(sin(a*r),2)*ry*ry  -  a*p*pow(cos(a*r),p-1)*sin(a*r)*ryy );
+  double Fzz = sign*( -a*a*p*pow(cos(a*r),p  )*rz*rz  +  a*a*p*(p-1)*pow(cos(a*r),p-2)*pow(sin(a*r),2)*rz*rz  -  a*p*pow(cos(a*r),p-1)*sin(a*r)*rzz );
+
+  if(r>=r0){
+    F   = 0.0;
+    Fx  = 0.0;
+    Fy  = 0.0;
+    Fz  = 0.0;
+    Fxx = 0.0;
+    Fyy = 0.0;
+    Fzz = 0.0;
+  }
+  #else
+  const double a = 2.0*M_PI;
+  const double p = 7.0;
+  const double sx = sin(a*x), sy = sin(a*y), sz = sin(a*z);    // per-axis sines
+  const double Sx = pow(sx,p), Sy = pow(sy,p), Sz = pow(sz,p); // per-axis factors of F
+  double F   = Sx*Sy*Sz;
+  double Fxx = -a*a*p*Sx*Sy*Sz  +  a*a*p*(p-1)*pow(sx,p-2)*Sy*Sz*pow(cos(a*x),2);
+  double Fyy = -a*a*p*Sx*Sy*Sz  +  a*a*p*(p-1)*Sx*pow(sy,p-2)*Sz*pow(cos(a*y),2);
+  double Fzz = -a*a*p*Sx*Sy*Sz  +  a*a*p*(p-1)*Sx*Sy*pow(sz,p-2)*pow(cos(a*z),2);
+  #endif
+  const double scale = h*h/24.0; // 4th order correction to approximate the conversion of cell-centered values to cell-averaged...
+  if(add_Fxx)F+=scale*Fxx;
+  if(add_Fyy)F+=scale*Fyy;
+  if(add_Fzz)F+=scale*Fzz;
+  return(F);
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// initialize_problem: store the grid spacing in level->h, then fill VECTOR_ALPHA, VECTOR_BETA_I/J/K and
+// VECTOR_F in every box owned by this process.  Alpha is 1.0; the betas are 1.0 unless
+// STENCIL_VARIABLE_COEFFICIENT is defined; F is taken from evaluateF.  Parameters a and b are not used here.
+void initialize_problem(level_type * level, double hLevel, double a, double b){
+  level->h = hLevel;
+
+  int box;
+  for(box=0;box<level->num_my_boxes;box++){
+    int i,j,k;
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const int  ghosts = level->my_boxes[box].ghosts;
+    const int     dim = level->my_boxes[box].dim; // one extent, applied in i, j and k
+    const int    lowi = level->my_boxes[box].low.i;
+    const int    lowj = level->my_boxes[box].low.j;
+    const int    lowk = level->my_boxes[box].low.k;
+
+    #ifdef _OPENMP
+    #pragma omp parallel for private(k,j,i) collapse(3)
+    #endif
+    for(k=0;k<=dim;k++){ // <= : include high face
+    for(j=0;j<=dim;j++){ // <= : include high face
+    for(i=0;i<=dim;i++){ // <= : include high face
+      const int ijk = (i+ghosts) + (j+ghosts)*jStride + (k+ghosts)*kStride;
+      // cell-centered coordinates: +0.5 to get to the center of cell
+      const double x = hLevel*( (double)(i+lowi) + 0.5 );
+      const double y = hLevel*( (double)(j+lowj) + 0.5 );
+      const double z = hLevel*( (double)(k+lowk) + 0.5 );
+
+      // constant coefficient defaults
+      const double A = 1.0;
+      double Bi = 1.0;
+      double Bj = 1.0;
+      double Bk = 1.0;
+      #ifdef STENCIL_VARIABLE_COEFFICIENT // variable coefficient problem...
+      // face-centered values of Beta: half a cell back along the face normal, no correction in that direction
+      Bi=evaluateBeta(x-hLevel*0.5,y           ,z           ,hLevel,0,1,1);
+      Bj=evaluateBeta(x           ,y-hLevel*0.5,z           ,hLevel,1,0,1);
+      Bk=evaluateBeta(x           ,y           ,z-hLevel*0.5,hLevel,1,1,0);
+      #endif
+      const double F = evaluateF(x,y,z,hLevel,1,1,1);
+      level->my_boxes[box].vectors[VECTOR_ALPHA ][ijk] = A;
+      level->my_boxes[box].vectors[VECTOR_BETA_I][ijk] = Bi;
+      level->my_boxes[box].vectors[VECTOR_BETA_J][ijk] = Bj;
+      level->my_boxes[box].vectors[VECTOR_BETA_K][ijk] = Bk;
+      level->my_boxes[box].vectors[VECTOR_F     ][ijk] = F;
+    }}}
+  }
+}
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators/problem.p4.c b/Util/hpgmg/finite-volume/source/operators/problem.p4.c
new file mode 100644
index 00000000..3f74ee1f
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/problem.p4.c
@@ -0,0 +1,124 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+// evaluateBeta: coefficient beta and its gradient at the point (x,y,z).
+//   *B           = c1 + c2*tanh( c3*(r-0.25) ), with r the distance from the domain center (0.5,0.5,0.5)
+//   *Bx,*By,*Bz  = dB/dx, dB/dy, dB/dz = c2*c3*(1-tanh^2)*dr/d{x,y,z}
+// dr/dx = (x-xcenter)/r is undefined at r==0 (it used to evaluate to 0*inf = NaN); a zero gradient is returned there.
+void evaluateBeta(double x, double y, double z, double *B, double *Bx, double *By, double *Bz){
+  const double Bmin =  1.0;
+  const double Bmax = 10.0;
+  const double c2 = (Bmax-Bmin)/2; // coefficients to affect this transition
+  const double c1 = (Bmax+Bmin)/2;
+  const double c3 = 10.0;          // how sharply (B)eta transitions
+  const double xcenter = 0.50;
+  const double ycenter = 0.50;
+  const double zcenter = 0.50;
+  // squared distance from the center of the domain (0.5,0.5,0.5) and its partial derivatives
+  const double r2    = pow((x-xcenter),2) +  pow((y-ycenter),2) +  pow((z-zcenter),2);
+  const double r2x   = 2.0*(x-xcenter);
+  const double r2y   = 2.0*(y-ycenter);
+  const double r2z   = 2.0*(z-zcenter);
+  const double r     = pow(r2,0.5);
+  const double inv_r = (r2>0.0) ? pow(r2,-0.5) : 0.0; // guard the singular point at the exact center
+  const double rx    = 0.5*r2x*inv_r;                 // dr/dx
+  const double ry    = 0.5*r2y*inv_r;                 // dr/dy
+  const double rz    = 0.5*r2z*inv_r;                 // dr/dz
+  const double t     = tanh( c3*(r-0.25) );
+  const double sech2 = 1-pow(t,2);                    // derivative of tanh at c3*(r-0.25)
+  *B  =    c1+c2*t;
+  *Bx = c2*c3*rx*sech2;
+  *By = c2*c3*ry*sech2;
+  *Bz = c2*c3*rz*sech2;
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// evaluateU: analytic solution u(x,y,z) = v(x)v(y)v(z) with v(w) = w^4 - 2w^3 + w^2 + shift.
+// Writes u, its first derivatives and its unmixed second derivatives through the pointer arguments.
+// v, v' and v'' take equal values at w=0 and w=1, so u, u' and u'' stay continuous across a periodic boundary.
+//   Dirichlet (isPeriodic==0): shift = 0, so v(0) = v(1) = 0 and U(x,y,z) = 0 on the boundary of [0,1]^3.
+//   Periodic  (isPeriodic!=0): shift = -1/30, the negated integral of w^4-2w^3+w^2 over [0,1], so that v
+//     integrates to zero; that should(?) make the integral of the RHS f sum to zero as well.
+void evaluateU(double x, double y, double z, double *U, double *Ux, double *Uy, double *Uz, double *Uxx, double *Uyy, double *Uzz, int isPeriodic){
+  const double shift = isPeriodic ? -1.0/30.0 : 0.0;
+  const double w[3] = {x,y,z};
+  double v[3],dv[3],ddv[3]; // v(w), v'(w) and v''(w) along each axis
+  int axis;
+
+  for(axis=0;axis<3;axis++){
+    v[axis]   =  1.0*pow(w[axis],4) -  2.0*pow(w[axis],3) + 1.0*pow(w[axis],2) + shift;
+    dv[axis]  =  4.0*pow(w[axis],3) -  6.0*pow(w[axis],2) + 2.0*w[axis];
+    ddv[axis] = 12.0*pow(w[axis],2) - 12.0*w[axis]        + 2.0;
+  }
+
+  // differentiate the factor of one axis, keep the other two factors
+  *U   =   v[0]*  v[1]*  v[2];
+  *Ux  =  dv[0]*  v[1]*  v[2];
+  *Uy  =   v[0]* dv[1]*  v[2];
+  *Uz  =   v[0]*  v[1]* dv[2];
+  *Uxx = ddv[0]*  v[1]*  v[2];
+  *Uyy =   v[0]*ddv[1]*  v[2];
+  *Uzz =   v[0]*  v[1]*ddv[2];
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// initialize_problem: store the grid spacing in level->h, then fill VECTOR_ALPHA, VECTOR_BETA_I/J/K and
+// VECTOR_F in every box owned by this process, with F = a*A*U - b*( grad(B).grad(U) + B*laplacian(U) )
+// built from the analytic U of evaluateU.  A is 1.0; B is 1.0 with zero gradient unless
+// STENCIL_VARIABLE_COEFFICIENT is defined, in which case evaluateBeta supplies B and the face-centered betas.
+void initialize_problem(level_type * level, double hLevel, double a, double b){
+  level->h = hLevel;
+
+  int box;
+  for(box=0;box<level->num_my_boxes;box++){
+    int i,j,k;
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const int  ghosts = level->my_boxes[box].ghosts;
+    const int     dim = level->my_boxes[box].dim; // one extent, applied in i, j and k
+    const int    lowi = level->my_boxes[box].low.i;
+    const int    lowj = level->my_boxes[box].low.j;
+    const int    lowk = level->my_boxes[box].low.k;
+    const int periodic = (level->boundary_condition.type == BC_PERIODIC);
+    #ifdef _OPENMP
+    #pragma omp parallel for private(k,j,i) collapse(3)
+    #endif
+    for(k=0;k<=dim;k++){ // <= : include high face
+    for(j=0;j<=dim;j++){ // <= : include high face
+    for(i=0;i<=dim;i++){ // <= : include high face
+      const int ijk = (i+ghosts) + (j+ghosts)*jStride + (k+ghosts)*kStride;
+      // cell-centered coordinates: +0.5 to get to the center of cell
+      const double x = hLevel*( (double)(i+lowi) + 0.5 );
+      const double y = hLevel*( (double)(j+lowj) + 0.5 );
+      const double z = hLevel*( (double)(k+lowk) + 0.5 );
+      // constant coefficient defaults
+      const double A = 1.0;
+      double B  = 1.0, Bx = 0.0, By = 0.0, Bz = 0.0; // cell-centered value of Beta and its gradient
+      double Bi = 1.0, Bj = 1.0, Bk = 1.0;           // face-centered values of Beta
+      double U,Ux,Uy,Uz,Uxx,Uyy,Uzz;
+
+      #ifdef STENCIL_VARIABLE_COEFFICIENT // variable coefficient problem...
+      double gx,gy,gz; // gradients at the faces are not used
+      evaluateBeta(x-hLevel*0.5,y           ,z           ,&Bi,&gx,&gy,&gz); // face-centered value of Beta for beta_i
+      evaluateBeta(x           ,y-hLevel*0.5,z           ,&Bj,&gx,&gy,&gz); // face-centered value of Beta for beta_j
+      evaluateBeta(x           ,y           ,z-hLevel*0.5,&Bk,&gx,&gy,&gz); // face-centered value of Beta for beta_k
+      evaluateBeta(x           ,y           ,z           ,&B ,&Bx,&By,&Bz); // cell-centered value of Beta
+      #endif
+
+      evaluateU(x,y,z,&U,&Ux,&Uy,&Uz,&Uxx,&Uyy,&Uzz, periodic );
+      const double F = a*A*U - b*( (Bx*Ux + By*Uy + Bz*Uz)  +  B*(Uxx + Uyy + Uzz) );
+
+      level->my_boxes[box].vectors[VECTOR_BETA_I][ijk] = Bi;
+      level->my_boxes[box].vectors[VECTOR_BETA_J][ijk] = Bj;
+      level->my_boxes[box].vectors[VECTOR_BETA_K][ijk] = Bk;
+      level->my_boxes[box].vectors[VECTOR_ALPHA ][ijk] = A;
+    //level->my_boxes[box].vectors[VECTOR_UTRUE ][ijk] = U; // obviated by Richardson analysis
+      level->my_boxes[box].vectors[VECTOR_F     ][ijk] = F;
+    }}}
+  }
+}
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators/problem.p6.c b/Util/hpgmg/finite-volume/source/operators/problem.p6.c
new file mode 100644
index 00000000..49cca26b
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/problem.p6.c
@@ -0,0 +1,134 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+// evaluateBeta: coefficient beta and its gradient at the point (x,y,z).
+//   *B           = c1 + c2*tanh( c3*(r-0.25) ), with r the distance from the domain center (0.5,0.5,0.5)
+//   *Bx,*By,*Bz  = dB/dx, dB/dy, dB/dz = c2*c3*(1-tanh^2)*dr/d{x,y,z}
+// dr/dx = (x-xcenter)/r is undefined at r==0 (it used to evaluate to 0*inf = NaN); a zero gradient is returned there.
+void evaluateBeta(double x, double y, double z, double *B, double *Bx, double *By, double *Bz){
+  const double Bmin =  1.0;
+  const double Bmax = 10.0;
+  const double c2 = (Bmax-Bmin)/2; // coefficients to affect this transition
+  const double c1 = (Bmax+Bmin)/2;
+  const double c3 = 10.0;          // how sharply (B)eta transitions
+  const double xcenter = 0.50;
+  const double ycenter = 0.50;
+  const double zcenter = 0.50;
+  // squared distance from the center of the domain (0.5,0.5,0.5) and its partial derivatives
+  const double r2    = pow((x-xcenter),2) +  pow((y-ycenter),2) +  pow((z-zcenter),2);
+  const double r2x   = 2.0*(x-xcenter);
+  const double r2y   = 2.0*(y-ycenter);
+  const double r2z   = 2.0*(z-zcenter);
+  const double r     = pow(r2,0.5);
+  const double inv_r = (r2>0.0) ? pow(r2,-0.5) : 0.0; // guard the singular point at the exact center
+  const double rx    = 0.5*r2x*inv_r;                 // dr/dx
+  const double ry    = 0.5*r2y*inv_r;                 // dr/dy
+  const double rz    = 0.5*r2z*inv_r;                 // dr/dz
+  const double t     = tanh( c3*(r-0.25) );
+  const double sech2 = 1-pow(t,2);                    // derivative of tanh at c3*(r-0.25)
+  *B  =    c1+c2*t;
+  *Bx = c2*c3*rx*sech2;
+  *By = c2*c3*ry*sech2;
+  *Bz = c2*c3*rz*sech2;
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// evaluateU: analytic solution u(x,y,z) = v(x)v(y)v(z), where
+//     v(w)   =  2w^6 -   6w^5 +  5w^4 - w^2 + shift
+//     v'(w)  = 12w^5 -  30w^4 + 20w^3 - 2w
+//     v''(w) = 60w^4 - 120w^3 + 60w^2 - 2
+//   (multiplying v by 21 gives the integer coefficients 42, -126, 105, 0, -21, 0 and, when periodic, 1).
+//
+// Outputs, written through the pointer arguments:
+//   U            = u
+//   Ux,Uy,Uz     = first partial derivatives
+//   Uxx,Uyy,Uzz  = unmixed second partial derivatives
+//
+// v and its first four derivatives take equal values at w=0 and w=1 (high order, periodic boundaries).
+//   Dirichlet (isPeriodic==0): shift = 0, so v(0) = v(1) = 0 and U(x,y,z) = 0 on the boundary of [0,1]^3.
+//   Periodic  (isPeriodic!=0): shift = 1/21, the negated integral of 2w^6-6w^5+5w^4-w^2 over [0,1], so that v
+//     integrates to zero; that should(?) make the integral of the RHS f sum to zero as well.
+void evaluateU(double x, double y, double z, double *U, double *Ux, double *Uy, double *Uz, double *Uxx, double *Uyy, double *Uzz, int isPeriodic){
+  const double shift = isPeriodic ? 1.0/21.0 : 0.0;
+  const double w[3] = {x,y,z};
+  double v[3],dv[3],ddv[3]; // v(w), v'(w) and v''(w) along each axis
+  int axis;
+
+  // the same polynomial is evaluated independently along x, y and z
+  for(axis=0;axis<3;axis++){
+    v[axis]   =  2.0*pow(w[axis],6) -   6.0*pow(w[axis],5) +  5.0*pow(w[axis],4) - 1.0*pow(w[axis],2) + shift;
+    dv[axis]  = 12.0*pow(w[axis],5) -  30.0*pow(w[axis],4) + 20.0*pow(w[axis],3) - 2.0*w[axis];
+    ddv[axis] = 60.0*pow(w[axis],4) - 120.0*pow(w[axis],3) + 60.0*pow(w[axis],2) - 2.0;
+  }
+
+  // differentiate the factor of one axis, keep the other two factors
+  *U   =   v[0] *   v[1] *   v[2];
+  *Ux  =  dv[0] *   v[1] *   v[2];
+  *Uy  =   v[0] *  dv[1] *   v[2];
+  *Uz  =   v[0] *   v[1] *  dv[2];
+  *Uxx = ddv[0] *   v[1] *   v[2];
+  *Uyy =   v[0] * ddv[1] *   v[2];
+  *Uzz =   v[0] *   v[1] * ddv[2];
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// initialize_problem: store the grid spacing in level->h, then fill VECTOR_ALPHA, VECTOR_BETA_I/J/K and
+// VECTOR_F in every box owned by this process, with F = a*A*U - b*( grad(B).grad(U) + B*laplacian(U) )
+// built from the analytic U of evaluateU.  A is 1.0; B is 1.0 with zero gradient unless
+// STENCIL_VARIABLE_COEFFICIENT is defined, in which case evaluateBeta supplies B and the face-centered betas.
+void initialize_problem(level_type * level, double hLevel, double a, double b){
+  level->h = hLevel;
+
+  int box;
+  for(box=0;box<level->num_my_boxes;box++){
+    int i,j,k;
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const int  ghosts = level->my_boxes[box].ghosts;
+    const int     dim = level->my_boxes[box].dim; // one extent, applied in i, j and k
+    const int    lowi = level->my_boxes[box].low.i;
+    const int    lowj = level->my_boxes[box].low.j;
+    const int    lowk = level->my_boxes[box].low.k;
+    const int periodic = (level->boundary_condition.type == BC_PERIODIC);
+    #ifdef _OPENMP
+    #pragma omp parallel for private(k,j,i) collapse(3)
+    #endif
+    for(k=0;k<=dim;k++){ // <= : include high face
+    for(j=0;j<=dim;j++){ // <= : include high face
+    for(i=0;i<=dim;i++){ // <= : include high face
+      const int ijk = (i+ghosts) + (j+ghosts)*jStride + (k+ghosts)*kStride;
+      // cell-centered coordinates: +0.5 to get to the center of cell
+      const double x = hLevel*( (double)(i+lowi) + 0.5 );
+      const double y = hLevel*( (double)(j+lowj) + 0.5 );
+      const double z = hLevel*( (double)(k+lowk) + 0.5 );
+      // constant coefficient defaults
+      const double A = 1.0;
+      double B  = 1.0, Bx = 0.0, By = 0.0, Bz = 0.0; // cell-centered value of Beta and its gradient
+      double Bi = 1.0, Bj = 1.0, Bk = 1.0;           // face-centered values of Beta
+      double U,Ux,Uy,Uz,Uxx,Uyy,Uzz;
+
+      #ifdef STENCIL_VARIABLE_COEFFICIENT // variable coefficient problem...
+      double gx,gy,gz; // gradients at the faces are not used
+      evaluateBeta(x-hLevel*0.5,y           ,z           ,&Bi,&gx,&gy,&gz); // face-centered value of Beta for beta_i
+      evaluateBeta(x           ,y-hLevel*0.5,z           ,&Bj,&gx,&gy,&gz); // face-centered value of Beta for beta_j
+      evaluateBeta(x           ,y           ,z-hLevel*0.5,&Bk,&gx,&gy,&gz); // face-centered value of Beta for beta_k
+      evaluateBeta(x           ,y           ,z           ,&B ,&Bx,&By,&Bz); // cell-centered value of Beta
+      #endif
+
+      evaluateU(x,y,z,&U,&Ux,&Uy,&Uz,&Uxx,&Uyy,&Uzz, periodic );
+      const double F = a*A*U - b*( (Bx*Ux + By*Uy + Bz*Uz)  +  B*(Uxx + Uyy + Uzz) );
+
+      level->my_boxes[box].vectors[VECTOR_BETA_I][ijk] = Bi;
+      level->my_boxes[box].vectors[VECTOR_BETA_J][ijk] = Bj;
+      level->my_boxes[box].vectors[VECTOR_BETA_K][ijk] = Bk;
+      level->my_boxes[box].vectors[VECTOR_ALPHA ][ijk] = A;
+    //level->my_boxes[box].vectors[VECTOR_UTRUE ][ijk] = U; // obviated by Richardson analysis
+      level->my_boxes[box].vectors[VECTOR_F     ][ijk] = F;
+    }}}
+  }
+}
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators/problem.sine.c b/Util/hpgmg/finite-volume/source/operators/problem.sine.c
new file mode 100644
index 00000000..caa67f67
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/problem.sine.c
@@ -0,0 +1,120 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#ifndef M_PI
+#define M_PI 3.14159265358979323846 // in case math.h doesn't define it
+#endif
+void evaluateBeta(double x, double y, double z, double *B, double *Bx, double *By, double *Bz){
+  // Radially symmetric coefficient beta(r) = c1 + c2*tanh( c3*(r-0.25) ), with r the distance from the domain center.
+  // Writes beta at (x,y,z) to *B and its gradient to *Bx,*By,*Bz.
+  const double Bmin =  1.0;
+  const double Bmax = 10.0;
+  const double c2 = (Bmax-Bmin)/2; // coefficients to affect this transition
+  const double c1 = (Bmax+Bmin)/2;
+  const double c3 = 10.0;          // how sharply (B)eta transitions
+  const double xcenter = 0.50;
+  const double ycenter = 0.50;
+  const double zcenter = 0.50;
+  // offset and distance from the center of the domain (0.5,0.5,0.5)
+  const double dx = x-xcenter;
+  const double dy = y-ycenter;
+  const double dz = z-zcenter;
+  const double r  = sqrt(dx*dx + dy*dy + dz*dz);
+  // dr/dx = dx/r is 0/0 exactly at the center; use a zero gradient there rather than returning NaN
+  const double rinv = (r>0.0) ? 1.0/r : 0.0;
+  const double rx   = dx*rinv;
+  const double ry   = dy*rinv;
+  const double rz   = dz*rinv;
+  // evaluate tanh once and reuse it: dB/dr = c2*c3*(1-tanh^2)
+  const double t    = tanh( c3*(r-0.25) );
+  const double dBdr = c2*c3*(1.0-t*t);
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  *B  = c1+c2*t;
+  *Bx = dBdr*rx;
+  *By = dBdr*ry;
+  *Bz = dBdr*rz;
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void evaluateU(double x, double y, double z, double *U, double *Ux, double *Uy, double *Uz, double *Uxx, double *Uyy, double *Uzz, int isPeriodic){
+  const double w1 = 2.0*M_PI, w2 = 6.0*M_PI; // U is the sum of two modes sin^p(w*x)*sin^p(w*y)*sin^p(w*z); isPeriodic is not consulted
+  const double p = 13; // must be odd(?) and allows up to p-2 order MG
+  double sx = pow(sin(w1*x),p), sy = pow(sin(w1*y),p), sz = pow(sin(w1*z),p); // per-axis factors of the first mode
+  *U    = sx*sy*sz;
+  *Ux   = w1*p*cos(w1*x)*pow(sin(w1*x),p-1)*sy*sz;
+  *Uy   = w1*p*cos(w1*y)*pow(sin(w1*y),p-1)*sx*sz;
+  *Uz   = w1*p*cos(w1*z)*pow(sin(w1*z),p-1)*sx*sy;
+  *Uxx  = w1*w1*p*( (p-1)*pow(sin(w1*x),p-2)*pow(cos(w1*x),2) - sx )*sy*sz;
+  *Uyy  = w1*w1*p*( (p-1)*pow(sin(w1*y),p-2)*pow(cos(w1*y),2) - sy )*sx*sz;
+  *Uzz  = w1*w1*p*( (p-1)*pow(sin(w1*z),p-2)*pow(cos(w1*z),2) - sz )*sx*sy;
+  sx = pow(sin(w2*x),p); sy = pow(sin(w2*y),p); sz = pow(sin(w2*z),p); // per-axis factors of the second mode, accumulated below
+  *U   += sx*sy*sz;
+  *Ux  += w2*p*cos(w2*x)*pow(sin(w2*x),p-1)*sy*sz;
+  *Uy  += w2*p*cos(w2*y)*pow(sin(w2*y),p-1)*sx*sz;
+  *Uz  += w2*p*cos(w2*z)*pow(sin(w2*z),p-1)*sx*sy;
+  *Uxx += w2*w2*p*( (p-1)*pow(sin(w2*x),p-2)*pow(cos(w2*x),2) - sx )*sy*sz;
+  *Uyy += w2*w2*p*( (p-1)*pow(sin(w2*y),p-2)*pow(cos(w2*y),2) - sy )*sx*sz;
+  *Uzz += w2*w2*p*( (p-1)*pow(sin(w2*z),p-2)*pow(cos(w2*z),2) - sz )*sx*sy;
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void initialize_problem(level_type * level, double hLevel, double a, double b){
+  // Records the mesh spacing on the level, then fills alpha, beta_i/j/k and the right-hand side F
+  // of every local box from the analytic U (and, for variable coefficients, the analytic beta).
+  level->h = hLevel;
+
+  int box;
+  for(box=0;box<level->num_my_boxes;box++){
+    int ci,cj,ck; // cell indices relative to the first non-ghost cell of the box
+    const int      ghosts = level->my_boxes[box].ghosts;
+    const int     jStride = level->my_boxes[box].jStride;
+    const int     kStride = level->my_boxes[box].kStride;
+    const int      extent = level->my_boxes[box].dim; // the same extent is used in i, j and k
+    double * const beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I];
+    double * const beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J];
+    double * const beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K];
+    double * const  alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ];
+    double * const    rhs = level->my_boxes[box].vectors[VECTOR_F     ];
+    #ifdef _OPENMP
+    #pragma omp parallel for private(ck,cj,ci) collapse(3)
+    #endif
+    for(ck=0;ck<=extent;ck++){ // include high face
+    for(cj=0;cj<=extent;cj++){ // include high face
+    for(ci=0;ci<=extent;ci++){ // include high face
+      const int ijk = (ci+ghosts) + (cj+ghosts)*jStride + (ck+ghosts)*kStride;
+      // physical coordinates of this cell's center
+      const double x = hLevel*( (double)(ci+level->my_boxes[box].low.i) + 0.5 ); // +0.5 to get to the center of cell
+      const double y = hLevel*( (double)(cj+level->my_boxes[box].low.j) + 0.5 );
+      const double z = hLevel*( (double)(ck+level->my_boxes[box].low.k) + 0.5 );
+
+      // constant-coefficient defaults: alpha = beta = 1 with a zero beta gradient
+      double A  = 1.0;
+      double B  = 1.0, Bx = 0.0, By = 0.0, Bz = 0.0;
+      double Bi = 1.0, Bj = 1.0, Bk = 1.0;
+      #ifdef STENCIL_VARIABLE_COEFFICIENT // variable coefficient problem...
+      evaluateBeta(x-hLevel*0.5,y           ,z           ,&Bi,&Bx,&By,&Bz); // face-centered value of Beta for beta_i
+      evaluateBeta(x           ,y-hLevel*0.5,z           ,&Bj,&Bx,&By,&Bz); // face-centered value of Beta for beta_j
+      evaluateBeta(x           ,y           ,z-hLevel*0.5,&Bk,&Bx,&By,&Bz); // face-centered value of Beta for beta_k
+      evaluateBeta(x           ,y           ,z           ,&B ,&Bx,&By,&Bz); // cell-centered value of Beta
+      #endif
+
+      // analytic solution and its derivatives at the cell center
+      double U,Ux,Uy,Uz,Uxx,Uyy,Uzz;
+      evaluateU(x,y,z,&U,&Ux,&Uy,&Uz,&Uxx,&Uyy,&Uzz, (level->boundary_condition.type == BC_PERIODIC) );
+      // F = a*alpha*U - b*div(beta*grad(U)), with the divergence expanded as grad(beta).grad(U) + beta*laplacian(U)
+      const double F = a*A*U - b*( (Bx*Ux + By*Uy + Bz*Uz)  +  B*(Uxx + Uyy + Uzz) );
+
+      // store the coefficients and right-hand side (the exact U is not stored; obviated by Richardson analysis)
+      beta_i[ijk] = Bi;
+      beta_j[ijk] = Bj;
+      beta_k[ijk] = Bk;
+       alpha[ijk] = A;
+         rhs[ijk] = F;
+    }}}
+  }
+}
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators/rebuild.c b/Util/hpgmg/finite-volume/source/operators/rebuild.c
new file mode 100644
index 00000000..04fc7978
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/rebuild.c
@@ -0,0 +1,202 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+/*
+// power method for calculating the dominant eigenvalue of D^{-1}A
+double power_method(level_type * level, double a, double b, int max_iterations){
+  int i;
+  int  x_id = VECTOR_U;
+  int Ax_id = VECTOR_TEMP;
+  double lambda_max = 0;
+
+  #ifdef USE_MPI
+  double lmax_start = MPI_Wtime();
+  #endif
+  if(level->my_rank==0){fprintf(stdout,"  calculating lambda_max...");fflush(stdout);}
+
+  random_vector(level,x_id);
+  for(i=0;i<max_iterations;i++){
+   apply_op(level,Ax_id, x_id,a,b);
+   mul_vectors(level,Ax_id,1.0,VECTOR_DINV,Ax_id); // D^{-1}Ax
+   double   x_dot_x = dot(level, x_id,x_id);
+   double DAx_dot_x = dot(level,Ax_id,x_id);
+   lambda_max = DAx_dot_x / x_dot_x;
+   double Ax_max = norm(level,Ax_id); // renormalize Ax (== new x)
+   scale_vector(level,x_id,1.0/Ax_max,Ax_id); 
+  }
+  #ifdef USE_MPI
+  if(level->my_rank==0){fprintf(stdout,"  %1.15e (%0.6f seconds)\n",lambda_max,MPI_Wtime()-lmax_start);}
+  #else
+  if(level->my_rank==0){fprintf(stdout,"  %1.15e\n",lambda_max);}
+  #endif
+  return(lambda_max);
+}
+*/
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// Accurate estimates of D^{-1} are essential in realizing high-performance and stable smoothers.
+// Unfortunately, complex boundary conditions can make it difficult to express D^{-1} analytically.
+// As such, this black-box routine calculates D^{-1} (VECTOR_DINV), the L1 inverse (VECTOR_L1INV), and a bound on the dominant eigenvalue of D^{-1}A using only the apply_op_ijk macro
+// colors_in_each_dim should be sufficiently large as to decouple the boundary condition from the operator
+// e.g. with quartic BC's, colors_in_each_dim==4 (total of 64 colors in 3D).  It is clamped below to the smallest level dimension.
+// If using periodic BCs, one should be able to set colors_in_each_dim to stencil_get_radius();
+// NOTE, as this function is not timed, it has not been optimized for performance.  VECTOR_TEMP is used as scratch for the coloring vector.
+void rebuild_operator_blackbox(level_type * level, double a, double b, int colors_in_each_dim){
+
+  // trying to color a 1^3 grid with 8 colors won't work... reduce the number of colors...
+  if(level->dim.i<colors_in_each_dim)colors_in_each_dim=level->dim.i;
+  if(level->dim.j<colors_in_each_dim)colors_in_each_dim=level->dim.j;
+  if(level->dim.k<colors_in_each_dim)colors_in_each_dim=level->dim.k;
+
+  if(level->my_rank==0){fprintf(stdout,"  calculating D^{-1} exactly for level h=%e using %d colors...  ",level->h,colors_in_each_dim*colors_in_each_dim*colors_in_each_dim);fflush(stdout);}
+  #ifdef USE_MPI
+  double dinv_start = MPI_Wtime(); // start of the interval reported in the "done (... seconds)" message below
+  #endif
+
+  #if 0 // naive version using existing routines.  Doesn't calculate l1inv or estimate the dominant eigenvalue
+  int         x_id = VECTOR_U;
+  int        Ax_id = VECTOR_TEMP;
+  int icolor,jcolor,kcolor;
+  zero_vector(level,VECTOR_DINV);
+  zero_vector(level,VECTOR_L1INV);
+  for(kcolor=0;kcolor<colors_in_each_dim;kcolor++){
+  for(jcolor=0;jcolor<colors_in_each_dim;jcolor++){
+  for(icolor=0;icolor<colors_in_each_dim;icolor++){
+    color_vector(level,x_id,colors_in_each_dim,icolor,jcolor,kcolor);  // color the grid as 1's and 0's
+        apply_op(level,Ax_id,x_id,a,b);                                // includes effects of boundary conditions on Aii
+     mul_vectors(level,Ax_id,1.0,x_id,Ax_id);                          // zero out the off-diagonal contributions 
+     add_vectors(level,VECTOR_DINV,1.0,Ax_id,1.0,VECTOR_DINV);         // add to running sum of Aii
+  }}}
+  invert_vector(level,VECTOR_DINV,1.0,VECTOR_DINV);
+  #else
+
+  int         x_id = VECTOR_TEMP;  // scratch vector holding the current coloring
+  int       Aii_id = VECTOR_DINV;  // accumulates the diagonal Aii, then is inverted in place to D^{-1}
+  int sumAbsAij_id = VECTOR_L1INV; // accumulates sum |Aij| (off-diagonals), then is overwritten with the L1 inverse
+  int icolor,jcolor,kcolor;
+  double dominant_eigenvalue = -1e9; // running maximum of the per-row bounds computed below
+  int block;
+
+  // initialize Aii[] = sumAbsAij[] = 0's
+  zero_vector(level,      Aii_id);
+  zero_vector(level,sumAbsAij_id);
+
+  // loop over all colors...
+  for(kcolor=0;kcolor<colors_in_each_dim;kcolor++){
+  for(jcolor=0;jcolor<colors_in_each_dim;jcolor++){
+  for(icolor=0;icolor<colors_in_each_dim;icolor++){
+    // color the grid as 1's and 0's
+    color_vector(level,x_id,colors_in_each_dim,icolor,jcolor,kcolor);
+
+    // exchange the boundary of x in preparation for Ax
+    exchange_boundary(level,x_id,stencil_get_shape());
+            apply_BCs(level,x_id,stencil_get_shape());
+ 
+    // apply the operator and add to Aii and AbsAij 
+    PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks)
+    for(block=0;block<level->num_my_blocks;block++){
+      const int box = level->my_blocks[block].read.box;
+      const int ilo = level->my_blocks[block].read.i;
+      const int jlo = level->my_blocks[block].read.j;
+      const int klo = level->my_blocks[block].read.k;
+      const int ihi = level->my_blocks[block].dim.i + ilo;
+      const int jhi = level->my_blocks[block].dim.j + jlo;
+      const int khi = level->my_blocks[block].dim.k + klo;
+      const int jStride = level->my_boxes[box].jStride;
+      const int kStride = level->my_boxes[box].kStride;
+      const int  ghosts = level->my_boxes[box].ghosts;
+      const double h2inv = 1.0/(level->h*level->h); // NOTE(review): not referenced by name in this loop; presumably consumed by apply_op_ijk - confirm
+      const double * __restrict__         x = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+      const double * __restrict__     alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); // NOTE(review): alpha/beta_* are presumably read by apply_op_ijk - confirm
+      const double * __restrict__    beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+      const double * __restrict__    beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+      const double * __restrict__    beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+            double * __restrict__       Aii = level->my_boxes[box].vectors[       Aii_id] + ghosts*(1+jStride+kStride);
+            double * __restrict__ sumAbsAij = level->my_boxes[box].vectors[ sumAbsAij_id] + ghosts*(1+jStride+kStride);
+  
+      int i,j,k;
+      for(k=klo;k<khi;k++){
+      for(j=jlo;j<jhi;j++){
+      for(i=ilo;i<ihi;i++){
+        int ijk = i + j*jStride + k*kStride;
+        double Ax = apply_op_ijk(x);            // operator applied to the coloring vector at this point
+              Aii[ijk] +=      (    x[ijk])*Ax; // add the effect of setting one grid point (i) to 1.0 to Aii
+        sumAbsAij[ijk] += fabs((1.0-x[ijk])*Ax); // where x==0, Ax comes only from colored neighbors: accumulate its magnitude
+      }}}
+    }
+  }}}
+
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // take Aii and the row sum sumAbsAij and calculate D^{-1} and L1^{-1}...
+  PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,block,level->num_my_blocks,dominant_eigenvalue)
+  for(block=0;block<level->num_my_blocks;block++){
+    const int box = level->my_blocks[block].read.box;
+    const int ilo = level->my_blocks[block].read.i;
+    const int jlo = level->my_blocks[block].read.j;
+    const int klo = level->my_blocks[block].read.k;
+    const int ihi = level->my_blocks[block].dim.i + ilo;
+    const int jhi = level->my_blocks[block].dim.j + jlo;
+    const int khi = level->my_blocks[block].dim.k + klo;
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const int  ghosts = level->my_boxes[box].ghosts;
+    const double h2inv = 1.0/(level->h*level->h); // used only by the Aii==0 fallback below
+    double * __restrict__       Aii = level->my_boxes[box].vectors[      Aii_id] + ghosts*(1+jStride+kStride);
+    double * __restrict__ sumAbsAij = level->my_boxes[box].vectors[sumAbsAij_id] + ghosts*(1+jStride+kStride);
+
+    double block_eigenvalue = -1e9; // maximum bound seen within this block
+    int i,j,k;
+    for(k=klo;k<khi;k++){
+    for(j=jlo;j<jhi;j++){
+    for(i=ilo;i<ihi;i++){
+      int ijk = i + j*jStride + k*kStride;
+
+      // catch failure... (a zero diagonal would make every division below blow up)
+      if(Aii[ijk]==0.0){
+        printf("Aii[%d,%d,%d]==0.0 !!!\n",i+level->my_boxes[box].low.i,j+level->my_boxes[box].low.j,k+level->my_boxes[box].low.k);
+        Aii[ijk] = a+b*h2inv; // FIX !!! placeholder diagonal substituted after reporting; not the computed Aii
+      }
+
+      // upper limit to Gershgorin disc == bound on dominant eigenvalue
+      double Di = (Aii[ijk] + sumAbsAij[ijk])/Aii[ijk];if(Di>block_eigenvalue)block_eigenvalue=Di;
+
+      // inverse of the L1 row norm... L1inv = ( D+D^{L1} )^{-1}
+      // sumAbsAij[ijk] = 1.0/(Aii[ijk]+sumAbsAij[ijk]);
+      // alternately, as suggested by eq 6.5 in Baker et al, "Multigrid smoothers for ultra-parallel computing: additional theory and discussion"...
+      if(Aii[ijk]>=1.5*sumAbsAij[ijk])sumAbsAij[ijk] = 1.0/(Aii[ijk]                   ); // VECTOR_L1INV = ...
+                                 else sumAbsAij[ijk] = 1.0/(Aii[ijk]+0.5*sumAbsAij[ijk]); // VECTOR_L1INV = ...
+
+      // inverse of the diagonal... (must come last: the lines above still need Aii itself)
+      Aii[ijk] = 1.0/Aii[ijk]; // VECTOR_DINV = ...
+
+    }}}
+    if(block_eigenvalue>dominant_eigenvalue){dominant_eigenvalue = block_eigenvalue;} // fold this block's maximum into the process-wide maximum
+  }
+  #ifdef USE_MPI
+  if(level->my_rank==0){fprintf(stdout,"done (%0.6f seconds)\n",MPI_Wtime()-dinv_start);}
+  #else
+  if(level->my_rank==0){fprintf(stdout,"done\n");}
+  #endif
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // Reduce the local estimate of the dominant eigenvalue to a global estimate (MPI_MAX across MPI_COMM_WORLD)
+  #ifdef USE_MPI
+  double _timeStartAllReduce = getTime();
+  double send = dominant_eigenvalue;
+  MPI_Allreduce(&send,&dominant_eigenvalue,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD);
+  double _timeEndAllReduce = getTime();
+  level->timers.collectives   += (double)(_timeEndAllReduce-_timeStartAllReduce);
+  #endif
+  if(level->my_rank==0){fprintf(stdout,"  estimating  lambda_max... <%1.15e\n",dominant_eigenvalue);fflush(stdout);}
+  level->dominant_eigenvalue_of_DinvA = dominant_eigenvalue;
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  //level->dominant_eigenvalue_of_DinvA = power_method(level,a,b,10);
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  #endif
+}
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/operators/residual.c b/Util/hpgmg/finite-volume/source/operators/residual.c
new file mode 100644
index 00000000..36a50f27
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/residual.c
@@ -0,0 +1,50 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+// This routine calculates the residual (res=rhs-Ax) using the linear operator specified in the apply_op_ijk macro.
+// This requires exchanging a ghost zone and/or enforcing a boundary condition on x before the operator is applied.
+// NOTE, x_id must be distinct from rhs_id and res_id.  Elapsed time of the compute loop is added to level->timers.residual.
+void residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b){
+  // exchange the boundary for x in prep for Ax...
+  exchange_boundary(level,x_id,stencil_get_shape());
+          apply_BCs(level,x_id,stencil_get_shape());
+
+  // now do residual/restriction proper... (the timer deliberately starts after the exchange and BCs above)
+  double _timeStart = getTime();
+  int block;
+
+  PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks)
+  for(block=0;block<level->num_my_blocks;block++){
+    const int box = level->my_blocks[block].read.box;
+    const int ilo = level->my_blocks[block].read.i;
+    const int jlo = level->my_blocks[block].read.j;
+    const int klo = level->my_blocks[block].read.k;
+    const int ihi = level->my_blocks[block].dim.i + ilo; // exclusive upper bounds of the block
+    const int jhi = level->my_blocks[block].dim.j + jlo;
+    const int khi = level->my_blocks[block].dim.k + klo;
+    int i,j,k;
+    const int jStride = level->my_boxes[box].jStride;
+    const int kStride = level->my_boxes[box].kStride;
+    const int  ghosts = level->my_boxes[box].ghosts;
+    const double h2inv = 1.0/(level->h*level->h); // NOTE(review): not referenced by name below; presumably consumed by apply_op_ijk - confirm
+    const double * __restrict__ x      = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+    const double * __restrict__ rhs    = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride);
+    const double * __restrict__ alpha  = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); // NOTE(review): alpha/beta_* are presumably read by apply_op_ijk - confirm
+    const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+    const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+    const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+          double * __restrict__ res    = level->my_boxes[box].vectors[       res_id] + ghosts*(1+jStride+kStride);
+
+    for(k=klo;k<khi;k++){
+    for(j=jlo;j<jhi;j++){
+    for(i=ilo;i<ihi;i++){
+      int ijk = i + j*jStride + k*kStride;
+      double Ax = apply_op_ijk(x); // operator applied to x at this point
+      res[ijk] = rhs[ijk]-Ax;
+    }}}
+  }
+  level->timers.residual += (double)(getTime()-_timeStart);
+}
+
diff --git a/Util/hpgmg/finite-volume/source/operators/restriction.c b/Util/hpgmg/finite-volume/source/operators/restriction.c
new file mode 100644
index 00000000..d6e94659
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/restriction.c
@@ -0,0 +1,206 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+static inline void restriction_pc_block(level_type *level_c, int id_c, level_type *level_f, int id_f, blockCopy_type *block, int restrictionType){
+  // Piecewise-constant restriction of one block from the fine source (block->read) to the coarse destination (block->write).
+  // RESTRICT_CELL averages 8 fine cells per coarse cell; RESTRICT_FACE_I/J/K average the 4 fine faces lying in the plane normal to i/j/k.
+  const int ni = block->dim.i; // extent of the coarse region that is written
+  const int nj = block->dim.j;
+  const int nk = block->dim.k;
+
+  // fine side: origin, strides and base pointer default to what the block records (e.g. a buffer)
+  const int fi = block->read.i;
+  const int fj = block->read.j;
+  const int fk = block->read.k;
+  int fjStride = block->read.jStride;
+  int fkStride = block->read.kStride;
+  double * __restrict__ fine = block->read.ptr;
+  if(block->read.box >=0){ // source is a box of level_f: use its strides and start at its first non-ghost point of vector id_f
+    fjStride = level_f->my_boxes[block->read.box ].jStride;
+    fkStride = level_f->my_boxes[block->read.box ].kStride;
+    fine = level_f->my_boxes[ block->read.box].vectors[id_f] + level_f->my_boxes[ block->read.box].ghosts*(1+ fjStride+ fkStride);
+  }
+
+  // coarse side: same treatment for the destination
+  const int ci = block->write.i;
+  const int cj = block->write.j;
+  const int ck = block->write.k;
+  int cjStride = block->write.jStride;
+  int ckStride = block->write.kStride;
+  double * __restrict__ coarse = block->write.ptr;
+  if(block->write.box>=0){ // destination is a box of level_c: use its strides and start at its first non-ghost point of vector id_c
+    cjStride = level_c->my_boxes[block->write.box].jStride;
+    ckStride = level_c->my_boxes[block->write.box].kStride;
+    coarse = level_c->my_boxes[block->write.box].vectors[id_c] + level_c->my_boxes[block->write.box].ghosts*(1+cjStride+ckStride);
+  }
+
+  // coarse index (i,j,k) maps to fine index (2i,2j,2k); f points at that fine entry
+  int i,j,k;
+  switch(restrictionType){
+    case RESTRICT_CELL: // mean of the 2x2x2 fine cells
+      for(k=0;k<nk;k++){
+      for(j=0;j<nj;j++){
+      for(i=0;i<ni;i++){
+        const int c = (i+ci) + (j+cj)*cjStride + (k+ck)*ckStride;
+        const double * f = fine + ( (2*i+fi) + (2*j+fj)*fjStride + (2*k+fk)*fkStride );
+        coarse[c] = ( f[0                ] + f[1                  ] +
+                      f[fjStride         ] + f[1+fjStride         ] +
+                      f[         fkStride] + f[1         +fkStride] +
+                      f[fjStride+fkStride] + f[1+fjStride+fkStride] ) * 0.125;
+      }}}
+      break;
+
+    case RESTRICT_FACE_I: // mean of the 2x2 fine faces in the j-k plane
+      for(k=0;k<nk;k++){
+      for(j=0;j<nj;j++){
+      for(i=0;i<ni;i++){
+        const int c = (i+ci) + (j+cj)*cjStride + (k+ck)*ckStride;
+        const double * f = fine + ( (2*i+fi) + (2*j+fj)*fjStride + (2*k+fk)*fkStride );
+        coarse[c] = ( f[0       ] + f[fjStride         ] +
+                      f[fkStride] + f[fjStride+fkStride] ) * 0.25;
+      }}}
+      break;
+
+    case RESTRICT_FACE_J: // mean of the 2x2 fine faces in the i-k plane
+      for(k=0;k<nk;k++){
+      for(j=0;j<nj;j++){
+      for(i=0;i<ni;i++){
+        const int c = (i+ci) + (j+cj)*cjStride + (k+ck)*ckStride;
+        const double * f = fine + ( (2*i+fi) + (2*j+fj)*fjStride + (2*k+fk)*fkStride );
+        coarse[c] = ( f[0       ] + f[1         ] +
+                      f[fkStride] + f[1+fkStride] ) * 0.25;
+      }}}
+      break;
+
+    case RESTRICT_FACE_K: // mean of the 2x2 fine faces in the i-j plane
+      for(k=0;k<nk;k++){
+      for(j=0;j<nj;j++){
+      for(i=0;i<ni;i++){
+        const int c = (i+ci) + (j+cj)*cjStride + (k+ck)*ckStride;
+        const double * f = fine + ( (2*i+fi) + (2*j+fj)*fjStride + (2*k+fk)*fkStride );
+        coarse[c] = ( f[0       ] + f[1         ] +
+                      f[fjStride] + f[1+fjStride] ) * 0.25;
+      }}}
+      break;
+  }
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+// performs an (inter-level) restriction on vector id_f of the fine level and stores the result in vector id_c on the coarse level
+// restrictionType specifies whether this is either cell-averaged restriction, or one of three face-averaged restrictions
+// piecewise constant restriction requires neither a ghost zone exchange nor a boundary condition
+// This is a rather bulk synchronous implementation which packs all MPI buffers before initiating any sends
+// Similarly, it waits for all remote data before copying any into local boxes.
+// It does however attempt to overlap local restriction with MPI.  All timings are accumulated on level_f->timers.
+void restriction(level_type * level_c, int id_c, level_type *level_f, int id_f, int restrictionType){
+  double _timeCommunicationStart = getTime();
+  double _timeStart,_timeEnd;
+  int buffer=0;
+  int n;
+  int my_tag = (level_f->tag<<4) | 0x5; // MPI message tag: the fine level's tag shifted left 4 bits with 0x5 in the low bits
+
+
+
+
+  #ifdef USE_MPI
+  // by convention, level_f allocates a combined array of requests for both level_f sends and level_c recvs...
+  int nMessages = level_c->restriction[restrictionType].num_recvs + level_f->restriction[restrictionType].num_sends;
+  MPI_Request *recv_requests = level_f->restriction[restrictionType].requests; // receives occupy the first num_recvs slots
+  MPI_Request *send_requests = level_f->restriction[restrictionType].requests + level_c->restriction[restrictionType].num_recvs; // sends follow the receives
+
+
+  // loop through packed list of MPI receives and prepost Irecv's...
+  if(level_c->restriction[restrictionType].num_recvs>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level_c->restriction[restrictionType].num_recvs;n++){
+      MPI_Irecv(level_c->restriction[restrictionType].recv_buffers[n],
+                level_c->restriction[restrictionType].recv_sizes[n],
+                MPI_DOUBLE,
+                level_c->restriction[restrictionType].recv_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &recv_requests[n]
+      );
+    }
+    _timeEnd = getTime();
+    level_f->timers.restriction_recv += (_timeEnd-_timeStart);
+  }
+
+
+  // pack MPI send buffers... (block list [0] of level_f: restrict directly into the send buffers)
+  if(level_f->restriction[restrictionType].num_blocks[0]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->restriction[restrictionType].num_blocks[0])
+    for(buffer=0;buffer<level_f->restriction[restrictionType].num_blocks[0];buffer++){
+      restriction_pc_block(level_c,id_c,level_f,id_f,&level_f->restriction[restrictionType].blocks[0][buffer],restrictionType);
+    }
+    _timeEnd = getTime();
+    level_f->timers.restriction_pack += (_timeEnd-_timeStart);
+  }
+
+ 
+  // loop through MPI send buffers and post Isend's... (must follow the packing above)
+  if(level_f->restriction[restrictionType].num_sends>0){
+    _timeStart = getTime();
+    #ifdef USE_MPI_THREAD_MULTIPLE
+    #pragma omp parallel for schedule(dynamic,1)
+    #endif
+    for(n=0;n<level_f->restriction[restrictionType].num_sends;n++){
+      MPI_Isend(level_f->restriction[restrictionType].send_buffers[n],
+                level_f->restriction[restrictionType].send_sizes[n],
+                MPI_DOUBLE,
+                level_f->restriction[restrictionType].send_ranks[n],
+                my_tag,
+                MPI_COMM_WORLD,
+                &send_requests[n]
+      );
+    }
+    _timeEnd = getTime();
+    level_f->timers.restriction_send += (_timeEnd-_timeStart);
+  }
+  #endif
+
+
+  // perform local restriction[restrictionType]... try and hide within Isend latency... (block list [1]; the only step compiled without USE_MPI)
+  if(level_f->restriction[restrictionType].num_blocks[1]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->restriction[restrictionType].num_blocks[1])
+    for(buffer=0;buffer<level_f->restriction[restrictionType].num_blocks[1];buffer++){
+      restriction_pc_block(level_c,id_c,level_f,id_f,&level_f->restriction[restrictionType].blocks[1][buffer],restrictionType);
+    }
+    _timeEnd = getTime();
+    level_f->timers.restriction_local += (_timeEnd-_timeStart);
+  }
+
+
+  // wait for MPI to finish... (a single Waitall over the combined recv+send request array)
+  #ifdef USE_MPI 
+  if(nMessages){
+    _timeStart = getTime();
+    MPI_Waitall(nMessages,level_f->restriction[restrictionType].requests,level_f->restriction[restrictionType].status);
+    _timeEnd = getTime();
+    level_f->timers.restriction_wait += (_timeEnd-_timeStart);
+  }
+
+
+  // unpack MPI receive buffers (block list [2] of level_c).  NOTE(review): the threading macro is given level_f while the blocks belong to level_c - confirm intended
+  if(level_c->restriction[restrictionType].num_blocks[2]>0){
+    _timeStart = getTime();
+    PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->restriction[restrictionType].num_blocks[2])
+    for(buffer=0;buffer<level_c->restriction[restrictionType].num_blocks[2];buffer++){
+      CopyBlock(level_c,id_c,&level_c->restriction[restrictionType].blocks[2][buffer]);
+    }
+    _timeEnd = getTime();
+    level_f->timers.restriction_unpack += (_timeEnd-_timeStart);
+  }
+  #endif
+ 
+ 
+  level_f->timers.restriction_total += (double)(getTime()-_timeCommunicationStart);
+}
diff --git a/Util/hpgmg/finite-volume/source/operators/symgs.c b/Util/hpgmg/finite-volume/source/operators/symgs.c
new file mode 100644
index 00000000..e554006a
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/operators/symgs.c
@@ -0,0 +1,57 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+void smooth(level_type * level, int phi_id, int rhs_id, double a, double b){ // symmetric Gauss-Seidel smoother: 2*NUM_SMOOTHS in-place sweeps over phi_id, alternating forward and backward ordering.  a and b are not named in this body (presumably consumed by apply_op_ijk - confirm)
+  int box,s;
+
+  for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps (forward/backward) per GS smooth
+    exchange_boundary(level,phi_id,stencil_get_shape()); // phi's ghost zones are refreshed once per sweep, before any box is updated
+            apply_BCs(level,phi_id,stencil_get_shape());
+
+    double _timeStart = getTime(); // timed region covers only the box loop below, not the exchange/BC calls above
+    #ifdef _OPENMP
+    #pragma omp parallel for private(box)
+    #endif
+    for(box=0;box<level->num_my_boxes;box++){ // with OpenMP, boxes are divided among threads; the sweep inside a box is sequential
+      int i,j,k;
+      const int ghosts = level->box_ghosts; // ghost zone depth used to offset the pointers below
+      const int jStride = level->my_boxes[box].jStride;
+      const int kStride = level->my_boxes[box].kStride;
+      const int     dim = level->my_boxes[box].dim; // loop extent in each of i,j,k
+      const double h2inv = 1.0/(level->h*level->h); // not named below; presumably read by apply_op_ijk - confirm
+            double * __restrict__ phi      = level->my_boxes[box].vectors[       phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+      const double * __restrict__ rhs      = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ alpha    = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); // alpha and beta_{i,j,k} are not named below; presumably read by apply_op_ijk - confirm
+      const double * __restrict__ beta_i   = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_j   = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ beta_k   = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride);
+      const double * __restrict__ Dinv     = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride); // scales the residual in the update below
+          
+
+      if( (s&0x1)==0 ){ // forward sweep... hard to thread
+        for(k=0;k<dim;k++){
+        for(j=0;j<dim;j++){
+        for(i=0;i<dim;i++){
+          int ijk = i + j*jStride + k*kStride;
+          double Ax = apply_op_ijk(phi); // apply_op_ijk is defined elsewhere; presumably evaluates the operator at ijk from the current (partially updated) phi - confirm
+          phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax); // in-place relaxation: phi += Dinv*(rhs - A*phi)
+        }}}
+      }else{ // backward sweep... hard to thread
+        for(k=dim-1;k>=0;k--){
+        for(j=dim-1;j>=0;j--){
+        for(i=dim-1;i>=0;i--){
+          int ijk = i + j*jStride + k*kStride;
+          double Ax = apply_op_ijk(phi); // same update as the forward sweep, visiting points in reverse order
+          phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax);
+        }}}
+      }
+
+    } // boxes
+    level->timers.smooth += (double)(getTime()-_timeStart); // accumulate the elapsed time of this sweep's box loop
+  } // s-loop
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/solvers.c b/Util/hpgmg/finite-volume/source/solvers.c
new file mode 100644
index 00000000..158970c4
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/solvers.c
@@ -0,0 +1,101 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#include "timers.h"
+#include "defines.h"
+#include "level.h"
+#include "operators.h"
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef USE_BICGSTAB
+#include "solvers/bicgstab.c"
+#elif  USE_CG
+#include "solvers/cg.c"
+#elif  USE_CABICGSTAB
+#include "solvers/cabicgstab.c"
+#elif  USE_CACG
+#include "solvers/cacg.c"
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+void IterativeSolver(level_type * level, int u_id, int f_id, double a, double b, double desired_reduction_in_norm){ // solver entry point: no-op on inactive levels; dispatches to the Krylov solver chosen at compile time (USE_* macros), otherwise falls back to repeated smooth() calls.  u_id is updated in place
+  if(!level->active)return;
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  if(level->must_subtract_mean==-1){ // -1 is treated as "not yet decided": resolved to 0 or 1 here on first use, so this branch is skipped on later calls
+    level->must_subtract_mean=0;
+    int alpha_is_zero = (dot(level,VECTOR_ALPHA,VECTOR_ALPHA) == 0.0); // true only if every alpha entry is zero (assuming dot() is a plain inner product)
+    if( (level->boundary_condition.type==BC_PERIODIC) && ((a==0) || (alpha_is_zero)) )level->must_subtract_mean = 1; // Poisson with Periodic BCs
+  }
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  #if 0
+  if( (level->dim.i==1)&&(level->dim.j==1)&&(level->dim.k==1) ){
+    // I have reduced the system to 1 equation and 1 unknown and know D^{-1} exactly
+    // therefore A^{-1} == D^{-1} = 1/a00
+    // u = A^{-1}f == D^{-1}f
+    mul_vectors(level,u_id,1.0,VECTOR_DINV,f_id); // u = A^{-1}f = D^{-1}f 
+    if(level->must_subtract_mean == 1){
+      double mean_of_u = mean(level,u_id);
+      shift_vector(level,u_id,u_id,-mean_of_u);
+    }
+    return;
+  }
+  #endif
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+  #ifdef USE_BICGSTAB
+    BiCGStab(level,u_id,f_id,a,b,desired_reduction_in_norm);
+  #elif  USE_CG
+    CG(level,u_id,f_id,a,b,desired_reduction_in_norm);
+  #elif  USE_CABICGSTAB
+    CABiCGStab(level,u_id,f_id,a,b,desired_reduction_in_norm);
+  #elif  USE_CACG
+    CACG(level,u_id,f_id,a,b,desired_reduction_in_norm);
+  #else 
+    // just point relaxation via multiple smooth()'s
+    if(level->must_subtract_mean == 1){ // remove the mean from the initial guess before measuring the initial residual
+      double mean_of_u = mean(level,u_id);
+      shift_vector(level,u_id,u_id,-mean_of_u);
+    }
+    residual(level,VECTOR_TEMP,u_id,f_id,a,b); // VECTOR_TEMP holds the residual of the initial guess
+    //mul_vectors(level,VECTOR_TEMP,1.0,VECTOR_TEMP,VECTOR_DINV); //  Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria...
+    double norm_of_r0 = norm(level,VECTOR_TEMP); // reference norm for the relative convergence test below
+    int s=0,maxSmoothsBottom=200,converged=0; // hard cap of 200 smooth() calls
+    while( (s<maxSmoothsBottom) && !converged){
+      s++;
+      level->Krylov_iterations++; // each smooth() call is counted in Krylov_iterations even though no Krylov method runs on this path
+      smooth(level,u_id,f_id,a,b);
+      if(level->must_subtract_mean == 1){ // re-remove the mean after every smooth
+        double mean_of_u = mean(level,u_id);
+        shift_vector(level,u_id,u_id,-mean_of_u);
+      }
+      residual(level,VECTOR_TEMP,u_id,f_id,a,b);
+      //mul_vectors(level,VECTOR_TEMP,1.0,VECTOR_TEMP,VECTOR_DINV); //  Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria...
+      double norm_of_r = norm(level,VECTOR_TEMP);
+      if(norm_of_r == 0.0){converged=1;break;} // residual is exactly zero
+      if(norm_of_r < desired_reduction_in_norm*norm_of_r0){converged=1;break;} // residual norm reduced by the requested factor relative to the initial residual
+    }
+  #endif
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+int IterativeSolver_NumVectors(){
+  // number of additional work vectors required by the iterative solver selected at compile time...
+  #ifdef USE_BICGSTAB
+  return(8);                  // BiCGStab uses 8 work vectors: r0,r,p,q,s,t,Ap,As
+  #elif  USE_CG
+  return(5);                  // CG requires extra vectors r0,r,p,Ap,z
+  #elif  USE_CABICGSTAB
+  return(4+4*CA_KRYLOV_S);    // CABiCGStab requires additional vectors rt,p,r,P[2s+1],R[2s].
+  #elif  USE_CACG
+  return(4+2*CA_KRYLOV_S);    // CACG requires additional vectors r0,p,r,P[s+1],R[s].
+  #endif
+  return(0);                  // simply doing multiple smooths requires no extra vectors (unreachable when one of the USE_* branches above returned first)
+}
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/solvers.h b/Util/hpgmg/finite-volume/source/solvers.h
new file mode 100644
index 00000000..7ba580c4
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/solvers.h
@@ -0,0 +1,12 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#ifndef SOLVERS_H
+#define SOLVERS_H
+//------------------------------------------------------------------------------------------------------------------------------
+void IterativeSolver(level_type *level, int u_id, int f_id, double a, double b, double desired_reduction_in_norm); // solve on one level: updates vector u_id in place for right-hand side f_id, aiming to reduce the residual norm by the factor desired_reduction_in_norm
+int  IterativeSolver_NumVectors(); // number of extra work vectors needed by the solver selected at compile time (0 when only smooth() is used)
+//------------------------------------------------------------------------------------------------------------------------------
+#endif
diff --git a/Util/hpgmg/finite-volume/source/solvers/bicgstab.c b/Util/hpgmg/finite-volume/source/solvers/bicgstab.c
new file mode 100644
index 00000000..38b4f063
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/solvers/bicgstab.c
@@ -0,0 +1,97 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#define KRYLOV_DIAGONAL_PRECONDITION
+//------------------------------------------------------------------------------------------------------------------------------
+void BiCGStab(level_type * level, int x_id, int R_id, double a, double b, double desired_reduction_in_norm){ // BiCGStab iteration for right-hand side R_id: x_id supplies the initial guess and is updated in place; stops on convergence, breakdown, or after jMax iterations
+  // Algorithm 7.7 in Iterative Methods for Sparse Linear Systems(Yousef Saad)
+  // Algorithm 1 in Analysis and Practical use of Flexible BiCGStab (Jie Chen)
+  int  r0_id = VECTORS_RESERVED+0; // the 8 work vectors occupy consecutive ids starting at VECTORS_RESERVED
+  int   r_id = VECTORS_RESERVED+1;
+  int   p_id = VECTORS_RESERVED+2;
+  int   q_id = VECTORS_RESERVED+3; // q = D^{-1}p
+  int   s_id = VECTORS_RESERVED+4;
+  int   t_id = VECTORS_RESERVED+5; // t = D^{-1}s
+  int  Ap_id = VECTORS_RESERVED+6;
+  int  As_id = VECTORS_RESERVED+7;
+
+  int jMax=200; // iteration cap
+  int j=0;
+  int BiCGStabFailed    = 0; // set to a nonzero code (1-6) identifying which breakdown check fired; the code is neither reported nor returned
+  int BiCGStabConverged = 0;
+  residual(level,r0_id,x_id,R_id,a,b);                                          // r0[] = R_id[] - A(x_id)
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  if(level->must_subtract_mean == 1){ // remove the mean from the initial residual when the level requires it
+    double mean_of_r0 = mean(level,r0_id);
+    shift_vector(level,r0_id,r0_id,-mean_of_r0);
+  }
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  scale_vector(level,r_id,1.0,r0_id);                                           // r[] = r0[]
+  scale_vector(level,p_id,1.0,r0_id);                                           // p[] = r0[]
+  double r_dot_r0 = dot(level,r_id,r0_id);                                      // r_dot_r0 = dot(r,r0)
+  double norm_of_r0 = norm(level,r_id);                                         // the norm of the initial residual...
+  if(r_dot_r0   == 0.0){BiCGStabConverged=1;}                                   // entered BiCGStab with exact solution
+  if(norm_of_r0 == 0.0){BiCGStabConverged=1;}                                   // entered BiCGStab with exact solution
+  while( (j<jMax) && (!BiCGStabFailed) && (!BiCGStabConverged) ){               // while(not done){
+    j++;level->Krylov_iterations++;                                             //
+    #ifdef KRYLOV_DIAGONAL_PRECONDITION                                         //
+    mul_vectors(level,q_id,1.0,VECTOR_DINV,p_id);                               //   q[] = Dinv[]*p[]
+    #else                                                                       //
+    scale_vector(level,q_id,1.0,p_id);                                          //   q[] =        p[]
+    #endif                                                                      //
+    apply_op(level,Ap_id,q_id,a,b);                                             //   Ap[] = AM^{-1}(p)
+    double Ap_dot_r0 = dot(level,Ap_id,r0_id);                                  //   Ap_dot_r0 = dot(Ap,r0)
+    if(Ap_dot_r0 == 0.0){BiCGStabFailed=1;break;}                               //   pivot breakdown ???
+    double alpha = r_dot_r0 / Ap_dot_r0;                                        //   alpha = r_dot_r0 / Ap_dot_r0
+    if(isinf(alpha)){BiCGStabFailed=2;break;}                                   //   pivot breakdown ???
+    add_vectors(level,x_id,1.0,x_id, alpha, q_id);                              //   x_id[] = x_id[] + alpha*q[]
+    add_vectors(level,s_id,1.0,r_id,-alpha,Ap_id);                              //   s[]    = r[]    - alpha*Ap[]   (intermediate residual?)
+    //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    if(level->must_subtract_mean == 1){ // remove the mean from the intermediate residual s
+      double mean_of_s = mean(level,s_id);
+      shift_vector(level,s_id,s_id,-mean_of_s);
+    }
+    //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    double norm_of_s = norm(level,s_id);                                        //   FIX - redundant??  norm of intermediate residual
+    if(norm_of_s == 0.0){BiCGStabConverged=1;break;}                            //   FIX - redundant??  if As_dot_As==0, then As must be 0 which implies s==0
+    if(norm_of_s < desired_reduction_in_norm*norm_of_r0){BiCGStabConverged=1;break;} // converged after the half step: x already includes alpha*q, the omega update is skipped
+    #ifdef KRYLOV_DIAGONAL_PRECONDITION                                         //
+    mul_vectors(level,t_id,1.0,VECTOR_DINV,s_id);                               //   t[] = Dinv[]*s[]
+    #else                                                                       //
+    scale_vector(level,t_id,1.0,s_id);                                          //   t[] =        s[]
+    #endif                                                                      //
+    apply_op(level,As_id,t_id,a,b);                                             //   As = AM^{-1}(s)
+    double As_dot_As = dot(level,As_id,As_id);                                  //   As_dot_As = dot(As,As)
+    double As_dot_s  = dot(level,As_id, s_id);                                  //   As_dot_s  = dot(As, s)
+    if(As_dot_As == 0.0){BiCGStabConverged=1;break;}                            //   converged ?
+    double omega = As_dot_s / As_dot_As;                                        //   omega = As_dot_s / As_dot_As
+    if(omega == 0.0){BiCGStabFailed=3;break;}                                   //   stabilization breakdown ???
+    if(isinf(omega)){BiCGStabFailed=4;break;}                                   //   stabilization breakdown ???
+    add_vectors(level,x_id,1.0,x_id, omega, t_id);                              //   x_id[] = x_id[] + omega*t[]
+    add_vectors(level,r_id,1.0,s_id,-omega,As_id);                              //   r[]    = s[]    - omega*As[]  (recursively computed / updated residual)
+    //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    if(level->must_subtract_mean == 1){ // remove the mean from the updated residual r
+      double mean_of_r = mean(level,r_id);
+      shift_vector(level,r_id,r_id,-mean_of_r);
+    }
+    //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    double norm_of_r = norm(level,r_id);                                        //   norm of recursively computed residual (good enough??)
+    if(norm_of_r == 0.0){BiCGStabConverged=1;break;}                            //
+    if(norm_of_r < desired_reduction_in_norm*norm_of_r0){BiCGStabConverged=1;break;} // converged: residual norm reduced by the requested factor relative to the initial residual
+    double r_dot_r0_new = dot(level,r_id,r0_id);                                //   r_dot_r0_new = dot(r,r0)
+    if(r_dot_r0_new == 0.0){BiCGStabFailed=5;break;}                            //   Lanczos breakdown ???
+    double beta = (r_dot_r0_new/r_dot_r0) * (alpha/omega);                      //   beta = (r_dot_r0_new/r_dot_r0) * (alpha/omega)
+    if(isinf(beta)){BiCGStabFailed=6;break;}                                    //   ???
+    add_vectors(level,VECTOR_TEMP,1.0,p_id,-omega,      Ap_id);                 //   VECTOR_TEMP = (p[]-omega*Ap[])
+    add_vectors(level,       p_id,1.0,r_id,  beta,VECTOR_TEMP);                 //   p[] = r[] + beta*(p[]-omega*Ap[])
+    r_dot_r0 = r_dot_r0_new;                                                    //   r_dot_r0 = r_dot_r0_new   (save old r_dot_r0)
+  }                                                                             // }
+}
diff --git a/Util/hpgmg/finite-volume/source/solvers/cabicgstab.c b/Util/hpgmg/finite-volume/source/solvers/cabicgstab.c
new file mode 100644
index 00000000..97527afa
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/solvers/cabicgstab.c
@@ -0,0 +1,518 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+//#define KRYLOV_DIAGONAL_PRECONDITION
+//------------------------------------------------------------------------------------------------------------------------------
+#ifndef    CA_KRYLOV_TELESCOPING
+#define    CA_KRYLOV_TELESCOPING
+#endif
+#ifndef    CA_KRYLOV_S
+#define    CA_KRYLOV_S     4
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+#include "matmul.c"
+//------------------------------------------------------------------------------------------------------------------------------
+// gemv(z,alpha,A,x,beta,y,rows,cols): z[r] = alpha*sum_c(A[r][c]*x[c]) + beta*y[r]   // A indexed as [row][col]
+// NOTE: expands to a braced block declaring its own r, c and sum, so argument text must not use those names; y[r] is always read (even when beta is 0.0), and each argument is re-evaluated on every use
+#define gemv(z,alpha,A,x,beta,y,rows,cols)  {int r,c;double sum;for(r=0;r<(rows);r++){sum=0.0;for(c=0;c<(cols);c++){sum+=(A)[r][c]*(x)[c];}(z)[r]=(alpha)*sum+(beta)*(y)[r];}}
+static inline void axpy(double * z, double alpha, double * x, double beta, double * y, int n){ // z[i] = alpha*x[i] + beta*y[i] for i in [0,n); z may be the same array as x or y because each entry is read before it is written
+  int remaining;
+  for(remaining=n;remaining>0;remaining--,z++,x++,y++){
+    *z = alpha*(*x) + beta*(*y);
+  }
+}
+static inline double vdotv(double * x, double * y, int n){ // inner product of the first n entries of x and y, accumulated in ascending index order (0.0 when n<=0)
+  double acc = 0.0;
+  int idx = 0;
+  while(idx<n){
+    acc += x[idx]*y[idx]; idx++;
+  }
+  return acc;
+}
+static inline void zero(double * z, int n){ // overwrite the first n entries of z with 0.0 (nothing is written when n<=0)
+  double * const end = z + (n>0 ? n : 0);
+  while(z != end){
+    *z++ = 0.0;
+  }
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef CA_KRYLOV_TELESCOPING
+void CABiCGStab(level_type * level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){
+  // based on Erin Carson/Jim Demmel/Nick Knight's s-Step BiCGStab Algorithm 3.4
+  // However, the formation of [P,R] is expensive ~ 4S+1 exchanges.  Moreover, formation of G[][] requires (4S+2)(4S+1) grid operations.
+  //   When the required number of iterations is small, this overhead is large and can make the s-step version slower than vanilla BiCGStab
+  //   Thus, this version is a telescoping s-step method that will start out with s=1, then do s=2, then s=4
+  int    rt_id = VECTORS_RESERVED+0;
+  int     r_id = VECTORS_RESERVED+1;
+  int     p_id = VECTORS_RESERVED+2;
+  int  PRrt_id = VECTORS_RESERVED+3;
+
+
+  // note: CA_KRYLOV_S should be tiny (2-8?).  As such, 4*CA_KRYLOV_S+1 is also tiny (9-33).  Just allocate on the stack...
+  double  temp1[4*CA_KRYLOV_S+1];                                                               //
+  double  temp2[4*CA_KRYLOV_S+1];                                                               //
+  double  temp3[4*CA_KRYLOV_S+1];                                                               //
+  double     Tp[4*CA_KRYLOV_S+1][4*CA_KRYLOV_S+1];                                              // T'  indexed as [row][col]
+  double    Tpp[4*CA_KRYLOV_S+1][4*CA_KRYLOV_S+1];                                              // T'' indexed as [row][col]
+  double     aj[4*CA_KRYLOV_S+1];                                                               //
+  double     cj[4*CA_KRYLOV_S+1];                                                               //
+  double     ej[4*CA_KRYLOV_S+1];                                                               //
+  double   Tpaj[4*CA_KRYLOV_S+1];                                                               //
+  double   Tpcj[4*CA_KRYLOV_S+1];                                                               //
+  double  Tppaj[4*CA_KRYLOV_S+1];                                                               //
+  double      G[4*CA_KRYLOV_S+1][4*CA_KRYLOV_S+1];                                              // extracted from first 4*CA_KRYLOV_S+1 columns of Gg[][].  indexed as [row][col]
+  double      g[4*CA_KRYLOV_S+1];                                                               // extracted from last [4*CA_KRYLOV_S+1] column of Gg[][].
+  double    Gg[(4*CA_KRYLOV_S+1)*(4*CA_KRYLOV_S+2)];                                            // buffer to hold the Gram-like matrix produced by matmul().  indexed as [row*(4*CA_KRYLOV_S+2) + col]
+  int      PRrt[4*CA_KRYLOV_S+2];                                                               // vector_id's of the concatenation of the 2S+1 matrix powers of P, 2S matrix powers of R, and rt
+
+  int mMax=200;
+  int m=0,n;
+  int i,j,k;
+  int BiCGStabFailed    = 0;
+  int BiCGStabConverged = 0;
+  double g_dot_Tpaj,alpha,omega_numerator,omega_denominator,omega,delta,delta_next,beta;
+  double L2_norm_of_rt,L2_norm_of_residual,cj_dot_Gcj,L2_norm_of_s;
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  residual(level,rt_id,e_id,R_id,a,b);                                                           // rt[] = R_id[] - A(e_id)... note, if DPC, then rt = R-AD^-1De
+  scale_vector(level,r_id,1.0,rt_id);                                                               // r[] = rt[]
+  scale_vector(level, p_id,1.0,rt_id);                                                               // p[] = rt[]
+  double norm_of_rt = norm(level,rt_id);                                                         // the norm of the initial residual...
+  #ifdef VERBOSE
+  if(level->my_rank==0)ffprintf(stderr,stderr,"m=%8d, norm   =%0.20f\n",m,norm_of_rt);
+  #endif
+  if(norm_of_rt == 0.0){BiCGStabConverged=1;}                                                   // entered BiCGStab with exact solution
+  delta = dot(level,r_id,rt_id);                                                                  // delta = dot(r,rt)
+  if(delta==0.0){BiCGStabConverged=1;}                                                          // entered BiCGStab with exact solution (square of L2 norm of r_id)
+  L2_norm_of_rt = sqrt(delta);
+
+  int ca_krylov_s = 1;
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  while( (m<mMax) && (!BiCGStabFailed) && (!BiCGStabConverged) ){                               // while(not done){
+    zero(   aj,4*ca_krylov_s+1);                                                            //
+    zero(   cj,4*ca_krylov_s+1);                                                            //
+    zero(   ej,4*ca_krylov_s+1);                                                            //
+    zero( Tpaj,4*ca_krylov_s+1);                                                            //
+    zero( Tpcj,4*ca_krylov_s+1);                                                            //
+    zero(Tppaj,4*ca_krylov_s+1);                                                            //
+    zero(temp1,4*ca_krylov_s+1);                                                            //
+    zero(temp2,4*ca_krylov_s+1);                                                            //
+    zero(temp3,4*ca_krylov_s+1);                                                            //
+ 
+    for(i=0;i<4*ca_krylov_s+1;i++)for(j=0;j<4*ca_krylov_s+1;j++) Tp[i][j]=0;                // initialize Tp[][] and Tpp[][] ...
+    for(i=0;i<4*ca_krylov_s+1;i++)for(j=0;j<4*ca_krylov_s+1;j++)Tpp[i][j]=0;                //
+    for(i=              0;i<2*ca_krylov_s  ;i++){ Tp[i+1][i]=1;}                            // monomial basis... Fixed (typo in SIAM paper)
+    for(i=2*ca_krylov_s+1;i<4*ca_krylov_s  ;i++){ Tp[i+1][i]=1;}                            //
+    for(i=              0;i<2*ca_krylov_s-1;i++){Tpp[i+2][i]=1;}                            //
+    for(i=2*ca_krylov_s+1;i<4*ca_krylov_s-1;i++){Tpp[i+2][i]=1;}                            //
+
+    for(i=0;i<4*ca_krylov_s+1;i++){PRrt[              i] = PRrt_id+i;}                       // columns of PRrt map to the consecutive spare grid indices starting at PRrt_id
+                                   PRrt[4*ca_krylov_s+1] = rt_id;                            // last column or PRrt (r tilde) maps to rt
+    int *P = PRrt+              0;                                                            // vector_id's of the 2S+1 Matrix Powers of P.  P[i] is the vector_id of A^i(p)
+    int *R = PRrt+2*ca_krylov_s+1;                                                            // vector_id's of the 2S   Matrix Powers of R.  R[i] is the vector_id of A^i(r)
+
+    // Using the monomial basis, compute 2s+1 matrix powers on p[] and 2s matrix powers on r[] one power at a time 
+    // (conventional approach applicable to CHOMBO and BoxLib)
+    scale_vector(level,P[0],1.0, p_id);                                                             // P[0] = A^0p =  p_id
+    for(n=1;n<2*ca_krylov_s+1;n++){                                                           // naive way of calculating the monomial basis.
+      #ifdef KRYLOV_DIAGONAL_PRECONDITION                                                             //
+      mul_vectors(level, VECTOR_TEMP,1.0, VECTOR_DINV,P[n-1]);                                                //   temp[] = Dinv[]*P[n-1]
+      apply_op(level,P[n], VECTOR_TEMP,a,b);                                                          //   P[n] = AD^{-1} VECTOR_TEMP = AD^{-1}P[n-1] = ((AD^{-1})^n)p
+      #else                                                                                     //
+      apply_op(level,P[n],P[n-1],a,b);                                                          //   P[n] = A(P[n-1]) = (A^n)p
+      #endif                                                                                    //
+    }
+    scale_vector(level,R[0],1.0,r_id);                                                             // R[0] = A^0r = r_id
+    for(n=1;n<2*ca_krylov_s;n++){                                                             // naive way of calculating the monomial basis.
+      #ifdef KRYLOV_DIAGONAL_PRECONDITION                                                             //
+      mul_vectors(level, VECTOR_TEMP,1.0, VECTOR_DINV,R[n-1]);                                                //   temp[] = Dinv[]*R[n-1]
+      apply_op(level,R[n], VECTOR_TEMP,a,b);                                                          //   R[n] = AD^{-1} VECTOR_TEMP = AD^{-1}R[n-1]
+      #else                                                                                     //
+      apply_op(level,R[n],R[n-1],a,b);                                                          //   R[n] = A(R[n-1]) = (A^n)r
+      #endif                                                                                    //
+    }
+
+    // Compute Gg[][] = [P,R]^T * [P,R,rt] (Matmul with grids with ghost zones but only one MPI_AllReduce)
+    level->CAKrylov_formations_of_G++;                                                         //   Record the number of times CABiCGStab formed G[][]
+    matmul(level,Gg,PRrt,PRrt,4*ca_krylov_s+1,4*ca_krylov_s+2,1);
+    for(i=0,k=0;i<4*ca_krylov_s+1;i++){                                                       // extract G[][] and g[] from Gg[]
+    for(j=0    ;j<4*ca_krylov_s+1;j++){G[i][j] = Gg[k++];}                                    // first 4*ca_krylov_s+1 elements in each row go to G[][].
+                                         g[i]    = Gg[k++];                                     // last element in row goes to g[].
+    }
+
+    for(i=0;i<4*ca_krylov_s+1;i++)aj[i]=0.0;aj[              0]=1.0;                        // initialized based on (3.26)
+    for(i=0;i<4*ca_krylov_s+1;i++)cj[i]=0.0;cj[2*ca_krylov_s+1]=1.0;                        // initialized based on (3.26)
+    for(i=0;i<4*ca_krylov_s+1;i++)ej[i]=0.0;                                                  // initialized based on (3.26)
+
+    for(n=0;n<ca_krylov_s;n++){                                                               // for(n=0;n<ca_krylov_s;n++){
+      level->Krylov_iterations++;                                                               // record number of inner-loop (j) iterations for comparison
+      gemv( Tpaj,   1.0, Tp,   aj,   0.0, Tpaj,4*ca_krylov_s+1,4*ca_krylov_s+1);          //    T'aj
+      gemv( Tpcj,   1.0, Tp,   cj,   0.0, Tpcj,4*ca_krylov_s+1,4*ca_krylov_s+1);          //    T'cj
+      gemv(Tppaj,   1.0,Tpp,   aj,   0.0,Tppaj,4*ca_krylov_s+1,4*ca_krylov_s+1);          //   T''aj
+                       g_dot_Tpaj = vdotv(g,Tpaj,4*ca_krylov_s+1);                            // (g,T'aj)
+      if(g_dot_Tpaj == 0.0){                                                                    // pivot breakdown ???
+        #ifdef VERBOSE                                                                        //
+        if(level->my_rank==0){ffprintf(stderr,stderr,"g_dot_Tpaj == 0.0\n");}                                   //
+        #endif                                                                                  //
+        BiCGStabFailed=1;break;                                                                 //
+      }                                                                                         //
+      alpha = delta / g_dot_Tpaj;                                                               // delta / (g,T'aj)
+      if(isinf(alpha)){                                                                         // alpha = big/tiny(overflow) = inf -> breakdown
+        #ifdef VERBOSE                                                                        //
+        if(level->my_rank==0){ffprintf(stderr,stderr,"alpha == inf\n");}                                        // 
+        #endif                                                                                  //
+        BiCGStabFailed=1;break;                                                                 // 
+      }                                                                                         // 
+      #if 0                                                                                     // seems to have accuracy problems in finite precision...
+      gemv(temp1,-alpha,  G, Tpaj,   0.0,temp1,4*ca_krylov_s+1,4*ca_krylov_s+1);          //  temp1[] =       - alpha*GT'aj
+      gemv(temp1,   1.0,  G,   cj,   1.0,temp1,4*ca_krylov_s+1,4*ca_krylov_s+1);          //  temp1[] =   Gcj - alpha*GT'aj
+      gemv(temp2,-alpha,  G,Tppaj,   0.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1);          //  temp2[] =       − alpha*GT′′aj
+      gemv(temp2,   1.0,  G, Tpcj,   1.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1);          //  temp2[] = GT′cj − alpha*GT′′aj
+      axpy(temp3,   1.0,     Tpcj,-alpha,Tppaj,4*ca_krylov_s+1);                            //  temp3[] =  T′cj − alpha*T′′aj
+             omega_numerator = vdotv(temp3,temp1,4*ca_krylov_s+1);                            //  (temp3,temp1) = ( T'cj-alpha*T''aj ,   Gcj-alpha*GT'aj )
+           omega_denominator = vdotv(temp3,temp2,4*ca_krylov_s+1);                            //  (temp3,temp2) = ( T′cj−alpha*T′′aj , GT′cj−alpha*GT′′aj )
+      #else                                                                                     // better to change the order of operations Gx-Gy -> G(x-y) ...  (note, G is symmetric)
+      axpy(temp1,   1.0,     Tpcj,-alpha,Tppaj,4*ca_krylov_s+1);                            //  temp1[] =  (T'cj - alpha*T''aj)
+      gemv(temp2,   1.0,  G,temp1,   0.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1);          //  temp2[] = G(T'cj - alpha*T''aj)
+      axpy(temp3,   1.0,       cj,-alpha, Tpaj,4*ca_krylov_s+1);                            //  temp3[] =     cj - alpha*T'aj
+             omega_numerator = vdotv(temp3,temp2,4*ca_krylov_s+1);                            //  (temp3,temp2) = ( (  cj - alpha*T'aj ) , G(T'cj - alpha*T''aj) )
+           omega_denominator = vdotv(temp1,temp2,4*ca_krylov_s+1);                            //  (temp1,temp2) = ( (T'cj - alpha*T''aj) , G(T'cj - alpha*T''aj) )
+      #endif                                                                                    // 
+      // NOTE: omega_numerator/omega_denominator can be 0/x or 0/0, but should never be x/0
+      // If omega_numerator==0, and ||s||==0, then convergence, x=x+alpha*aj
+      // If omega_numerator==0, and ||s||!=0, then stabilization breakdown
+
+      // !!! PARTIAL UPDATE OF ej MUST HAPPEN BEFORE THE CHECK ON OMEGA TO ENSURE FORWARD PROGRESS !!!
+      axpy(   ej,1.0,ej,       alpha,   aj,4*ca_krylov_s+1);                                // ej[] = ej[] + alpha*aj[]    
+
+      // calculate the norm of Saad's vector 's' to check intra s-step convergence...
+      axpy(temp1,   1.0,       cj,-alpha, Tpaj,4*ca_krylov_s+1);                            //  temp1[] =   cj - alpha*T'aj
+      gemv(temp2,   1.0,  G,temp1,   0.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1);          //  temp2[] = G(cj - alpha*T'aj)
+                                 L2_norm_of_s = vdotv(temp1,temp2,4*ca_krylov_s+1);           //  (temp1,temp2) = ( (cj - alpha*T'aj) , G(cj - alpha*T'aj) )  == square of L2 norm of s in exact arithmetic
+      if(L2_norm_of_s<0)L2_norm_of_s=0;else L2_norm_of_s=sqrt(L2_norm_of_s);                    // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0)
+      #ifdef VERBOSE                                                                          //
+      if(level->my_rank==0){fprintf(stderr,"m=%8d, norm(s)=%0.20f\n",m+n,L2_norm_of_s);}                //
+      #endif                                                                                    //
+      if(L2_norm_of_s < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;}    // terminate the inner n-loop
+
+
+      if(omega_denominator == 0.0){                                                             // ??? breakdown
+        #ifdef VERBOSE                                                                        //
+        if(level->my_rank==0){if(omega_denominator == 0.0)fprintf(stderr,"omega_denominator == 0.0\n");}//
+        #endif                                                                                  //
+        BiCGStabFailed=1;break;                                                                 //
+      }                                                                                         //
+      omega = omega_numerator / omega_denominator;                                              // 
+      if(isinf(omega)){                                                                         // omega = big/tiny(oveflow) = inf
+        #ifdef VERBOSE                                                                        //
+        if(level->my_rank==0){if(isinf(omega))fprintf(stderr,"omega == inf\n");}                        // 
+        #endif                                                                                  //
+        BiCGStabFailed=1;break;                                                                 //
+      }                                                                                         //
+      // !!! COMPLETE THE UPDATE OF ej & cj now that omega is known to be ok                    //
+      axpy(   ej,1.0,ej,       omega,   cj,4*ca_krylov_s+1);                                // ej[] = ej[] + alpha*aj[] + omega*cj[]
+      axpy(   ej,1.0,ej,-omega*alpha, Tpaj,4*ca_krylov_s+1);                                // ej[] = ej[] + alpha*aj[] + omega*cj[] - omega*alpha*T'aj[]
+      axpy(   cj,1.0,cj,      -omega, Tpcj,4*ca_krylov_s+1);                                // cj[] = cj[] - omega*T'cj[]
+      axpy(   cj,1.0,cj,      -alpha, Tpaj,4*ca_krylov_s+1);                                // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[]
+      axpy(   cj,1.0,cj, omega*alpha,Tppaj,4*ca_krylov_s+1);                                // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + omega*alpha*T''aj[]
+
+
+      // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence...
+      gemv(temp1,   1.0,  G,   cj,   0.0,temp1,4*ca_krylov_s+1,4*ca_krylov_s+1);          // temp1[] = Gcj
+                                       cj_dot_Gcj = vdotv(cj,temp1,4*ca_krylov_s+1);          // sqrt( (cj,Gcj) ) == L2 norm of the intermediate residual in exact arithmetic
+      L2_norm_of_residual = 0.0;if(cj_dot_Gcj>0)L2_norm_of_residual=sqrt(cj_dot_Gcj);           // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0)
+      #ifdef VERBOSE 
+      if(level->my_rank==0){fprintf(stderr,"m=%8d, norm(r)=%0.20f (cj_dot_Gcj=%0.20e)\n",m+n,L2_norm_of_residual,cj_dot_Gcj);}
+      #endif
+      if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop
+
+
+      delta_next = vdotv( g,cj,4*ca_krylov_s+1);                                              // (g,cj)
+      #ifdef VERBOSE                                                                          //
+      if(level->my_rank==0){                                                                    //
+        if(isinf(delta_next)     ){fprintf(stderr,"delta == inf\n");}                                   // delta = big/tiny(overflow) = inf
+        if(delta_next      == 0.0){fprintf(stderr,"delta == 0.0\n");}                                   // Lanczos breakdown
+        if(omega_numerator == 0.0){fprintf(stderr,"omega_numerator == 0.0\n");}                         // stabilization breakdown
+        if(omega           == 0.0){fprintf(stderr,"omega == 0.0\n");}                                   // stabilization breakdown 
+      }                                                                                         //
+      #endif                                                                                    //
+      if(isinf(delta_next)){BiCGStabFailed   =1;break;}                                         // delta = inf?
+      if(delta_next  ==0.0){BiCGStabFailed   =1;break;}                                         // Lanczos breakdown...
+      if(omega       ==0.0){BiCGStabFailed   =1;break;}                                         // stabilization breakdown 
+      beta = (delta_next/delta)*(alpha/omega);                                                  // (delta_next/delta)*(alpha/omega)
+      #ifdef VERBOSE                                                                          //
+      if(level->my_rank==0){                                                                    //
+        if(isinf(beta)           ){fprintf(stderr,"beta == inf\n");}                                    // beta = inf?
+        if(beta            == 0.0){fprintf(stderr,"beta == 0.0\n");}                                    // beta = 0?  can't make further progress(?)
+      }                                                                                         //
+      #endif                                                                                    //
+      if(isinf(beta)      ){BiCGStabFailed   =1;break;}                                         // beta = inf?
+      if(beta       == 0.0){BiCGStabFailed   =1;break;}                                         // beta = 0?  can't make further progress(?)
+      axpy(   aj,1.0,cj,        beta,   aj,4*ca_krylov_s+1);                                // aj[] = cj[] + beta*aj[]
+      axpy(   aj,1.0,aj, -omega*beta, Tpaj,4*ca_krylov_s+1);                                // aj[] = cj[] + beta*aj[] - omega*beta*T'aj
+      delta = delta_next;                                                                       // delta = delta_next
+
+    }                                                                                           // inner n (j) loop
+
+    // update iterates...
+    for(i=0;i<4*ca_krylov_s+1;i++){add_vectors(level,e_id,1.0,e_id,ej[i],PRrt[i]);}             // e_id[] = [P,R]ej + e_id[]
+    if(!BiCGStabFailed && !BiCGStabConverged){                                                  // if we're done, then there is no point in updating these
+                                   add_vectors(level,  p_id,0.0,  p_id,aj[0],PRrt[0]);              //    p[] = [P,R]aj
+    for(i=1;i<4*ca_krylov_s+1;i++){add_vectors(level,  p_id,1.0,  p_id,aj[i],PRrt[i]);}             //          ...
+                                   add_vectors(level, r_id,0.0, r_id,cj[0],PRrt[0]);              //    r[] = [P,R]cj
+    for(i=1;i<4*ca_krylov_s+1;i++){add_vectors(level, r_id,1.0, r_id,cj[i],PRrt[i]);}             //          ...
+    }                                                                                           //
+    m+=ca_krylov_s;                                                                           //   m+=ca_krylov_s;
+    ca_krylov_s*=2;if(ca_krylov_s>CA_KRYLOV_S)ca_krylov_s=CA_KRYLOV_S;
+  }                                                                                             // } // outer m loop
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  #ifdef KRYLOV_DIAGONAL_PRECONDITION
+  mul_vectors(level,e_id,1.0, VECTOR_DINV,e_id);                                                        //   e_id[] = Dinv[]*e_id[] // i.e. e = D^{-1}e'
+  #endif
+
+}
+//------------------------------------------------------------------------------------------------------------------------------
+#else // CA_KRYLOV_TELESCOPING =0
+void CABiCGStab(level_type * level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){
+  // s-step BiCGStab with a fixed step size s=CA_KRYLOV_S (non-telescoping build), based on Erin Carson/Jim Demmel/Nick Knight's s-Step BiCGStab Algorithm 3.4.  Accumulates the correction into e_id[]; a and b are passed through to residual()/apply_op().  Iteration stops on a detected breakdown, once m reaches mMax, or when a norm estimate drops below desired_reduction_in_norm*L2_norm_of_rt.
+  int    rt_id = VECTORS_RESERVED+0;                                            // rt[] (r tilde): filled by residual() below
+  int     r_id = VECTORS_RESERVED+1;                                            // r[]: starts as a copy of rt[]; rebuilt as [P,R]cj at the end of an outer pass
+  int     p_id = VECTORS_RESERVED+2;                                            // p[]: starts as a copy of rt[]; rebuilt as [P,R]aj at the end of an outer pass
+  int  PRrt_id = VECTORS_RESERVED+3;                                            // first of the 4*CA_KRYLOV_S+1 consecutive vector ids assigned to PRrt[] below
+
+  // note: CA_KRYLOV_S should be tiny (2-8?).  As such, 4*CA_KRYLOV_S+1 is also tiny (9-33).  Just allocate on the stack...
+  double  temp1[4*CA_KRYLOV_S+1];                                               // scratch coefficient vector
+  double  temp2[4*CA_KRYLOV_S+1];                                               // scratch coefficient vector
+  double  temp3[4*CA_KRYLOV_S+1];                                               // scratch coefficient vector
+  double     Tp[4*CA_KRYLOV_S+1][4*CA_KRYLOV_S+1];                              // T'  indexed as [row][col]
+  double    Tpp[4*CA_KRYLOV_S+1][4*CA_KRYLOV_S+1];                              // T'' indexed as [row][col]
+  double     aj[4*CA_KRYLOV_S+1];                                               // coefficients of p[] in the [P,R] basis (p[] = [P,R]aj)
+  double     cj[4*CA_KRYLOV_S+1];                                               // coefficients of r[] in the [P,R] basis (r[] = [P,R]cj)
+  double     ej[4*CA_KRYLOV_S+1];                                               // coefficients of the update applied to e_id[] (e_id[] += [P,R]ej)
+  double   Tpaj[4*CA_KRYLOV_S+1];                                               // T'aj
+  double   Tpcj[4*CA_KRYLOV_S+1];                                               // T'cj
+  double  Tppaj[4*CA_KRYLOV_S+1];                                               // T''aj
+  double      G[4*CA_KRYLOV_S+1][4*CA_KRYLOV_S+1];                              // extracted from first 4*CA_KRYLOV_S+1 columns of Gg[][].  indexed as [row][col]
+  double      g[4*CA_KRYLOV_S+1];                                               // extracted from last [4*CA_KRYLOV_S+1] column of Gg[][].
+  double    Gg[(4*CA_KRYLOV_S+1)*(4*CA_KRYLOV_S+2)];                            // buffer to hold the Gram-like matrix produced by matmul().  indexed as [row*(4*CA_KRYLOV_S+2) + col]
+  int      PRrt[4*CA_KRYLOV_S+2];                                               // vector_id's of the concatenation of the 2S+1 matrix powers of P, 2S matrix powers of R, and rt
+  int *P = PRrt+                0;                                              // vector_id's of the 2S+1 Matrix Powers of P.  P[i] is the vector_id of A^i(p)
+  int *R = PRrt+2*CA_KRYLOV_S+1;                                                // vector_id's of the 2S   Matrix Powers of R.  R[i] is the vector_id of A^i(r)
+
+  int mMax=200;                                                                 // the outer loop runs only while m<mMax; m advances by ca_krylov_s per outer pass
+  int m=0,n;
+  int i,j,k;
+  int BiCGStabFailed    = 0;
+  int BiCGStabConverged = 0;
+  double g_dot_Tpaj,alpha,omega_numerator,omega_denominator,omega,delta,delta_next,beta;
+  double L2_norm_of_rt,L2_norm_of_residual,cj_dot_Gcj,L2_norm_of_s;
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  residual(level,rt_id,e_id,R_id,a,b);                                           // rt[] = R_id[] - A(e_id)... note, if DPC, then rt = R-AD^-1De
+  scale_vector(level,r_id,1.0,rt_id);                                               // r[] = rt[]
+  scale_vector(level, p_id,1.0,rt_id);                                               // p[] = rt[]
+  double norm_of_rt = norm(level,rt_id);                                         // the norm of the initial residual...
+  #ifdef VERBOSE
+  if(level->my_rank==0)fprintf(stderr,"m=%8d, norm   =%0.20f\n",m,norm_of_rt);
+  #endif
+  if(norm_of_rt == 0.0){BiCGStabConverged=1;}                                   // entered BiCGStab with exact solution
+  delta = dot(level,r_id,rt_id);                                                  // delta = dot(r,rt)
+  if(delta==0.0){BiCGStabConverged=1;}                                          // entered BiCGStab with exact solution (square of L2 norm of r_id)
+  L2_norm_of_rt = sqrt(delta);
+
+  int ca_krylov_s = CA_KRYLOV_S;                                              // by making this a variable, I prevent the compiler from optimizing more than the telescoping version, thus preserving a bit-identical result
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  for(i=0;i<4*ca_krylov_s+1;i++)for(j=0;j<4*ca_krylov_s+1;j++) Tp[i][j]=0;  // initialize Tp[][] and Tpp[][] ...
+  for(i=0;i<4*ca_krylov_s+1;i++)for(j=0;j<4*ca_krylov_s+1;j++)Tpp[i][j]=0;  //
+  for(i=              0;i<2*ca_krylov_s  ;i++){ Tp[i+1][i]=1;}              // monomial basis... Fixed (typo in SIAM paper)
+  for(i=2*ca_krylov_s+1;i<4*ca_krylov_s  ;i++){ Tp[i+1][i]=1;}              //
+  for(i=              0;i<2*ca_krylov_s-1;i++){Tpp[i+2][i]=1;}              //
+  for(i=2*ca_krylov_s+1;i<4*ca_krylov_s-1;i++){Tpp[i+2][i]=1;}              //
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  for(i=0;i<4*ca_krylov_s+1;i++){PRrt[              i] = PRrt_id+i;}         // columns of PRrt map to the consecutive spare grid indices starting at PRrt_id
+                                 PRrt[4*ca_krylov_s+1] = rt_id;              // last column of PRrt (r tilde) maps to rt
+
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  while( (m<mMax) && (!BiCGStabFailed) && (!BiCGStabConverged) ){               // while(not done){
+    zero(   aj,4*ca_krylov_s+1);                                            //
+    zero(   cj,4*ca_krylov_s+1);                                            //
+    zero(   ej,4*ca_krylov_s+1);                                            //
+    zero( Tpaj,4*ca_krylov_s+1);                                            //
+    zero( Tpcj,4*ca_krylov_s+1);                                            //
+    zero(Tppaj,4*ca_krylov_s+1);                                            //
+    zero(temp1,4*ca_krylov_s+1);                                            //
+    zero(temp2,4*ca_krylov_s+1);                                            //
+    zero(temp3,4*ca_krylov_s+1);                                            //
+
+    // Using the monomial basis, compute 2s+1 matrix powers on p[] and 2s matrix powers on r[] one power at a time 
+    // (conventional approach applicable to CHOMBO and BoxLib)
+    scale_vector(level,P[0],1.0, p_id);                                             // P[0] = A^0p =  p_id
+    for(n=1;n<2*ca_krylov_s+1;n++){                                           // naive way of calculating the monomial basis.
+      #ifdef KRYLOV_DIAGONAL_PRECONDITION                                             //
+      mul_vectors(level, VECTOR_TEMP,1.0, VECTOR_DINV,P[n-1]);                           //   temp[] = Dinv[]*P[n-1]
+      apply_op(level,P[n], VECTOR_TEMP,a,b);                                          //   P[n] = AD^{-1} VECTOR_TEMP = AD^{-1}P[n-1] = ((AD^{-1})^n)p
+      #else                                                                     //
+      apply_op(level,P[n],P[n-1],a,b);                                          //   P[n] = A(P[n-1]) = (A^n)p
+      #endif                                                                    //
+    }
+    scale_vector(level,R[0],1.0,r_id);                                             // R[0] = A^0r = r_id
+    for(n=1;n<2*ca_krylov_s;n++){                                             // naive way of calculating the monomial basis.
+      #ifdef KRYLOV_DIAGONAL_PRECONDITION                                             //
+      mul_vectors(level, VECTOR_TEMP,1.0, VECTOR_DINV,R[n-1]);                                //   temp[] = Dinv[]*R[n-1]
+      apply_op(level,R[n], VECTOR_TEMP,a,b);                                          //   R[n] = AD^{-1} VECTOR_TEMP = AD^{-1}R[n-1]
+      #else                                                                     //
+      apply_op(level,R[n],R[n-1],a,b);                                          //   R[n] = A(R[n-1]) = (A^n)r
+      #endif                                                                    //
+    }
+
+    // Compute Gg[][] = [P,R]^T * [P,R,rt] (Matmul with grids with ghost zones but only one MPI_AllReduce)
+    level->CAKrylov_formations_of_G++;                                                         //   Record the number of times CABiCGStab formed G[][]
+    matmul(level,Gg,PRrt,PRrt,4*ca_krylov_s+1,4*ca_krylov_s+2,1);
+    for(i=0,k=0;i<4*ca_krylov_s+1;i++){                                                       // extract G[][] and g[] from Gg[]
+    for(j=0    ;j<4*ca_krylov_s+1;j++){G[i][j] = Gg[k++];}                                    // first 4*ca_krylov_s+1 elements in each row go to G[][].
+                                         g[i]    = Gg[k++];                                     // last element in row goes to g[].
+    }
+
+    for(i=0;i<4*ca_krylov_s+1;i++)aj[i]=0.0;aj[              0]=1.0;                        // initialized based on (3.26)
+    for(i=0;i<4*ca_krylov_s+1;i++)cj[i]=0.0;cj[2*ca_krylov_s+1]=1.0;                        // initialized based on (3.26)
+    for(i=0;i<4*ca_krylov_s+1;i++)ej[i]=0.0;                                                  // initialized based on (3.26)
+
+    for(n=0;n<ca_krylov_s;n++){                                                               // for(n=0;n<ca_krylov_s;n++){
+      level->Krylov_iterations++;                                                               // record number of inner-loop (j) iterations for comparison
+      gemv( Tpaj,   1.0, Tp,   aj,   0.0, Tpaj,4*ca_krylov_s+1,4*ca_krylov_s+1);          //    T'aj
+      gemv( Tpcj,   1.0, Tp,   cj,   0.0, Tpcj,4*ca_krylov_s+1,4*ca_krylov_s+1);          //    T'cj
+      gemv(Tppaj,   1.0,Tpp,   aj,   0.0,Tppaj,4*ca_krylov_s+1,4*ca_krylov_s+1);          //   T''aj
+                       g_dot_Tpaj = vdotv(g,Tpaj,4*ca_krylov_s+1);                            // (g,T'aj)
+      if(g_dot_Tpaj == 0.0){                                                                    // pivot breakdown ???
+        #ifdef VERBOSE                                                                        //
+        if(level->my_rank==0){fprintf(stderr,"g_dot_Tpaj == 0.0\n");}                                   //
+        #endif                                                                                  //
+        BiCGStabFailed=1;break;                                                                 //
+      }                                                                                         //
+      alpha = delta / g_dot_Tpaj;                                                               // delta / (g,T'aj)
+      if(isinf(alpha)){                                                                         // alpha = big/tiny(overflow) = inf -> breakdown
+        #ifdef VERBOSE                                                                        //
+        if(level->my_rank==0){fprintf(stderr,"alpha == inf\n");}                                        // 
+        #endif                                                                                  //
+        BiCGStabFailed=1;break;                                                                 // 
+      }                                                                                         // 
+      #if 0                                                                                     // seems to have accuracy problems in finite precision...
+      gemv(temp1,-alpha,  G, Tpaj,   0.0,temp1,4*ca_krylov_s+1,4*ca_krylov_s+1);          //  temp1[] =       - alpha*GT'aj
+      gemv(temp1,   1.0,  G,   cj,   1.0,temp1,4*ca_krylov_s+1,4*ca_krylov_s+1);          //  temp1[] =   Gcj - alpha*GT'aj
+      gemv(temp2,-alpha,  G,Tppaj,   0.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1);          //  temp2[] =       − alpha*GT′′aj
+      gemv(temp2,   1.0,  G, Tpcj,   1.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1);          //  temp2[] = GT′cj − alpha*GT′′aj
+      axpy(temp3,   1.0,     Tpcj,-alpha,Tppaj,4*ca_krylov_s+1);                            //  temp3[] =  T′cj − alpha*T′′aj
+             omega_numerator = vdotv(temp3,temp1,4*ca_krylov_s+1);                            //  (temp3,temp1) = ( T'cj-alpha*T''aj ,   Gcj-alpha*GT'aj )
+           omega_denominator = vdotv(temp3,temp2,4*ca_krylov_s+1);                            //  (temp3,temp2) = ( T′cj−alpha*T′′aj , GT′cj−alpha*GT′′aj )
+      #else                                                                                     // better to change the order of operations Gx-Gy -> G(x-y) ...  (note, G is symmetric)
+      axpy(temp1,   1.0,     Tpcj,-alpha,Tppaj,4*ca_krylov_s+1);                            //  temp1[] =  (T'cj - alpha*T''aj)
+      gemv(temp2,   1.0,  G,temp1,   0.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1);          //  temp2[] = G(T'cj - alpha*T''aj)
+      axpy(temp3,   1.0,       cj,-alpha, Tpaj,4*ca_krylov_s+1);                            //  temp3[] =     cj - alpha*T'aj
+             omega_numerator = vdotv(temp3,temp2,4*ca_krylov_s+1);                            //  (temp3,temp2) = ( (  cj - alpha*T'aj ) , G(T'cj - alpha*T''aj) )
+           omega_denominator = vdotv(temp1,temp2,4*ca_krylov_s+1);                            //  (temp1,temp2) = ( (T'cj - alpha*T''aj) , G(T'cj - alpha*T''aj) )
+      #endif                                                                                    // 
+      // NOTE: omega_numerator/omega_denominator can be 0/x or 0/0, but should never be x/0
+      // If omega_numerator==0, and ||s||==0, then convergence, x=x+alpha*aj
+      // If omega_numerator==0, and ||s||!=0, then stabilization breakdown
+
+      // !!! PARTIAL UPDATE OF ej MUST HAPPEN BEFORE THE CHECK ON OMEGA TO ENSURE FORWARD PROGRESS !!!
+      axpy(   ej,1.0,ej,       alpha,   aj,4*ca_krylov_s+1);                                // ej[] = ej[] + alpha*aj[]    
+
+      // calculate the norm of Saad's vector 's' to check intra s-step convergence...
+      axpy(temp1,   1.0,       cj,-alpha, Tpaj,4*ca_krylov_s+1);                            //  temp1[] =   cj - alpha*T'aj
+      gemv(temp2,   1.0,  G,temp1,   0.0,temp2,4*ca_krylov_s+1,4*ca_krylov_s+1);          //  temp2[] = G(cj - alpha*T'aj)
+                                 L2_norm_of_s = vdotv(temp1,temp2,4*ca_krylov_s+1);           //  (temp1,temp2) = ( (cj - alpha*T'aj) , G(cj - alpha*T'aj) )  == square of L2 norm of s in exact arithmetic
+      if(L2_norm_of_s<0)L2_norm_of_s=0;else L2_norm_of_s=sqrt(L2_norm_of_s);                    // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0)
+      #ifdef VERBOSE                                                                          //
+      if(level->my_rank==0){fprintf(stderr,"m=%8d, norm(s)=%0.20f\n",m+n,L2_norm_of_s);}                //
+      #endif                                                                                    //
+      if(L2_norm_of_s < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;}    // terminate the inner n-loop
+
+
+      if(omega_denominator == 0.0){                                                             // ??? breakdown
+        #ifdef VERBOSE                                                                        //
+        if(level->my_rank==0){if(omega_denominator == 0.0)fprintf(stderr,"omega_denominator == 0.0\n");}//
+        #endif                                                                                  //
+        BiCGStabFailed=1;break;                                                                 //
+      }                                                                                         //
+      omega = omega_numerator / omega_denominator;                                              // 
+      if(isinf(omega)){                                                                         // omega = big/tiny(overflow) = inf
+        #ifdef VERBOSE                                                                        //
+        if(level->my_rank==0){if(isinf(omega))fprintf(stderr,"omega == inf\n");}                        // 
+        #endif                                                                                  //
+        BiCGStabFailed=1;break;                                                                 //
+      }                                                                                         //
+      // !!! COMPLETE THE UPDATE OF ej & cj now that omega is known to be ok                    //
+      axpy(   ej,1.0,ej,       omega,   cj,4*ca_krylov_s+1);                                // ej[] = ej[] + alpha*aj[] + omega*cj[]
+      axpy(   ej,1.0,ej,-omega*alpha, Tpaj,4*ca_krylov_s+1);                                // ej[] = ej[] + alpha*aj[] + omega*cj[] - omega*alpha*T'aj[]
+      axpy(   cj,1.0,cj,      -omega, Tpcj,4*ca_krylov_s+1);                                // cj[] = cj[] - omega*T'cj[]
+      axpy(   cj,1.0,cj,      -alpha, Tpaj,4*ca_krylov_s+1);                                // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[]
+      axpy(   cj,1.0,cj, omega*alpha,Tppaj,4*ca_krylov_s+1);                                // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + omega*alpha*T''aj[]
+
+
+      // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence...
+      gemv(temp1,   1.0,  G,   cj,   0.0,temp1,4*ca_krylov_s+1,4*ca_krylov_s+1);          // temp1[] = Gcj
+                                       cj_dot_Gcj = vdotv(cj,temp1,4*ca_krylov_s+1);          // sqrt( (cj,Gcj) ) == L2 norm of the intermediate residual in exact arithmetic
+      L2_norm_of_residual = 0.0;if(cj_dot_Gcj>0)L2_norm_of_residual=sqrt(cj_dot_Gcj);           // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0)
+      #ifdef VERBOSE 
+      if(level->my_rank==0){fprintf(stderr,"m=%8d, norm(r)=%0.20f (cj_dot_Gcj=%0.20e)\n",m+n,L2_norm_of_residual,cj_dot_Gcj);}
+      #endif
+      if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop
+
+
+      delta_next = vdotv( g,cj,4*ca_krylov_s+1);                                              // (g,cj)
+      #ifdef VERBOSE                                                                          //
+      if(level->my_rank==0){                                                                    //
+        if(isinf(delta_next)     ){fprintf(stderr,"delta == inf\n");}                                   // delta = big/tiny(overflow) = inf
+        if(delta_next      == 0.0){fprintf(stderr,"delta == 0.0\n");}                                   // Lanczos breakdown
+        if(omega_numerator == 0.0){fprintf(stderr,"omega_numerator == 0.0\n");}                         // stabilization breakdown
+        if(omega           == 0.0){fprintf(stderr,"omega == 0.0\n");}                                   // stabilization breakdown 
+      }                                                                                         //
+      #endif                                                                                    //
+      if(isinf(delta_next)){BiCGStabFailed   =1;break;}                                         // delta = inf?
+      if(delta_next  ==0.0){BiCGStabFailed   =1;break;}                                         // Lanczos breakdown...
+      if(omega       ==0.0){BiCGStabFailed   =1;break;}                                         // stabilization breakdown 
+      beta = (delta_next/delta)*(alpha/omega);                                                  // (delta_next/delta)*(alpha/omega)
+      #ifdef VERBOSE                                                                          //
+      if(level->my_rank==0){                                                                    //
+        if(isinf(beta)           ){fprintf(stderr,"beta == inf\n");}                                    // beta = inf?
+        if(beta            == 0.0){fprintf(stderr,"beta == 0.0\n");}                                    // beta = 0?  can't make further progress(?)
+      }                                                                                         //
+      #endif                                                                                    //
+      if(isinf(beta)      ){BiCGStabFailed   =1;break;}                                         // beta = inf?
+      if(beta       == 0.0){BiCGStabFailed   =1;break;}                                         // beta = 0?  can't make further progress(?)
+      axpy(   aj,1.0,cj,        beta,   aj,4*ca_krylov_s+1);                                // aj[] = cj[] + beta*aj[]
+      axpy(   aj,1.0,aj, -omega*beta, Tpaj,4*ca_krylov_s+1);                                // aj[] = cj[] + beta*aj[] - omega*beta*T'aj
+      delta = delta_next;                                                                       // delta = delta_next
+
+    }                                                                                           // inner n (j) loop
+
+    // update iterates...
+    for(i=0;i<4*ca_krylov_s+1;i++){add_vectors(level,e_id,1.0,e_id,ej[i],PRrt[i]);}             // e_id[] = [P,R]ej + e_id[]
+    if(!BiCGStabFailed && !BiCGStabConverged){                                                  // if we're done, then there is no point in updating these
+                                   add_vectors(level,  p_id,0.0,  p_id,aj[0],PRrt[0]);              //    p[] = [P,R]aj
+    for(i=1;i<4*ca_krylov_s+1;i++){add_vectors(level,  p_id,1.0,  p_id,aj[i],PRrt[i]);}             //          ...
+                                   add_vectors(level, r_id,0.0, r_id,cj[0],PRrt[0]);              //    r[] = [P,R]cj
+    for(i=1;i<4*ca_krylov_s+1;i++){add_vectors(level, r_id,1.0, r_id,cj[i],PRrt[i]);}             //          ...
+    }                                                                                           //
+    m+=ca_krylov_s;                                                                           //   m+=ca_krylov_s;
+  }                                                                                             // } // outer m loop
+  // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  #ifdef KRYLOV_DIAGONAL_PRECONDITION
+  mul_vectors(level,e_id,1.0, VECTOR_DINV,e_id);                                                        //   e_id[] = Dinv[]*e_id[] // i.e. e = D^{-1}e'
+  #endif
+}
+#endif // CA_KRYLOV_TELESCOPING
+//------------------------------------------------------------------------------------------------------------------------------
diff --git a/Util/hpgmg/finite-volume/source/solvers/cacg.c b/Util/hpgmg/finite-volume/source/solvers/cacg.c
new file mode 100644
index 00000000..61449226
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/solvers/cacg.c
@@ -0,0 +1,170 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#ifndef    CA_KRYLOV_S
+#define    CA_KRYLOV_S     4
+#endif
+//------------------------------------------------------------------------------------------------------------------------------
+#include "matmul.c"
+//------------------------------------------------------------------------------------------------------------------------------
+// gemv: z[r] = alpha*sum_c(A[r][c]*x[c]) + beta*y[r]  for r in [0,rows), c in [0,cols)   // A is indexed [row][col]
+// z may be the same array as y (y[r] is read in the same statement that writes z[r]) but must not be x (later rows still read x[]).  y[r] is read even when beta is 0.0.
+#define gemv(z,alpha,A,x,beta,y,rows,cols)  {int r,c;double sum;for(r=0;r<(rows);r++){sum=0.0;for(c=0;c<(cols);c++){sum+=(A)[r][c]*(x)[c];}(z)[r]=(alpha)*sum+(beta)*(y)[r];}}
+static inline void axpy(double * z, double alpha, double * x, double beta, double * y, int n){ // z[i] = alpha*x[i] + beta*y[i] for i in [0,n).  purely element-wise, so z may be the same array as x or y
+  int nn; // element index
+  for(nn=0;nn<n;nn++){ // no iterations (z untouched) when n<=0
+    z[nn] = alpha*x[nn] + beta*y[nn];
+  }
+}
+static inline double vdotv(double * x, double * y, int n){ // returns the sum of x[i]*y[i] for i in [0,n); 0.0 when n<=0
+  int nn; // element index
+  double sum = 0.0; // running total, accumulated in index order
+  for(nn=0;nn<n;nn++){
+    sum += x[nn]*y[nn];
+  }
+  return(sum);
+}
+static inline void zero(double * z, int n){ // sets z[i] = 0.0 for i in [0,n); does nothing when n<=0
+  int nn; // element index
+  for(nn=0;nn<n;nn++){
+    z[nn] = 0.0;
+  }
+}
+
+
+//------------------------------------------------------------------------------------------------------------------------------
+void CACG(level_type * level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){ // s-step CG: updates vector e_id[] in place using right-hand side R_id[]; a,b are forwarded unchanged to residual() and apply_op()
+  // based on Lauren Goodfriend, Yinghui Huang, and David Thorman's derivation in their Spring 2013 CS267 Report.  Stops on sqrt(cj.Gcj) < desired_reduction_in_norm*sqrt(dot(r0,r0)), on a breakdown test, or once m reaches mMax.
+  int    r0_id = VECTORS_RESERVED+0; // initial residual
+  int     r_id = VECTORS_RESERVED+1; // current residual
+  int     p_id = VECTORS_RESERVED+2; // current search direction
+  int  PRrt_id = VECTORS_RESERVED+3; // first of the 2*CA_KRYLOV_S+1 consecutive vector ids assigned to PR[] below
+
+  double  temp1[2*CA_KRYLOV_S+1];                                                             // scratch: G*T'*aj
+  double  temp2[2*CA_KRYLOV_S+1];                                                             // scratch: G*cj
+  double  temp3[2*CA_KRYLOV_S+1];                                                             // zeroed every outer iteration but otherwise unused
+  double     aj[2*CA_KRYLOV_S+1];                                                             // coefficients of the search direction p in the [P,R] basis
+  double     cj[2*CA_KRYLOV_S+1];                                                             // coefficients of the residual r in the [P,R] basis
+  double     ej[2*CA_KRYLOV_S+1];                                                             // coefficients of the accumulated update to e_id in the [P,R] basis
+  double   Tpaj[2*CA_KRYLOV_S+1];                                                             // T'*aj
+  double     Tp[2*CA_KRYLOV_S+1][2*CA_KRYLOV_S+1];                                          // T'  indexed as [row][col]
+  double      G[2*CA_KRYLOV_S+1][2*CA_KRYLOV_S+1];                                          // 2D copy of Gbuf[].  indexed as [row][col]
+  double   Gbuf[(2*CA_KRYLOV_S+1)*(2*CA_KRYLOV_S+1)];                                       // buffer to hold the Gram-like matrix produced by matmul().  indexed as [row*(2*CA_KRYLOV_S+1) + col]
+  int      PR[2*CA_KRYLOV_S+1];                                                               // vector_id's of the concatenation of the S+1 matrix powers of P, and the S matrix powers of R
+  int *P = PR+              0;                                                                  // vector_id's of the S+1 Matrix Powers of P.  P[i] is the vector_id of A^i(p)
+  int *R = PR+CA_KRYLOV_S+1;                                                                  // vector_id's of the S   Matrix Powers of R.  R[i] is the vector_id of A^i(r)
+
+  int mMax=200; // upper bound on m, which advances by CA_KRYLOV_S per outer iteration
+  int m=0,n;
+  int i,j,k;
+  int CGFailed    = 0; // set by the breakdown tests in the inner loop
+  int CGConverged = 0; // set when the residual is zero on entry or the relative reduction target is met
+
+  double aj_dot_GTpaj,cj_dot_Gcj,alpha,cj_dot_Gcj_new,beta,L2_norm_of_r0,L2_norm_of_residual,delta;
+
+  residual(level,r0_id,e_id,R_id,a,b);                                                            // r0[] = R_id[] - A(e_id)
+  scale_vector(level,r_id,1.0,r0_id);                                                                // r[] = r0[]
+  scale_vector(level, p_id,1.0,r0_id);                                                                // p[] = r0[]
+  double norm_of_r0 = norm(level,r0_id);                                                          // the norm of the initial residual... (only used for the zero test on the next line)
+  if(norm_of_r0 == 0.0){CGConverged=1;}                                                          // entered CG with exact solution
+
+  delta = dot(level,r_id,r0_id);                                                                   // delta = dot(r,r0)
+  if(delta==0.0){CGConverged=1;}                                                                 // entered CG with exact solution (square of L2 norm of r_id)
+  L2_norm_of_r0 = sqrt(delta);                                                                   // reference value for the relative convergence test in the inner loop
+
+
+
+  // initialize Tp[][] ...
+  for(i=0;i<2*CA_KRYLOV_S+1;i++)for(j=0;j<2*CA_KRYLOV_S+1;j++) Tp[i][j]=0;                  // zero Tp
+  for(i=              0;i<  CA_KRYLOV_S  ;i++){ Tp[i+1][i]=1;}                                // monomial basis: T' shifts coefficient i to i+1 within the P block (P[i+1] = A(P[i]))
+  for(i=CA_KRYLOV_S+1;i<2*CA_KRYLOV_S  ;i++){ Tp[i+1][i]=1;}                                // same shift within the R block (R[i+1] = A(R[i])); columns CA_KRYLOV_S and 2*CA_KRYLOV_S stay zero
+
+  for(i=0;i<2*CA_KRYLOV_S+1;i++){PR[i] = PRrt_id+i;}                                           // columns of PR map to the consecutive spare grids allocated for the bottom solver starting at PRrt_id
+
+
+  while( (m<mMax) && (!CGFailed) && (!CGConverged) ){                                           // while(not done){
+    zero(   aj,2*CA_KRYLOV_S+1);
+    zero(   cj,2*CA_KRYLOV_S+1);
+    zero(   ej,2*CA_KRYLOV_S+1);
+    zero( Tpaj,2*CA_KRYLOV_S+1); // Tpaj, temp1 and temp2 are later passed as gemv's y operand, which the macro reads even with beta=0.0
+    zero(temp1,2*CA_KRYLOV_S+1);
+    zero(temp2,2*CA_KRYLOV_S+1);
+    zero(temp3,2*CA_KRYLOV_S+1);
+
+    // Using the monomial basis, compute s+1 matrix powers on p[] and s matrix powers on r[] one power at a time
+    //  (conventional approach applicable to CHOMBO and BoxLib)
+    scale_vector(level,P[0],1.0, p_id);                                                             // P[0] = A^0p =  p_id
+    for(n=1;n<CA_KRYLOV_S+1;n++){                                                             // naive way of calculating the monomial basis.
+      apply_op(level,P[n],P[n-1],a,b);                                                          // P[n] = A(P[n-1]) = A^(n)p
+    }
+    scale_vector(level,R[0],1.0,r_id);                                                             // R[0] = A^0r = r_id
+    for(n=1;n<CA_KRYLOV_S;n++){                                                               // naive way of calculating the monomial basis.
+      apply_op(level,R[n],R[n-1],a,b);                                                          // R[n] = A(R[n-1]) = A^(n)r
+    }
+
+
+    // form G[][]
+    level->CAKrylov_formations_of_G++;                                                         //   Record the number of times CACG formed G[][]
+    matmul(level,Gbuf,PR,PR,2*CA_KRYLOV_S+1,2*CA_KRYLOV_S+1,1);                       // Compute Gbuf[][] = [P,R]^T * [P,R] (Matmul with grids but only one MPI_AllReduce)
+    for(i=0,k=0;i<2*CA_KRYLOV_S+1;i++){                                                       // extract G[][] from Gbuf[]
+    for(j=0    ;j<2*CA_KRYLOV_S+1;j++){G[i][j] = Gbuf[k++];}                                  // first 2*CA_KRYLOV_S+1 elements in each row go to G[][].
+    }
+
+
+    for(i=0;i<2*CA_KRYLOV_S+1;i++)aj[i]=0.0;aj[               0]=1.0;                         // aj selects P[0], i.e. [P,R]aj = p
+    for(i=0;i<2*CA_KRYLOV_S+1;i++)cj[i]=0.0;cj[CA_KRYLOV_S+1]=1.0;                          // cj selects R[0] (PR index CA_KRYLOV_S+1), i.e. [P,R]cj = r
+    for(i=0;i<2*CA_KRYLOV_S+1;i++)ej[i]=0.0;                                                  // no update accumulated yet for this s-step
+
+    for(n=0;n<CA_KRYLOV_S;n++){                                                               // for(n=0;n<CA_KRYLOV_S;n++){
+      level->Krylov_iterations++;                                                               //   record number of inner-loop (j) iterations for comparison
+      gemv( Tpaj,1.0,Tp,  aj,0.0, Tpaj,2*CA_KRYLOV_S+1,2*CA_KRYLOV_S+1);                  //               T'aj
+      gemv(temp1,1.0, G,Tpaj,0.0,temp1,2*CA_KRYLOV_S+1,2*CA_KRYLOV_S+1);                  //    temp1[] = GT'aj
+      gemv(temp2,1.0, G,  cj,0.0,temp2,2*CA_KRYLOV_S+1,2*CA_KRYLOV_S+1);                  //    temp2[] = Gcj
+           aj_dot_GTpaj = vdotv(aj,temp1,2*CA_KRYLOV_S+1);                                    //   (aj,GT'aj)
+             cj_dot_Gcj = vdotv(cj,temp2,2*CA_KRYLOV_S+1);                                    //   (cj,  Gcj)
+      // FIX, can cj_dot_Gcj ever be zero ?
+      if(aj_dot_GTpaj == 0.0){                                                                  //   pivot breakdown ???
+        CGFailed=1;break;                                                                       //
+      }                                                                                         //
+      alpha = cj_dot_Gcj / aj_dot_GTpaj;                                                        //   alpha = (cj,Gcj) / (aj,GT'aj)
+      if(isinf(alpha)){                                                                         //   alpha = big/tiny(overflow) = inf -> breakdown
+        CGFailed=1;break;                                                                       //
+      }                                                                                         //
+      axpy(   ej,1.0,ej,   alpha,   aj,2*CA_KRYLOV_S+1);                                    //   ej[] = ej[] + alpha*aj[]
+      axpy(   cj,1.0,cj,  -alpha, Tpaj,2*CA_KRYLOV_S+1);                                    //   cj[] = cj[] - alpha*T'*aj[]
+      gemv(temp2,1.0, G,  cj,0.0,temp2,2*CA_KRYLOV_S+1,2*CA_KRYLOV_S+1);                  //    temp2[] = Gcj
+         cj_dot_Gcj_new = vdotv(cj,temp2,2*CA_KRYLOV_S+1);                                    //   (cj,  Gcj)
+      // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence... == cj_dot_Gcj_new??
+      L2_norm_of_residual = 0.0;if(cj_dot_Gcj_new>0)L2_norm_of_residual=sqrt(cj_dot_Gcj_new);   // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0)
+      if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_r0){CGConverged=1;break;}   // terminate the inner n-loop
+      if(cj_dot_Gcj_new == 0.0){                                                                //   Lanczos breakdown ???
+        CGFailed=1;break;                                                                       //
+      }                                                                                         //
+      beta = cj_dot_Gcj_new / cj_dot_Gcj;                                                       //   beta = (cj,Gcj)_new / (cj,Gcj)
+      if(isinf(beta)){CGFailed=1;break;}                                                        //   beta = inf?
+      if(beta == 0.0){CGFailed=1;break;}                                                        //   beta = 0?  can't make further progress(?)
+      axpy(   aj,1.0,cj,    beta,   aj,2*CA_KRYLOV_S+1);                                    //   aj[] = cj[] + beta*aj[]
+
+    }                                                                                           // inner n (j) loop
+
+    // update iterates...
+    for(i=0;i<2*CA_KRYLOV_S+1;i++){add_vectors(level,e_id,1.0,e_id,ej[i],PR[i]);}               // e_id[] = [P,R]ej + e_id[]   (applied even when the inner loop exited early via break)
+    if(!CGFailed && !CGConverged){                                                              // if we're done, then there is no point in updating these
+                                   add_vectors(level,  p_id,0.0,  p_id,aj[0],PR[0]);                //    p[] = [P,R]aj
+    for(i=1;i<2*CA_KRYLOV_S+1;i++){add_vectors(level,  p_id,1.0,  p_id,aj[i],PR[i]);}               //          ...
+                                   add_vectors(level, r_id,0.0, r_id,cj[0],PR[0]);                //    r[] = [P,R]cj
+    for(i=1;i<2*CA_KRYLOV_S+1;i++){add_vectors(level, r_id,1.0, r_id,cj[i],PR[i]);}               //          ...
+    }
+                              m+=CA_KRYLOV_S;                                                 //   m advances by a full s-step even if the inner loop exited early
+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  }                                                                                             // } // outer m loop
+
+}
diff --git a/Util/hpgmg/finite-volume/source/solvers/cg.c b/Util/hpgmg/finite-volume/source/solvers/cg.c
new file mode 100644
index 00000000..eaa386a3
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/solvers/cg.c
@@ -0,0 +1,73 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+//------------------------------------------------------------------------------------------------------------------------------
+#define KRYLOV_DIAGONAL_PRECONDITION
+//------------------------------------------------------------------------------------------------------------------------------
+void CG(level_type * level, int x_id, int R_id, double a, double b, double desired_reduction_in_norm){ // updates vector x_id[] in place using right-hand side R_id[]; a,b are forwarded unchanged to residual() and apply_op()
+  // Algorithm 9.1 in Iterative Methods for Sparse Linear Systems(Yousef Saad); z[] is Dinv[]*r[] when KRYLOV_DIAGONAL_PRECONDITION is defined, otherwise a copy of r[]
+  int  r0_id = VECTORS_RESERVED+0; // initial residual
+  int   r_id = VECTORS_RESERVED+1; // current residual
+  int   p_id = VECTORS_RESERVED+2; // search direction
+  int  Ap_id = VECTORS_RESERVED+3; // A applied to the search direction
+  int   z_id = VECTORS_RESERVED+4; // (preconditioned) residual
+
+  int jMax=200; // iteration cap
+  int j=0;
+  int CGFailed    = 0; // set by the breakdown tests below
+  int CGConverged = 0; // set when the residual norm is zero or has dropped below desired_reduction_in_norm*norm_of_r0
+  residual(level,r0_id,x_id,R_id,a,b);                                          // r0[] = R_id[] - A(x_id)
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  if(level->must_subtract_mean == 1){
+    double mean_of_r0 = mean(level,r0_id);
+    shift_vector(level,r0_id,r0_id,-mean_of_r0);                                // r0[] = r0[] - mean(r0)
+  }
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  scale_vector(level,r_id,1.0,r0_id);                                           // r[] = r0[]
+  #ifdef KRYLOV_DIAGONAL_PRECONDITION                                           //
+  mul_vectors(level,z_id,1.0,VECTOR_DINV,r0_id);                                // z[] = Dinv[]*r0[]
+  #else                                                                         //
+  scale_vector(level,z_id,1.0,r0_id);                                           // z[] = I*r0[]
+  #endif                                                                        //
+  scale_vector(level,p_id,1.0,z_id);                                            // p[] = z[]
+  double norm_of_r0 = norm(level,r_id);                                         // the norm of the initial residual...
+  if(norm_of_r0 == 0.0){CGConverged=1;}                                         // entered CG with exact solution
+  double r_dot_z = dot(level,r_id,z_id);                                        // r_dot_z = dot(r,z)
+  while( (j<jMax) && (!CGFailed) && (!CGConverged) ){                           // while(not done){
+    j++;level->Krylov_iterations++;                                             //
+    apply_op(level,Ap_id,p_id,a,b);                                             //   Ap[] = A(p)
+    double Ap_dot_p = dot(level,Ap_id,p_id);                                    //   Ap_dot_p = dot(Ap,p)
+    if(Ap_dot_p == 0.0){CGFailed=1;break;}                                      //   pivot breakdown ???
+    double alpha = r_dot_z / Ap_dot_p;                                          //   alpha = r_dot_z / Ap_dot_p
+    if(isinf(alpha)){CGFailed=1;break;}                                         //   alpha overflowed to inf -> breakdown
+    add_vectors(level,x_id,1.0,x_id, alpha,p_id );                              //   x_id[] = x_id[] + alpha*p[]
+    add_vectors(level,r_id,1.0,r_id,-alpha,Ap_id);                              //   r[]    = r[]    - alpha*Ap[]   (intermediate residual?)
+    //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    if(level->must_subtract_mean == 1){
+      double mean_of_r = mean(level,r_id);
+      shift_vector(level,r_id,r_id,-mean_of_r);                                 //   r[] = r[] - mean(r)
+    }
+    //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    double norm_of_r = norm(level,r_id);                                        //   norm of intermediate residual
+    if(norm_of_r == 0.0){CGConverged=1;break;}                                  //   residual is exactly zero
+    if(norm_of_r < desired_reduction_in_norm*norm_of_r0){CGConverged=1;break;}  //   relative reduction target met
+    #ifdef KRYLOV_DIAGONAL_PRECONDITION                                         //
+    mul_vectors(level,z_id,1.0,VECTOR_DINV,r_id);                               //   z[] = Dinv[]*r[]
+    #else                                                                       //
+    scale_vector(level,z_id,1.0,r_id);                                          //   z[] = I*r[]
+    #endif                                                                      //
+    double r_dot_z_new = dot(level,r_id,z_id);                                  //   r_dot_z_new = dot(r_{j+1},z_{j+1})
+    if(r_dot_z_new == 0.0){CGFailed=1;break;}                                   //   Lanczos breakdown ???
+    double beta = (r_dot_z_new/r_dot_z);                                        //   beta = (r_dot_z_new/r_dot_z)
+    if(isinf(beta)){CGFailed=1;break;}                                          //   beta overflowed to inf -> breakdown
+    add_vectors(level,p_id,1.0,z_id,beta,p_id );                                //   p[] = z[] + beta*p[]
+    r_dot_z = r_dot_z_new;                                                      //   r_dot_z = r_dot_z_new   (becomes the denominator of the next beta)
+  }                                                                             // }
+}
diff --git a/Util/hpgmg/finite-volume/source/solvers/matmul.c b/Util/hpgmg/finite-volume/source/solvers/matmul.c
new file mode 100644
index 00000000..37883d55
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/solvers/matmul.c
@@ -0,0 +1,64 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+void matmul(level_type * level, double *C, int * id_A, int * id_B, int rows, int cols, int A_equals_B_transpose){
+  // *id_A = rows vector_id's (conceptually pointers to the rows    of a rows x level->num_my_boxes*volume matrix)
+  // *id_B = cols vector_id's (conceptually pointers to the columns of a level->num_my_boxes*volume matrix x cols)
+  // *C is a rows x cols matrix stored row-major where C[mm*cols + nn] = dot(id_A[mm],id_B[nn]) over the non-ghost cells of this process's boxes (summed across ranks when USE_MPI is defined)
+
+  // FIX, id_A and id_B are likely the same and thus C[][] will be symmetric (modulo missing row?)
+  // if(A_equals_B_transpose && (cols>=rows)) then use id_B and only run for nn>=mm // common case for s-step Krylov methods
+  // C_is_symmetric && cols< rows (use id_A)
+  int mm,nn;
+  // NOTE(review): A_equals_B_transpose is never read; symmetry is always assumed (only nn>=mm is computed, then mirrored).
+  // NOTE(review): any row mm>=cols is never written, since nn>=mm cannot hold and the mirror store requires an index < cols.
+  double _timeStart = getTime();
+  // FIX... rather than performing an all_reduce on the essentially symmetric [G,g], do the all_reduce on the upper triangle and then duplicate (saves BW)
+  #ifdef _OPENMP
+  #pragma omp parallel for schedule(static,1) collapse(2)
+  #endif
+  for(mm=0;mm<rows;mm++){
+  for(nn=0;nn<cols;nn++){
+  if(nn>=mm){ // upper triangular
+    int box;
+    double a_dot_b_level =  0.0; // dot product accumulated over all of this process's boxes
+    for(box=0;box<level->num_my_boxes;box++){
+      int i,j,k;
+      const int jStride = level->my_boxes[box].jStride;
+      const int kStride = level->my_boxes[box].kStride;
+      const int  ghosts = level->my_boxes[box].ghosts;
+      const int     dim = level->my_boxes[box].dim;
+      double * __restrict__ grid_a = level->my_boxes[box].vectors[id_A[mm]] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
+      double * __restrict__ grid_b = level->my_boxes[box].vectors[id_B[nn]] + ghosts*(1+jStride+kStride); // same offset for the second operand
+      double a_dot_b_box = 0.0; // dot product over this box's dim^3 cells
+      for(k=0;k<dim;k++){
+      for(j=0;j<dim;j++){
+      for(i=0;i<dim;i++){
+        int ijk = i + j*jStride + k*kStride;
+        a_dot_b_box += grid_a[ijk]*grid_b[ijk];
+      }}}
+      a_dot_b_level+=a_dot_b_box;
+    }
+                             C[mm*cols + nn] = a_dot_b_level; // C[mm][nn]
+    if((mm<cols)&&(nn<rows)){C[nn*cols + mm] = a_dot_b_level;}// C[nn][mm]  (mirror into the lower triangle when that element exists)
+  }
+  }}
+  level->timers.blas3 += (double)(getTime()-_timeStart);
+
+  #ifdef USE_MPI
+  double *send_buffer = (double*)malloc(rows*cols*sizeof(double)); // NOTE(review): malloc result is not checked before use
+  for(mm=0;mm<rows;mm++){
+  for(nn=0;nn<cols;nn++){
+    send_buffer[mm*cols + nn] = C[mm*cols + nn]; // copy the local partial sums so C can serve as the receive buffer
+  }}
+  double _timeStartAllReduce = getTime();
+  MPI_Allreduce(send_buffer,C,rows*cols,MPI_DOUBLE,MPI_SUM,level->MPI_COMM_ALLREDUCE); // C = element-wise sum of every rank's partial matrix
+  double _timeEndAllReduce = getTime();
+  level->timers.collectives   += (double)(_timeEndAllReduce-_timeStartAllReduce);
+  free(send_buffer);
+  #endif
+
+}
+
diff --git a/Util/hpgmg/finite-volume/source/timers.c b/Util/hpgmg/finite-volume/source/timers.c
new file mode 100644
index 00000000..cec93e68
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/timers.c
@@ -0,0 +1,14 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#ifdef _OPENMP
+// getTime in OpenMP is now defined as a preprocessor macro (see timers.h), so no function definition is pulled in here
+//#include "./timers/omp.c"
+#elif USE_MPI
+// getTime in MPI is now defined as a preprocessor macro (see timers.h), so no function definition is pulled in here
+//#include "./timers/mpi.c"
+#else
+#include "./timers/x86.c"
+#endif
diff --git a/Util/hpgmg/finite-volume/source/timers.h b/Util/hpgmg/finite-volume/source/timers.h
new file mode 100644
index 00000000..27384357
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/timers.h
@@ -0,0 +1,25 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#ifndef TIMER_H
+#define TIMER_H
+  // getTime() is chosen at compile time: a macro over omp_get_wtime() or MPI_Wtime(), otherwise a user-supplied function
+  #include<stdint.h>
+
+  #ifdef _OPENMP
+    #include <omp.h>
+    #define getTime() (omp_get_wtime())
+
+  #elif USE_MPI
+    #include <mpi.h>
+    #define getTime() (MPI_Wtime())
+
+  #else
+    // user must provide a function getTime and include it in timers.c
+    // if calibration is necessary, then the user must #define CALIBRATE_TIMER
+    double getTime();
+  #endif
+
+#endif
diff --git a/Util/hpgmg/finite-volume/source/timers/mpi.c b/Util/hpgmg/finite-volume/source/timers/mpi.c
new file mode 100644
index 00000000..adc0970e
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/timers/mpi.c
@@ -0,0 +1,10 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdint.h>
+#include <mpi.h>
+double getTime(){ // function form of the MPI timer; timers.c currently has the include of this file commented out
+  return(MPI_Wtime()); // timers are in units of seconds; no conversion is necessary
+}
diff --git a/Util/hpgmg/finite-volume/source/timers/omp.c b/Util/hpgmg/finite-volume/source/timers/omp.c
new file mode 100644
index 00000000..bdf453e6
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/timers/omp.c
@@ -0,0 +1,10 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdint.h>
+#include <omp.h>
+double getTime(){ // function form of the OpenMP timer; timers.c currently has the include of this file commented out
+  return(omp_get_wtime()); // timers are in units of seconds; no conversion is necessary
+}
diff --git a/Util/hpgmg/finite-volume/source/timers/x86.c b/Util/hpgmg/finite-volume/source/timers/x86.c
new file mode 100644
index 00000000..a361b4b9
--- /dev/null
+++ b/Util/hpgmg/finite-volume/source/timers/x86.c
@@ -0,0 +1,12 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdint.h>
+#define CALIBRATE_TIMER // mg.c will calibrate the timer to determine seconds per cycle
+double getTime(){ // returns the x86 time-stamp counter scaled by 1e-9
+  uint64_t lo, hi; // receive EAX and EDX respectively (see the "=a"/"=d" output constraints below)
+  __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); // rdtsc places the low 32 bits of the counter in EAX and the high 32 bits in EDX
+  return( 1e-9*((double)( (((uint64_t)hi) << 32) | ((uint64_t)lo) )) ); // timers are in units of seconds;  assume 1GHz cycle counter and convert later
+}