From 727872d3124d12d386169a3ff05c23caf9051ede Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 21 Jun 2022 07:25:29 -0700
Subject: [PATCH 01/60] Initial work towards unification of gauge fields. 
 Replaced the Gauge_p() accessors with a templated data() method

---
 include/gauge_field.h                         | 130 +++++++---------
 include/gauge_field_order.h                   |  46 +++---
 lib/coarse_op.cuh                             |   8 +-
 lib/coarse_op_preconditioned.cu               |   6 +-
 lib/cpu_gauge_field.cpp                       | 147 +++++-------------
 lib/cuda_gauge_field.cpp                      | 103 ++++--------
 lib/gauge_field.cpp                           |  94 ++++++++++-
 lib/gauge_stout.cu                            |   4 +-
 lib/interface_quda.cpp                        |   8 +-
 lib/staggered_kd_build_xinv.cu                |   4 +-
 lib/staggered_oprod.cu                        |   4 +-
 lib/unitarize_links_quda.cu                   |  31 ++--
 tests/gauge_force_test.cpp                    |  22 +--
 tests/heatbath_test.cpp                       |   6 +-
 tests/hisq_paths_force_test.cpp               |  13 +-
 tests/hisq_unitarize_force_test.cpp           |   8 +-
 .../domain_wall_dslash_reference.cpp          |  68 ++++----
 .../domain_wall_dslash_reference.h            |  52 +++----
 .../host_reference/gauge_force_reference.cpp  |   8 +-
 tests/host_reference/gauge_force_reference.h  |   4 +-
 tests/host_reference/hisq_force_reference.cpp |  21 ++-
 .../staggered_dslash_reference.cpp            |  12 +-
 .../staggered_dslash_reference.h              |   8 +-
 tests/multigrid_evolve_test.cpp               |   6 +-
 tests/staggered_dslash_test_utils.h           |   3 +-
 tests/unitarize_link_test.cpp                 |   4 +-
 tests/utils/host_utils.cpp                    |   2 +-
 tests/utils/host_utils.h                      |   2 +-
 tests/utils/staggered_host_utils.cpp          |   2 +-
 29 files changed, 399 insertions(+), 427 deletions(-)

diff --git a/include/gauge_field.h b/include/gauge_field.h
index 706e82ccf1..7e484bd3a0 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -195,6 +195,9 @@ namespace quda {
   class GaugeField : public LatticeField {
 
   protected:
+    void *gauge; /**< The gauge field allocation */
+    void *gauge_h; /**< Mapped-memory pointer when allocating on the host */
+    void **gauge_qdp; /**< Array of pointers to each subset (QDP order) */
       size_t bytes;        // bytes allocated per full field
       size_t phase_offset; // offset in bytes to gauge phases - useful to keep track of texture alignment
       size_t phase_bytes;  // bytes needed to store the phases
@@ -203,6 +206,7 @@ namespace quda {
       int nColor;
       int nFace;
       QudaFieldGeometry geometry; // whether the field is a scale, vector or tensor
+      int site_dim; // the dimensionality of each site (number of matrices per lattice site)
 
       QudaReconstructType reconstruct;
       int nInternal; // number of degrees of freedom per link matrix
@@ -350,24 +354,46 @@ namespace quda {
 
     size_t TotalBytes() const { return bytes; }
 
-    virtual void* Gauge_p() { errorQuda("Not implemented"); return (void*)0;}
-    virtual void* Even_p() { errorQuda("Not implemented"); return (void*)0;}
-    virtual void* Odd_p() { errorQuda("Not implemented"); return (void*)0;}
+    /**
+       @brief Helper function that returns true if the gauge order is an array of pointers
+       @param[in] order The gauge order to test
+       @return true if the order is QUDA_QDP_GAUGE_ORDER or QUDA_QDPJIT_GAUGE_ORDER, false for every other order
+     */
+    constexpr bool is_pointer_array(QudaGaugeFieldOrder order) const
+    {
+      switch (order) {
+      case QUDA_QDP_GAUGE_ORDER:
+      case QUDA_QDPJIT_GAUGE_ORDER:
+        return true;
+      default:
+        return false;
+      }
+    }
 
-    virtual const void* Gauge_p() const { errorQuda("Not implemented"); return (void*)0;}
-    virtual const void* Even_p() const { errorQuda("Not implemented"); return (void*)0;}
-    virtual const void* Odd_p() const { errorQuda("Not implemented"); return (void*)0;}
+    /**
+       @brief Return the gauge field pointer cast to T: the base allocation (gauge) when T is a single-level pointer, or the array of per-subset pointers (gauge_qdp) when T is a pointer-to-pointer.
+       @tparam T Pointer type to cast to (default void*). A pointer-to-pointer T requires a pointer-array order; a single-level T other than void* requires a non-pointer-array order; errorQuda is called on a mismatch.
+       @return gauge or gauge_qdp reinterpreted as T
+     */
+    template <typename T = void*> auto data() const
+    {
+      static_assert(std::is_pointer_v<T>, "data() requires a pointer cast type");
+
+      using U = typename std::remove_pointer<T>::type; // pointee type of T
+      if constexpr (std::is_pointer_v<U>) { // T is a pointer-to-pointer: hand back the pointer array
+        if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order);
+        return reinterpret_cast<T>(gauge_qdp);
+      } else { // T is a single-level pointer: hand back the base allocation (void* is accepted for any order)
+        if (is_pointer_array(order) && !std::is_same_v<T, void*>) errorQuda("Non dim-array ordered field requested but order is %d", order);
+        return reinterpret_cast<T>(gauge);
+      }
+    }
 
     virtual int full_dim(int d) const { return x[d]; }
 
-    const void** Ghost() const {
+    auto Ghost() const { // deduces to void *const *: the pointer array is read-only, the buffers it addresses are not
       if ( isNative() ) errorQuda("No ghost zone pointer for quda-native gauge fields");
-      return (const void**)ghost;
-    }
-
-    void** Ghost() {
-      if ( isNative() ) errorQuda("No ghost zone pointer for quda-native gauge fields");
-      return ghost;
+      return (void * const *)ghost;
     }
 
     /**
@@ -383,9 +409,9 @@ namespace quda {
     size_t SiteSize() const { return site_size; }
 
     /**
-       Set all field elements to zero (virtual)
+       Set all field elements to zero
     */
-    virtual void zero() = 0;
+    void zero();
 
     /**
      * Generic gauge field copy
@@ -439,15 +465,28 @@ namespace quda {
     */
     static GaugeField* Create(const GaugeFieldParam &param);
 
+    /**
+      @brief If managed memory and prefetch are enabled, prefetch
+      the gauge field and buffers to the CPU or the GPU
+      @param[in] mem_space Memory space we are prefetching to
+      @param[in] stream Which stream to run the prefetch in (defaults to device::get_default_stream())
+    */
+    void prefetch(QudaFieldLocation mem_space, qudaStream_t stream = device::get_default_stream()) const;
+
+    /**
+       @brief Backs up the GaugeField
+    */
+    void backup() const;
+
+    /**
+       @brief Restores the GaugeField
+    */
+    void restore() const;
   };
 
   class cudaGaugeField : public GaugeField {
 
   private:
-    void *gauge;
-    void *gauge_h; // mapped-memory pointer when allocating on the host
-    void *even;
-    void *odd;
 
     /**
        @brief Initialize the padded region to 0
@@ -571,15 +610,6 @@ namespace quda {
     */
     void saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const;
 
-    // (ab)use with care
-    void* Gauge_p() { return gauge; }
-    void* Even_p() { return even; }
-    void* Odd_p() { return odd; }
-
-    const void* Gauge_p() const { return gauge; }
-    const void* Even_p() const { return even; }
-    const void *Odd_p() const { return odd; }
-
     /**
       @brief Copy all contents of the field to a host buffer.
       @param[in] the host buffer to copy to.
@@ -593,29 +623,6 @@ namespace quda {
     virtual void copy_from_buffer(void *buffer);
 
     void setGauge(void* _gauge); //only allowed when create== QUDA_REFERENCE_FIELD_CREATE
-
-    /**
-       Set all field elements to zero
-    */
-    void zero();
-
-    /**
-       @brief Backs up the cudaGaugeField to CPU memory
-    */
-    void backup() const;
-
-    /**
-       @brief Restores the cudaGaugeField to CUDA memory
-    */
-    void restore() const;
-
-    /**
-      @brief If managed memory and prefetch is enabled, prefetch
-      the gauge field and buffers to the CPU or the GPU
-      @param[in] mem_space Memory space we are prefetching to
-      @param[in] stream Which stream to run the prefetch in (default 0)
-    */
-    void prefetch(QudaFieldLocation mem_space, qudaStream_t stream = device::get_default_stream()) const;
   };
 
   class cpuGaugeField : public GaugeField {
@@ -624,9 +631,6 @@ namespace quda {
     friend void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu);
     friend void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const;
 
-  private:
-    void **gauge; // the actual gauge field
-
   public:
     /**
        @brief Constructor for cpuGaugeField from a GaugeFieldParam
@@ -680,9 +684,6 @@ namespace quda {
      */
     void copy(const GaugeField &src);
 
-    void* Gauge_p() { return gauge; }
-    const void* Gauge_p() const { return gauge; }
-
     /**
       @brief Copy all contents of the field to a host buffer.
       @param[in] the host buffer to copy to.
@@ -696,21 +697,6 @@ namespace quda {
     virtual void copy_from_buffer(void *buffer);
 
     void setGauge(void** _gauge); //only allowed when create== QUDA_REFERENCE_FIELD_CREATE
-
-    /**
-       Set all field elements to zero
-    */
-    void zero();
-
-    /**
-       @brief Backs up the cpuGaugeField
-    */
-    void backup() const;
-
-    /**
-       @brief Restores the cpuGaugeField
-    */
-    void restore() const;
   };
 
   /**
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 5f0186533f..2b0f4e2faa 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -355,10 +355,9 @@ namespace quda {
         scale(static_cast<Float>(1.0)),
         scale_inv(static_cast<Float>(1.0))
       {
-	for (int d=0; d<U.Geometry(); d++)
-	  u[d] = gauge_ ? static_cast<complex<storeFloat>**>(gauge_)[d] :
-	    static_cast<complex<storeFloat>**>(const_cast<void*>(U.Gauge_p()))[d];
-	resetScale(U.Scale());
+        for (int d = 0; d < U.Geometry(); d++)
+          u[d] = gauge_ ? static_cast<complex<storeFloat> **>(gauge_)[d] : U.data<complex<storeFloat> *const *>()[d];
+        resetScale(U.Scale());
       }
 
       void resetScale(Float max)
@@ -466,8 +465,7 @@ namespace quda {
       static constexpr bool fixed = fixed_point<Float,storeFloat>();
 
       Accessor(const GaugeField &U, void *gauge_ = nullptr, void ** = nullptr) :
-        u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) :
-                   static_cast<complex<storeFloat> *>(const_cast<void *>(U.Gauge_p()))),
+        u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) : U.data<complex<storeFloat> *>()),
         volumeCB(U.VolumeCB()),
         geometry(U.Geometry()),
         scale(static_cast<Float>(1.0)),
@@ -601,8 +599,7 @@ namespace quda {
       static constexpr bool fixed = fixed_point<Float,storeFloat>();
 
       Accessor(const GaugeField &U, void *gauge_ = nullptr, void ** = nullptr) :
-        u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) :
-                   static_cast<complex<storeFloat> *>(const_cast<void *>(U.Gauge_p()))),
+        u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) : U.data<complex<storeFloat> *>()),
         offset_cb((U.Bytes() >> 1) / sizeof(complex<storeFloat>)),
         volumeCB(U.VolumeCB()),
         stride(U.Stride()),
@@ -1512,7 +1509,7 @@ namespace quda {
 
         FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
           reconstruct(u),
-          gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
+          gauge(gauge_ ? gauge_ : u.data<Float *>()),
           offset(u.Bytes() / (2 * sizeof(Float) * N)),
           ghostExchange(u.GhostExchange()),
           volumeCB(u.VolumeCB()),
@@ -1829,7 +1826,9 @@ namespace quda {
       const int volumeCB;
     QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
       : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
-	{ for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
+    {
+      for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data<Float *const *>()[i];
+    }
 
         __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const
         {
@@ -1873,7 +1872,9 @@ namespace quda {
       const int volumeCB;
     QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
       : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
-	{ for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
+    {
+      for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data<Float *const *>()[i];
+    }
 
         __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const
         {
@@ -1920,9 +1921,14 @@ namespace quda {
     Float *gauge;
     const int volumeCB;
     const int geometry;
-  MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) :
-    LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
-      volumeCB(u.VolumeCB()), geometry(u.Geometry()) { ; }
+    MILCOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
+      LegacyOrder<Float, length>(u, ghost_),
+      gauge(gauge_ ? gauge_ : u.data<Float *>()),
+      volumeCB(u.VolumeCB()),
+      geometry(u.Geometry())
+    {
+      ;
+    }
 
   __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const
   {
@@ -1980,7 +1986,7 @@ namespace quda {
     const size_t size;
     MILCSiteOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
       LegacyOrder<Float, length>(u, ghost_),
-      gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
+      gauge(gauge_ ? gauge_ : u.data<Float *>()),
       volumeCB(u.VolumeCB()),
       geometry(u.Geometry()),
       offset(u.SiteOffset()),
@@ -2040,7 +2046,7 @@ namespace quda {
     const int geometry;
     CPSOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
       LegacyOrder<Float, length>(u, ghost_),
-      gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
+      gauge(gauge_ ? gauge_ : u.data<Float *>()),
       volumeCB(u.VolumeCB()),
       anisotropy(u.Anisotropy()),
       anisotropy_inv(1.0 / anisotropy),
@@ -2106,9 +2112,7 @@ namespace quda {
       int exVolumeCB; // extended checkerboard volume
       static constexpr int Nc = 3;
       BQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
-        LegacyOrder<Float, length>(u, ghost_),
-        gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
-        volumeCB(u.VolumeCB())
+        LegacyOrder<Float, length>(u, ghost_), gauge(gauge_ ? gauge_ : u.data<Float *>()), volumeCB(u.VolumeCB())
       {
         if constexpr (length != 18) errorQuda("Gauge length %d not supported", length);
         // compute volumeCB + halo region
@@ -2172,7 +2176,7 @@ namespace quda {
       const real scale_inv;
       TIFROrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
         LegacyOrder<Float, length>(u, ghost_),
-        gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
+        gauge(gauge_ ? gauge_ : u.data<Float *>()),
         volumeCB(u.VolumeCB()),
         scale(u.Scale()),
         scale_inv(1.0 / scale)
@@ -2239,7 +2243,7 @@ namespace quda {
       const int exDim[4];
       TIFRPaddedOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
         LegacyOrder<Float, length>(u, ghost_),
-        gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
+        gauge(gauge_ ? gauge_ : u.data<Float *>()),
         volumeCB(u.VolumeCB()),
         exVolumeCB(1),
         scale(u.Scale()),
diff --git a/lib/coarse_op.cuh b/lib/coarse_op.cuh
index fbefd474a0..67433a2291 100644
--- a/lib/coarse_op.cuh
+++ b/lib/coarse_op.cuh
@@ -877,8 +877,8 @@ namespace quda {
 	X_atomic.backup();
         break;
       case COMPUTE_CONVERT:
-	if (Y_atomic.Gauge_p() == Y.Gauge_p()) Y.backup();
-	if (X_atomic.Gauge_p() == X.Gauge_p()) X.backup();
+	if (Y_atomic.data() == Y.data()) Y.backup();
+	if (X_atomic.data() == X.data()) X.backup();
         break;
       case COMPUTE_RESCALE:
         Y.backup();
@@ -911,8 +911,8 @@ namespace quda {
 	X_atomic.restore();
         break;
       case COMPUTE_CONVERT:
-	if (Y_atomic.Gauge_p() == Y.Gauge_p()) Y.restore();
-	if (X_atomic.Gauge_p() == X.Gauge_p()) X.restore();
+	if (Y_atomic.data() == Y.data()) Y.restore();
+	if (X_atomic.data() == X.data()) X.restore();
         break;
       case COMPUTE_RESCALE:
         Y.restore();
diff --git a/lib/coarse_op_preconditioned.cu b/lib/coarse_op_preconditioned.cu
index 9cf4da755e..7a75c1d895 100644
--- a/lib/coarse_op_preconditioned.cu
+++ b/lib/coarse_op_preconditioned.cu
@@ -174,8 +174,7 @@ namespace quda
       GaugeField *X_aos = create_gauge_copy(X, true);
       Xinv_aos = create_gauge_copy(Xinv, false);
 
-      blas::flops += invert((void *)Xinv_aos->Gauge_p(), (void *)X_aos->Gauge_p(), n, X_aos->Volume(),
-                            X_aos->Precision(), X.Location());
+      blas::flops += invert(Xinv_aos->data(), X_aos->data(), n, X_aos->Volume(), X_aos->Precision(), X.Location());
 
       if (&Xinv != Xinv_aos) {
         if (Xinv.Precision() < QUDA_SINGLE_PRECISION) Xinv.Scale(Xinv_aos->abs_max());
@@ -188,7 +187,8 @@ namespace quda
     } else if (X.Location() == QUDA_CPU_FIELD_LOCATION && X.Order() == QUDA_QDP_GAUGE_ORDER) {
       const cpuGaugeField *X_h = static_cast<const cpuGaugeField*>(&X);
       cpuGaugeField *Xinv_h = static_cast<cpuGaugeField*>(&Xinv);
-      blas::flops += invert(*(void**)Xinv_h->Gauge_p(), *(void**)X_h->Gauge_p(), n, X_h->Volume(), X.Precision(), X.Location());
+      blas::flops += invert(Xinv_h->data<void *const *>()[0], X_h->data<void *const *>()[0], n, X_h->Volume(),
+                            X.Precision(), X.Location());
     } else {
       errorQuda("Unsupported location=%d and order=%d", X.Location(), X.Order());
     }
diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp
index f4b27109a8..f3063d5d32 100644
--- a/lib/cpu_gauge_field.cpp
+++ b/lib/cpu_gauge_field.cpp
@@ -26,39 +26,30 @@ namespace quda {
       errorQuda("10-reconstruction only supported with momentum links");
     }
 
-    int siteDim=0;
-    if (geometry == QUDA_SCALAR_GEOMETRY) siteDim = 1;
-    else if (geometry == QUDA_VECTOR_GEOMETRY) siteDim = nDim;
-    else if (geometry == QUDA_TENSOR_GEOMETRY) siteDim = nDim * (nDim-1) / 2;
-    else if (geometry == QUDA_COARSE_GEOMETRY) siteDim = 2*nDim;
-    else if (geometry == QUDA_KDINVERSE_GEOMETRY)
-      siteDim = 1 << nDim;
-    else errorQuda("Unknown geometry type %d", geometry);
-
     // compute the correct bytes size for these padded field orders
     if (order == QUDA_TIFR_PADDED_GAUGE_ORDER) {
-      bytes = siteDim * (x[0]*x[1]*(x[2]+4)*x[3]) * nInternal * precision;
+      bytes = site_dim * (x[0] * x[1] * (x[2] + 4) * x[3]) * nInternal * precision;
     } else if (order == QUDA_BQCD_GAUGE_ORDER) {
-      bytes = siteDim * (x[0]+4)*(x[1]+2)*(x[2]+2)*(x[3]+2) * nInternal * precision;
+      bytes = site_dim * (x[0] + 4) * (x[1] + 2) * (x[2] + 2) * (x[3] + 2) * nInternal * precision;
     } else if (order == QUDA_MILC_SITE_GAUGE_ORDER) {
       bytes = volume * site_size;
     }
 
     if (order == QUDA_QDP_GAUGE_ORDER) {
-      gauge = (void**) safe_malloc(siteDim * sizeof(void*));
-
-      for (int d=0; d<siteDim; d++) {
-	size_t nbytes = volume * nInternal * precision;
-	if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
-          gauge[d] = nbytes ? safe_malloc(nbytes) : nullptr;
-          if (create == QUDA_ZERO_FIELD_CREATE && nbytes) memset(gauge[d], 0, nbytes);
+      gauge = safe_malloc(site_dim * sizeof(void *));
+      size_t nbytes = volume * nInternal * precision;
+      gauge_qdp = reinterpret_cast<void **>(gauge);
+      for (int d = 0; d < site_dim; d++) {
+        if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
+          gauge_qdp[d] = nbytes ? safe_malloc(nbytes) : nullptr;
+          if (create == QUDA_ZERO_FIELD_CREATE && nbytes) memset(gauge_qdp[d], 0, nbytes);
         } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
-          gauge[d] = ((void **)param.gauge)[d];
+          gauge_qdp[d] = ((void **)param.gauge)[d];
         } else {
           errorQuda("Unsupported creation type %d", create);
         }
       }
-    
+
     } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER  ||
 	       order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_TIFR_GAUGE_ORDER ||
 	       order == QUDA_TIFR_PADDED_GAUGE_ORDER || order == QUDA_MILC_SITE_GAUGE_ORDER) {
@@ -71,7 +62,7 @@ namespace quda {
         gauge = bytes ? (void **)safe_malloc(bytes) : nullptr;
         if (create == QUDA_ZERO_FIELD_CREATE && bytes) memset(gauge, 0, bytes);
       } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
-	gauge = (void**) param.gauge;
+        gauge = param.gauge;
       } else {
 	errorQuda("Unsupported creation type %d", create);
       }
@@ -104,24 +95,13 @@ namespace quda {
 
   cpuGaugeField::~cpuGaugeField()
   {
-    int siteDim = 0;
-    if (geometry == QUDA_SCALAR_GEOMETRY) siteDim = 1;
-    else if (geometry == QUDA_VECTOR_GEOMETRY) siteDim = nDim;
-    else if (geometry == QUDA_TENSOR_GEOMETRY) siteDim = nDim * (nDim-1) / 2;
-    else if (geometry == QUDA_COARSE_GEOMETRY) siteDim = 2*nDim;
-    else if (geometry == QUDA_KDINVERSE_GEOMETRY)
-      siteDim = 1 << nDim;
-    else errorQuda("Unknown geometry type %d", geometry);
-
     if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
       if (order == QUDA_QDP_GAUGE_ORDER) {
-	for (int d=0; d<siteDim; d++) {
-	  if (gauge[d]) host_free(gauge[d]);
-	}
-	if (gauge) host_free(gauge);
-      } else {
-	if (gauge) host_free(gauge);
+        for (int d = 0; d < site_dim; d++) {
+          if (gauge_qdp[d]) host_free(gauge_qdp[d]);
+        }
       }
+      if (gauge) host_free(gauge);
     } else { // QUDA_REFERENCE_FIELD_CREATE 
       if (order == QUDA_QDP_GAUGE_ORDER){
 	if (gauge) host_free(gauge);
@@ -282,11 +262,10 @@ namespace quda {
 
 	if (!src.isNative()) errorQuda("Only native order is supported");
 	void *buffer = pool_pinned_malloc(src.Bytes());
-	// this copies over both even and odd
-        qudaMemcpy(buffer, static_cast<const cudaGaugeField &>(src).Gauge_p(), src.Bytes(), qudaMemcpyDeviceToHost);
+        qudaMemcpy(buffer, src.data(), src.Bytes(), qudaMemcpyDeviceToHost);
 
-        copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, gauge, buffer);
-	pool_pinned_free(buffer);
+        copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, nullptr, buffer);
+        pool_pinned_free(buffer);
 
       } else { // else on the GPU
 
@@ -297,9 +276,11 @@ namespace quda {
 	void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, order, geometry) : nullptr;
 
 	if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) {
-	  copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0, ghost_buffer, 0);
-	  if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0, ghost_buffer, 0, 3); // forwards links if bi-directional
-	} else {
+          copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr);
+          if (geometry == QUDA_COARSE_GEOMETRY)
+            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr,
+                             3); // forwards links if bi-directional
+        } else {
 	  copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0);
 	}
 
@@ -321,8 +302,7 @@ namespace quda {
 
     } else if (typeid(src) == typeid(cpuGaugeField)) {
       // copy field and ghost zone directly
-      copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, gauge,
-		       const_cast<void*>(static_cast<const cpuGaugeField&>(src).Gauge_p()));
+      copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION);
     } else {
       errorQuda("Invalid gauge field type");
     }
@@ -343,88 +323,35 @@ namespace quda {
     gauge = gauge_;
   }
 
-  void cpuGaugeField::backup() const {
-    if (backed_up) errorQuda("Gauge field already backed up");
-
-    if (order == QUDA_QDP_GAUGE_ORDER) {
-      char **buffer = new char*[geometry];
-      for (int d=0; d<geometry; d++) {
-	buffer[d] = new char[bytes/geometry];
-	memcpy(buffer[d], gauge[d], bytes/geometry);
-      }
-      backup_h = reinterpret_cast<char*>(buffer);
-    } else {
-      backup_h = new char[bytes];
-      memcpy(backup_h, gauge, bytes);
-    }
-
-    backed_up = true;
-  }
-
-  void cpuGaugeField::restore() const
-  {
-    if (!backed_up) errorQuda("Cannot restore since not backed up");
-
-    if (order == QUDA_QDP_GAUGE_ORDER) {
-      char **buffer = reinterpret_cast<char**>(backup_h);
-      for (int d=0; d<geometry; d++) {
-	memcpy(gauge[d], buffer[d], bytes/geometry);
-	delete []buffer[d];
-      }
-      delete []buffer;
-    } else {
-      memcpy(gauge, backup_h, bytes);
-      delete []backup_h;
-    }
-
-    backed_up = false;
-  }
-
-  void cpuGaugeField::zero() {
-    if (order != QUDA_QDP_GAUGE_ORDER) {
-      memset(gauge, 0, bytes);
-    } else {
-      for (int g=0; g<geometry; g++) memset(gauge[g], 0, volume * nInternal * precision);
-    }
-  }
-
   void cpuGaugeField::copy_to_buffer(void *buffer) const
   {
-
-    if (Order() == QUDA_QDP_GAUGE_ORDER || Order() == QUDA_QDPJIT_GAUGE_ORDER) {
-      void *const *p = static_cast<void *const *>(Gauge_p());
-      int dbytes = Bytes() / 4;
-      static_assert(sizeof(char) == 1, "Assuming sizeof(char) == 1");
+    if (is_pointer_array(order)) { // pointer-array orders: pack the geometry sub-arrays back-to-back into buffer
       char *dst_buffer = reinterpret_cast<char *>(buffer);
-      for (int d = 0; d < 4; d++) { std::memcpy(&dst_buffer[d * dbytes], p[d], dbytes); }
+      for (int d = 0; d < geometry; d++) {
+        std::memcpy(&dst_buffer[d * bytes / geometry], data<void *const *>()[d], bytes / geometry);
+      }
     } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
                || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
                || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
-      const void *p = Gauge_p();
-      int bytes = Bytes();
-      std::memcpy(buffer, p, bytes);
+      std::memcpy(buffer, data(), Bytes()); // these orders are copied as one block of Bytes() bytes
     } else {
-      errorQuda("Unsupported order = %d\n", Order());
+      errorQuda("Unsupported order = %d", Order());
     }
   }
 
   void cpuGaugeField::copy_from_buffer(void *buffer)
   {
-
-    if (Order() == QUDA_QDP_GAUGE_ORDER || Order() == QUDA_QDPJIT_GAUGE_ORDER) {
-      void **p = static_cast<void **>(Gauge_p());
-      size_t dbytes = Bytes() / 4;
-      static_assert(sizeof(char) == 1, "Assuming sizeof(char) == 1");
+    if (is_pointer_array(order)) { // pointer-array orders: unpack buffer into the geometry sub-arrays
       const char *dst_buffer = reinterpret_cast<const char *>(buffer);
-      for (int d = 0; d < 4; d++) { std::memcpy(p[d], &dst_buffer[d * dbytes], dbytes); }
+      for (int d = 0; d < geometry; d++) {
+        std::memcpy(data<void *const *>()[d], &dst_buffer[d * bytes / geometry], Bytes() / geometry); // NOTE(review): dst_buffer is the copy source here, despite its name
+      }
     } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
                || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
                || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
-      void *p = Gauge_p();
-      size_t bytes = Bytes();
-      std::memcpy(p, buffer, bytes);
+      std::memcpy(data(), buffer, Bytes()); // these orders are copied as one block of Bytes() bytes
     } else {
-      errorQuda("Unsupported order = %d\n", Order());
+      errorQuda("Unsupported order = %d", Order());
     }
   }
 
diff --git a/lib/cuda_gauge_field.cpp b/lib/cuda_gauge_field.cpp
index a7e6ff8952..23b4331cc0 100644
--- a/lib/cuda_gauge_field.cpp
+++ b/lib/cuda_gauge_field.cpp
@@ -7,8 +7,7 @@
 
 namespace quda {
 
-  cudaGaugeField::cudaGaugeField(const GaugeFieldParam &param) :
-    GaugeField(param), gauge(0), even(0), odd(0)
+  cudaGaugeField::cudaGaugeField(const GaugeFieldParam &param) : GaugeField(param)
   {
     if ((order == QUDA_QDP_GAUGE_ORDER || order == QUDA_QDPJIT_GAUGE_ORDER) &&
         create != QUDA_REFERENCE_FIELD_CREATE) {
@@ -68,8 +67,6 @@ namespace quda {
       if (create == QUDA_REFERENCE_FIELD_CREATE) exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
     }
 
-    even = gauge;
-    odd = static_cast<char*>(gauge) + bytes/2;
     if (create != QUDA_ZERO_FIELD_CREATE && isNative() && ghostExchange == QUDA_GHOST_EXCHANGE_PAD) zeroPad();
   }
 
@@ -79,8 +76,10 @@ namespace quda {
 
     size_t pitch = stride*order*precision;
     if (pad_bytes) {
-      qudaMemset2D(static_cast<char *>(even) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad);
-      qudaMemset2D(static_cast<char *>(odd) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad);
+      qudaMemset2D(static_cast<char *>(gauge) + 0 * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes,
+                   Npad);
+      qudaMemset2D(static_cast<char *>(gauge) + 1 * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes,
+                   Npad);
     }
   }
 
@@ -511,12 +510,12 @@ namespace quda {
 
       if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
         // copy field and ghost zone into this field
-        copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge);
+        copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION);
 
         if (geometry == QUDA_COARSE_GEOMETRY)
-          copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge, 0, 0, 3);
+          copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, nullptr, nullptr, nullptr, 3);
       } else {
-        copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge);
+        copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, nullptr);
         if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
       }
 
@@ -526,17 +525,15 @@ namespace quda {
 
 	if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
 	  // copy field and ghost zone into buffer
-	  copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge);
+          copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr);
 
           if (geometry == QUDA_COARSE_GEOMETRY)
-            copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField &>(src).gauge,
-                             0, 0, 3);
+            copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr, 0, 0, 3);
         } else {
-	  copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge);
+          copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr);
           if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
 	}
 
-	// this copies over both even and odd
         qudaMemcpy(gauge, buffer, bytes, qudaMemcpyDefault);
         pool_pinned_free(buffer);
       } else { // else on the GPU
@@ -545,7 +542,7 @@ namespace quda {
             src.Order() == QUDA_BQCD_GAUGE_ORDER      ||
             src.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
 	  // special case where we use zero-copy memory to read/write directly from application's array
-          void *src_d = get_mapped_device_pointer(src.Gauge_p());
+          void *src_d = get_mapped_device_pointer(src.data());
 
           if (src.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
             copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, src_d);
@@ -562,10 +559,10 @@ namespace quda {
 
 	  if (src.Order() == QUDA_QDP_GAUGE_ORDER) {
 	    for (int d=0; d<geometry; d++) {
-              qudaMemcpy(((void **)buffer)[d], ((void **)src.Gauge_p())[d], src.Bytes() / geometry, qudaMemcpyDefault);
+              qudaMemcpy(((void **)buffer)[d], src.data<void *const *>()[d], src.Bytes() / geometry, qudaMemcpyDefault);
             }
           } else {
-            qudaMemcpy(buffer, src.Gauge_p(), src.Bytes(), qudaMemcpyDefault);
+            qudaMemcpy(buffer, src.data(), src.Bytes(), qudaMemcpyDefault);
           }
 
           if (src.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
@@ -574,11 +571,11 @@ namespace quda {
               qudaMemcpy(ghost_buffer[d], src.Ghost()[d], ghost_bytes[d], qudaMemcpyDefault);
 
           if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
-            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer);
+            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer);
             if (geometry == QUDA_COARSE_GEOMETRY)
-              copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer, 3);
+              copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer, 3);
           } else {
-            copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer);
+            copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer);
             if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
           }
           free_gauge_buffer(buffer, src.Order(), src.Geometry());
@@ -612,7 +609,7 @@ namespace quda {
 
   void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const
   {
-    static_cast<LatticeField&>(cpu).checkField(*this);
+    cpu.checkField(*this);
 
     if (reorder_location() == QUDA_CUDA_FIELD_LOCATION) {
 
@@ -620,9 +617,9 @@ namespace quda {
           cpu.Order() == QUDA_BQCD_GAUGE_ORDER      ||
           cpu.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
 	// special case where we use zero-copy memory to read/write directly from application's array
-        void *cpu_d = get_mapped_device_pointer(cpu.Gauge_p());
+        void *cpu_d = get_mapped_device_pointer(cpu.data());
         if (cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
-          copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, cpu_d, gauge);
+          copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, cpu_d, nullptr);
         } else {
           errorQuda("Ghost copy not supported here");
         }
@@ -636,17 +633,18 @@ namespace quda {
 	void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, cpu.Order(), geometry) : nullptr;
 
 	if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
-	  copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0);
-	  if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0, 3);
-	} else {
-	  copyExtendedGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge);
-	}
+          copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr);
+          if (geometry == QUDA_COARSE_GEOMETRY)
+            copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr, 3);
+        } else {
+          copyExtendedGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr);
+        }
 
-	if (cpu.Order() == QUDA_QDP_GAUGE_ORDER) {
+        if (cpu.Order() == QUDA_QDP_GAUGE_ORDER) {
           for (int d = 0; d < geometry; d++)
-            qudaMemcpy(((void **)cpu.gauge)[d], ((void **)buffer)[d], cpu.Bytes() / geometry, qudaMemcpyDefault);
+            qudaMemcpy((cpu.data<void *const *>())[d], ((void **)buffer)[d], cpu.Bytes() / geometry, qudaMemcpyDefault);
         } else {
-          qudaMemcpy(cpu.gauge, buffer, cpu.Bytes(), qudaMemcpyDefault);
+          qudaMemcpy(cpu.data(), buffer, cpu.Bytes(), qudaMemcpyDefault);
         }
 
         if (cpu.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
@@ -663,9 +661,9 @@ namespace quda {
       qudaMemcpy(buffer, gauge, bytes, qudaMemcpyDefault);
 
       if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
-	copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer);
+        copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, nullptr, buffer);
       } else {
-	copyExtendedGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer);
+        copyExtendedGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, nullptr, buffer);
       }
       pool_pinned_free(buffer);
 
@@ -685,46 +683,11 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_D2H);
   }
 
-  void cudaGaugeField::backup() const {
-    if (backed_up) errorQuda("Gauge field already backed up");
-    backup_h = new char[bytes];
-    qudaMemcpy(backup_h, gauge, bytes, qudaMemcpyDefault);
-    backed_up = true;
-  }
-
-  void cudaGaugeField::restore() const
-  {
-    if (!backed_up) errorQuda("Cannot restore since not backed up");
-    qudaMemcpy(gauge, backup_h, bytes, qudaMemcpyDefault);
-    delete []backup_h;
-    backed_up = false;
-  }
-
-  void cudaGaugeField::prefetch(QudaFieldLocation mem_space, qudaStream_t stream) const
-  {
-    if (is_prefetch_enabled() && mem_type == QUDA_MEMORY_DEVICE) {
-      if (gauge) qudaMemPrefetchAsync(gauge, bytes, mem_space, stream);
-      if (!isNative()) {
-        for (int i = 0; i < nDim; i++) {
-          size_t nbytes = nFace * surface[i] * nInternal * precision;
-          if (ghost[i] && nbytes) qudaMemPrefetchAsync(ghost[i], nbytes, mem_space, stream);
-          if (ghost[i + 4] && nbytes && geometry == QUDA_COARSE_GEOMETRY)
-            qudaMemPrefetchAsync(ghost[i + 4], nbytes, mem_space, stream);
-        }
-      }
-    }
-  }
-
-  void cudaGaugeField::zero() { qudaMemset(gauge, 0, bytes); }
-
   void cudaGaugeField::copy_to_buffer(void *buffer) const
   {
-    qudaMemcpy(buffer, Gauge_p(), Bytes(), qudaMemcpyDeviceToHost);
+    qudaMemcpy(buffer, data(), Bytes(), qudaMemcpyDeviceToHost);
   }
 
-  void cudaGaugeField::copy_from_buffer(void *buffer)
-  {
-    qudaMemcpy(Gauge_p(), buffer, Bytes(), qudaMemcpyHostToDevice);
-  }
+  void cudaGaugeField::copy_from_buffer(void *buffer) { qudaMemcpy(data(), buffer, Bytes(), qudaMemcpyHostToDevice); }
 
 } // namespace quda
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 1181ecb733..ea17cb4610 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -29,12 +29,16 @@ namespace quda {
 
   GaugeField::GaugeField(const GaugeFieldParam &param) :
     LatticeField(param),
+    gauge(nullptr),
+    gauge_h(nullptr),
+    gauge_qdp {},
     bytes(0),
     phase_offset(0),
     phase_bytes(0),
     nColor(param.nColor),
     nFace(param.nFace),
     geometry(param.geometry),
+    site_dim(1),
     reconstruct(param.reconstruct),
     nInternal(reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : nColor * nColor * 2),
     order(param.order),
@@ -103,6 +107,19 @@ namespace quda {
     }
     total_bytes = bytes;
 
+    if (geometry == QUDA_SCALAR_GEOMETRY)
+      site_dim = 1;
+    else if (geometry == QUDA_VECTOR_GEOMETRY)
+      site_dim = nDim;
+    else if (geometry == QUDA_TENSOR_GEOMETRY)
+      site_dim = nDim * (nDim - 1) / 2;
+    else if (geometry == QUDA_COARSE_GEOMETRY)
+      site_dim = 2 * nDim;
+    else if (geometry == QUDA_KDINVERSE_GEOMETRY)
+      site_dim = 1 << nDim;
+    else
+      errorQuda("Unknown geometry type %d", geometry);
+
     setTuningString();
   }
 
@@ -296,6 +313,19 @@ namespace quda {
     return output;  // for multiple << operators.
   }
 
+  void GaugeField::zero()
+  {
+    // fields at the CUDA location are cleared through qudaMemset, everything else with plain memset
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      qudaMemset(gauge, 0, bytes);
+    } else if (order != QUDA_QDP_GAUGE_ORDER) {
+      memset(gauge, 0, bytes); // the whole field is cleared through the single gauge pointer
+    } else {
+      // QDP order: clear each of the per-geometry arrays held in gauge_qdp in turn
+      for (int d = 0; d < geometry; d++) memset(gauge_qdp[d], 0, volume * nInternal * precision);
+    }
+  }
+
   ColorSpinorParam colorSpinorParam(const GaugeField &a) {
    if (a.FieldOrder() == QUDA_QDP_GAUGE_ORDER || a.FieldOrder() == QUDA_QDPJIT_GAUGE_ORDER)
      errorQuda("Not implemented for this order %d", a.FieldOrder());
@@ -318,7 +348,7 @@ namespace quda {
     spinor_param.setPrecision(a.Precision(), a.Precision(), true);
     spinor_param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
     spinor_param.create = QUDA_REFERENCE_FIELD_CREATE;
-    spinor_param.v = (void*)a.Gauge_p();
+    spinor_param.v = a.data();
     spinor_param.location = a.Location();
     return spinor_param;
   }
@@ -417,4 +447,66 @@ namespace quda {
     return padded_cpu;
   }
 
+  void GaugeField::prefetch(QudaFieldLocation mem_space, qudaStream_t stream) const
+  {
+    // only act on CUDA-location fields with device memory type, and only when prefetching is enabled
+    if (location != QUDA_CUDA_FIELD_LOCATION || !is_prefetch_enabled() || mem_type != QUDA_MEMORY_DEVICE) return;
+
+    if (gauge) qudaMemPrefetchAsync(gauge, bytes, mem_space, stream);
+    if (isNative()) return; // the ghost buffers are only prefetched for non-native orders
+    for (int i = 0; i < nDim; i++) {
+      size_t nbytes = nFace * surface[i] * nInternal * precision;
+      if (ghost[i] && nbytes) qudaMemPrefetchAsync(ghost[i], nbytes, mem_space, stream);
+      if (ghost[i + 4] && nbytes && geometry == QUDA_COARSE_GEOMETRY)
+        qudaMemPrefetchAsync(ghost[i + 4], nbytes, mem_space, stream);
+    }
+  }
+
+  void GaugeField::backup() const
+  {
+    if (backed_up) errorQuda("Gauge field already backed up");
+
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      backup_h = new char[bytes];
+      qudaMemcpy(backup_h, gauge, bytes, qudaMemcpyDefault);
+    } else if (order != QUDA_QDP_GAUGE_ORDER) {
+      backup_h = new char[bytes];
+      memcpy(backup_h, gauge, bytes);
+    } else {
+      // QDP order: copy every per-geometry array on its own and stash the pointer table in backup_h
+      const auto component_bytes = bytes / geometry;
+      char **components = new char *[geometry];
+      for (int d = 0; d < geometry; d++) {
+        components[d] = new char[component_bytes];
+        memcpy(components[d], gauge_qdp[d], component_bytes);
+      }
+      backup_h = reinterpret_cast<char *>(components);
+    }
+
+    backed_up = true;
+  }
+
+  void GaugeField::restore() const
+  {
+    if (!backed_up) errorQuda("Cannot restore since not backed up");
+
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      qudaMemcpy(gauge, backup_h, bytes, qudaMemcpyDefault);
+      delete[] backup_h;
+    } else if (order == QUDA_QDP_GAUGE_ORDER) {
+      // QDP order: backup_h holds the table of per-geometry buffers that backup() allocated
+      char **buffer = reinterpret_cast<char **>(backup_h);
+      for (int d = 0; d < geometry; d++) {
+        memcpy(gauge_qdp[d], buffer[d], bytes / geometry);
+        delete[] buffer[d];
+      }
+      delete[] buffer;
+    } else {
+      memcpy(gauge, backup_h, bytes);
+      delete[] backup_h;
+    }
+    backup_h = nullptr; // the backup storage has been released: do not leave a dangling pointer behind
+    backed_up = false;
+  }
+
 } // namespace quda
diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu
index af644ec238..f74f3fd50a 100644
--- a/lib/gauge_stout.cu
+++ b/lib/gauge_stout.cu
@@ -41,8 +41,8 @@ namespace quda {
       }
     }
 
-    void preTune() { if (out.Gauge_p() == in.Gauge_p()) out.backup(); }
-    void postTune() { if (out.Gauge_p() == in.Gauge_p()) out.restore(); }
+    void preTune() { if (out.data() == in.data()) out.backup(); }
+    void postTune() { if (out.data() == in.data()) out.restore(); }
 
     long long flops() const // just counts matrix multiplication
     {
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 5ff1191dac..9a8878499b 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3324,11 +3324,11 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     // the split topology.
     if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading gauge field...\n"); }
     if (!is_staggered) {
-      loadGaugeQuda(collected_gauge->Gauge_p(), gauge_param);
+      loadGaugeQuda(collected_gauge->data(), gauge_param);
     } else {
       // freeGaugeQuda();
-      loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->Gauge_p(),
-                           collected_milc_longlink_field->Gauge_p());
+      loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->data(),
+                           collected_milc_longlink_field->data());
     }
     if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loaded gauge field...\n"); }
 
@@ -4619,7 +4619,7 @@ void computeHISQForceQuda(void* const milc_momentum,
 
   if (*num_failures_h>0) errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h);
 
-  qudaMemset((void **)(cudaOutForce->Gauge_p()), 0, cudaOutForce->Bytes());
+  cudaOutForce->zero();
 
   // read in u-link
   cudaGauge->loadCPUField(cpuULink, profileHISQForce);
diff --git a/lib/staggered_kd_build_xinv.cu b/lib/staggered_kd_build_xinv.cu
index b109bc2388..34ddb23137 100644
--- a/lib/staggered_kd_build_xinv.cu
+++ b/lib/staggered_kd_build_xinv.cu
@@ -245,11 +245,11 @@ namespace quda {
         
         X_.copy(X);
 
-        blas::flops += invert((void*)xInvMilcOrder->Gauge_p(), (void*)X_.Gauge_p(), n, X_.Volume(), X_.Precision(), X.Location());
+        blas::flops += invert(xInvMilcOrder->data(), X_.data(), n, X_.Volume(), X_.Precision(), X.Location());
 
       } else if (location == QUDA_CPU_FIELD_LOCATION) {
 
-        blas::flops += invert((void*)xInvMilcOrder->Gauge_p(), (void*)X.Gauge_p(), n, X.Volume(), X.Precision(), X.Location());
+        blas::flops += invert(xInvMilcOrder->data(), X.data(), n, X.Volume(), X.Precision(), X.Location());
       }
 
       if (getVerbosity() >= QUDA_VERBOSE) printfQuda("xInvMilcOrder = %e\n", xInvMilcOrder->norm2(0));
diff --git a/lib/staggered_oprod.cu b/lib/staggered_oprod.cu
index a1af1e9903..af1f520333 100644
--- a/lib/staggered_oprod.cu
+++ b/lib/staggered_oprod.cu
@@ -86,8 +86,8 @@ namespace quda {
       }
     } // apply
 
-    void preTune() { U.backup(); if (U.Gauge_p() != L.Gauge_p()) L.backup(); }
-    void postTune() { U.restore(); if (U.Gauge_p() != L.Gauge_p()) L.restore(); }
+    void preTune() { U.backup(); if (U.data() != L.data()) L.backup(); }
+    void postTune() { U.restore(); if (U.data() != L.data()) L.restore(); }
 
     long long flops() const { return 0; } // FIXME
     long long bytes() const { return 0; } // FIXME
diff --git a/lib/unitarize_links_quda.cu b/lib/unitarize_links_quda.cu
index fb08b7feb1..058dd91592 100644
--- a/lib/unitarize_links_quda.cu
+++ b/lib/unitarize_links_quda.cu
@@ -61,14 +61,14 @@ namespace quda {
     for (unsigned int i = 0; i < infield.Volume(); ++i) {
       for (int dir=0; dir<4; ++dir){
 	if (infield.Precision() == QUDA_SINGLE_PRECISION) {
-	  copyArrayToLink(inlink, ((float*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
-	  if (unitarizeLinkNewton(outlink, inlink, max_iter_newton) == false ) num_failures++;
-	  copyLinkToArray(((float*)(outfield.Gauge_p()) + (i*4 + dir)*18), outlink);
-	} else if (infield.Precision() == QUDA_DOUBLE_PRECISION) {
-	  copyArrayToLink(inlink, ((double*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
-	  if (unitarizeLinkNewton(outlink, inlink, max_iter_newton) == false ) num_failures++;
-	  copyLinkToArray(((double*)(outfield.Gauge_p()) + (i*4 + dir)*18), outlink);
-	} // precision?
+          copyArrayToLink(inlink, infield.data<float *>() + (i * 4 + dir) * 18); // order of arguments?
+          if (unitarizeLinkNewton(outlink, inlink, max_iter_newton) == false ) num_failures++;
+          copyLinkToArray(outfield.data<float *>() + (i * 4 + dir) * 18, outlink);
+        } else if (infield.Precision() == QUDA_DOUBLE_PRECISION) {
+          copyArrayToLink(inlink, infield.data<double *>() + (i * 4 + dir) * 18); // order of arguments?
+          if (unitarizeLinkNewton(outlink, inlink, max_iter_newton) == false ) num_failures++;
+          copyLinkToArray(outfield.data<double *>() + (i * 4 + dir) * 18, outlink);
+        } // precision?
       } // dir
     }   // loop over volume
   }
@@ -82,10 +82,10 @@ namespace quda {
     for (unsigned int i = 0; i < field.Volume(); ++i) {
       for (int dir=0; dir<4; ++dir) {
 	if (field.Precision() == QUDA_SINGLE_PRECISION) {
-	  copyArrayToLink(link, ((float*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
-	} else if (field.Precision() == QUDA_DOUBLE_PRECISION) {
-	  copyArrayToLink(link, ((double*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
-	} else {
+          copyArrayToLink(link, field.data<float *>() + (i * 4 + dir) * 18); // order of arguments?
+        } else if (field.Precision() == QUDA_DOUBLE_PRECISION) {
+          copyArrayToLink(link, field.data<double *>() + (i * 4 + dir) * 18); // order of arguments?
+        } else {
 	  errorQuda("Unsupported precision\n");
 	}
 	if (link.isUnitary(max_error) == false) {
@@ -126,9 +126,12 @@ namespace quda {
                         UnitarizeArg<Float, nColor, recon>(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error));
     }
 
-    void preTune() { if (in.Gauge_p() == out.Gauge_p()) out.backup(); }
+    void preTune()
+    {
+      if (in.data() == out.data()) out.backup();
+    }
     void postTune() {
-      if (in.Gauge_p() == out.Gauge_p()) out.restore();
+      if (in.data() == out.data()) out.restore();
       qudaMemset(fails, 0, sizeof(int)); // reset fails counter
     }
 
diff --git a/tests/gauge_force_test.cpp b/tests/gauge_force_test.cpp
index 88df55beef..60dfa94cd0 100644
--- a/tests/gauge_force_test.cpp
+++ b/tests/gauge_force_test.cpp
@@ -116,7 +116,7 @@ void gauge_force_test(bool compute_force = true)
   auto U_qdp = new quda::cpuGaugeField(param);
 
   // fills the gauge field with random numbers
-  createSiteLinkCPU((void **)U_qdp->Gauge_p(), gauge_param.cpu_prec, 0);
+  createSiteLinkCPU(U_qdp->data<void *const *>(), gauge_param.cpu_prec, 0);
 
   param.order = QUDA_MILC_GAUGE_ORDER;
   auto U_milc = new quda::cpuGaugeField(param);
@@ -134,7 +134,7 @@ void gauge_force_test(bool compute_force = true)
 
   // initialize some data in cpuMom
   if (compute_force) {
-    createMomCPU(Mom_ref_milc->Gauge_p(), gauge_param.cpu_prec);
+    createMomCPU(Mom_ref_milc->data(), gauge_param.cpu_prec);
     if (gauge_order == QUDA_MILC_GAUGE_ORDER) Mom_milc->copy(*Mom_ref_milc);
     if (gauge_order == QUDA_QDP_GAUGE_ORDER) Mom_qdp->copy(*Mom_ref_milc);
   }
@@ -142,11 +142,11 @@ void gauge_force_test(bool compute_force = true)
   void *sitelink = nullptr;
 
   if (gauge_order == QUDA_MILC_GAUGE_ORDER) {
-    sitelink = U_milc->Gauge_p();
-    mom = Mom_milc->Gauge_p();
+    sitelink = U_milc->data();
+    mom = Mom_milc->data();
   } else if (gauge_order == QUDA_QDP_GAUGE_ORDER) {
-    sitelink = U_qdp->Gauge_p();
-    mom = Mom_qdp->Gauge_p();
+    sitelink = U_qdp->data();
+    mom = Mom_qdp->data();
   } else {
     errorQuda("Unsupported gauge order %d", gauge_order);
   }
@@ -180,14 +180,14 @@ void gauge_force_test(bool compute_force = true)
   // The number comes from CPU implementation in MILC, gauge_force_imp.c
   int flops = 153004;
 
-  void *refmom = Mom_ref_milc->Gauge_p();
+  void *refmom = Mom_ref_milc->data();
   int *check_out = compute_force ? &force_check : &path_check;
   if (verify_results) {
-    gauge_force_reference(refmom, eb3, (void **)U_qdp->Gauge_p(), gauge_param.cpu_prec, input_path_buf, length,
+    gauge_force_reference(refmom, eb3, U_qdp->data<void *const *>(), gauge_param.cpu_prec, input_path_buf, length,
                           loop_coeff, num_paths, compute_force);
-    *check_out = compare_floats(Mom_milc->Gauge_p(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec),
-                                gauge_param.cpu_prec);
-    if (compute_force) strong_check_mom(Mom_milc->Gauge_p(), refmom, 4 * V, gauge_param.cpu_prec);
+    *check_out
+      = compare_floats(Mom_milc->data(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec);
+    if (compute_force) strong_check_mom(Mom_milc->data(), refmom, 4 * V, gauge_param.cpu_prec);
   }
 
   if (compute_force) {
diff --git a/tests/heatbath_test.cpp b/tests/heatbath_test.cpp
index 673712201f..d840fda4c7 100644
--- a/tests/heatbath_test.cpp
+++ b/tests/heatbath_test.cpp
@@ -159,7 +159,7 @@ int main(int argc, char **argv)
       gauge_param.gauge_order = gauge->Order();
       gauge_param.location = QUDA_CUDA_FIELD_LOCATION;
 
-      loadGaugeQuda(gauge->Gauge_p(), &gauge_param);
+      loadGaugeQuda(gauge->data(), &gauge_param);
     }
 
     QudaGaugeObservableParam param = newQudaGaugeObservableParam();
@@ -189,7 +189,7 @@ int main(int argc, char **argv)
     gauge_param.gauge_order = gauge->Order();
     gauge_param.location = QUDA_CUDA_FIELD_LOCATION;
 
-    loadGaugeQuda(gauge->Gauge_p(), &gauge_param);
+    loadGaugeQuda(gauge->data(), &gauge_param);
     gaugeObservablesQuda(&param);
     printfQuda("step=0 plaquette = %e topological charge = %e\n", param.plaquette[0], param.qcharge);
 
@@ -205,7 +205,7 @@ int main(int argc, char **argv)
       // copy into regular field
       copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
-      loadGaugeQuda(gauge->Gauge_p(), &gauge_param);
+      loadGaugeQuda(gauge->data(), &gauge_param);
       gaugeObservablesQuda(&param);
       printfQuda("step=%d plaquette = %e topological charge = %e\n", step, param.plaquette[0], param.qcharge);
 
diff --git a/tests/hisq_paths_force_test.cpp b/tests/hisq_paths_force_test.cpp
index 6c32ca4853..e19d874e31 100644
--- a/tests/hisq_paths_force_test.cpp
+++ b/tests/hisq_paths_force_test.cpp
@@ -169,7 +169,7 @@ static void hisq_force_init()
   cpuGauge_ex = new cpuGaugeField(gParam_ex);
 
   if (gauge_order == QUDA_QDP_GAUGE_ORDER) {
-    createSiteLinkCPU((void **)cpuGauge->Gauge_p(), qudaGaugeParam.cpu_prec, 1);
+    createSiteLinkCPU(cpuGauge->data<void *const *>(), qudaGaugeParam.cpu_prec, 1);
   } else {
     errorQuda("Unsupported gauge order %d", gauge_order);
   }
@@ -221,8 +221,6 @@ static void hisq_force_init()
   cpuMom = new cpuGaugeField(gParam);
   refMom = new cpuGaugeField(gParam);
 
-  // createMomCPU(cpuMom->Gauge_p(), mom_prec);
-
   hw = safe_malloc(4 * cpuGauge->Volume() * hw_site_size * qudaGaugeParam.cpu_prec);
 
   createHwCPU(hw, hw_prec);
@@ -232,9 +230,9 @@ static void hisq_force_init()
   gParam.order = gauge_order;
   gParam.pad = 0;
   cpuOprod = new cpuGaugeField(gParam);
-  computeLinkOrderedOuterProduct(hw, cpuOprod->Gauge_p(), hw_prec, 1, gauge_order);
+  computeLinkOrderedOuterProduct(hw, cpuOprod->data(), hw_prec, 1, gauge_order);
   cpuLongLinkOprod = new cpuGaugeField(gParam);
-  computeLinkOrderedOuterProduct(hw, cpuLongLinkOprod->Gauge_p(), hw_prec, 3, gauge_order);
+  computeLinkOrderedOuterProduct(hw, cpuLongLinkOprod->data(), hw_prec, 3, gauge_order);
 
   gParam_ex.location = QUDA_CPU_FIELD_LOCATION;
   gParam_ex.link_type = QUDA_GENERAL_LINKS;
@@ -366,10 +364,9 @@ static int hisq_force_test(void)
 
   int accuracy_level = 3;
   if (verify_results) {
-    int res = compare_floats(cpuMom->Gauge_p(), refMom->Gauge_p(), 4 * cpuMom->Volume() * mom_site_size, 1e-5,
+    int res = compare_floats(cpuMom->data(), refMom->data(), 4 * cpuMom->Volume() * mom_site_size, 1e-5,
                              qudaGaugeParam.cpu_prec);
-    accuracy_level
-      = strong_check_mom(cpuMom->Gauge_p(), refMom->Gauge_p(), 4 * cpuMom->Volume(), qudaGaugeParam.cpu_prec);
+    accuracy_level = strong_check_mom(cpuMom->data(), refMom->data(), 4 * cpuMom->Volume(), qudaGaugeParam.cpu_prec);
     printfQuda("Test %s\n", (1 == res) ? "PASSED" : "FAILED");
   }
   double total_io;
diff --git a/tests/hisq_unitarize_force_test.cpp b/tests/hisq_unitarize_force_test.cpp
index 1ab7b6a71b..f6d68a9553 100644
--- a/tests/hisq_unitarize_force_test.cpp
+++ b/tests/hisq_unitarize_force_test.cpp
@@ -26,7 +26,7 @@ quda::cpuGaugeField *cpuReference = NULL;
 static QudaGaugeParam gaugeParam;
 
 // Create a field of links that are not su3_matrices
-void createNoisyLinkCPU(void **field, QudaPrecision prec, int seed)
+void createNoisyLinkCPU(void *const *field, QudaPrecision prec, int seed)
 {
   createSiteLinkCPU(field, prec, 0);
 
@@ -77,8 +77,8 @@ static void hisq_force_init()
   seed += quda::comm_rank();
 #endif
 
-  createNoisyLinkCPU((void **)cpuFatLink->Gauge_p(), gaugeParam.cpu_prec, seed);
-  createNoisyLinkCPU((void **)cpuOprod->Gauge_p(), gaugeParam.cpu_prec, seed + 1);
+  createNoisyLinkCPU(cpuFatLink->data<void *const *>(), gaugeParam.cpu_prec, seed);
+  createNoisyLinkCPU(cpuOprod->data<void *const *>(), gaugeParam.cpu_prec, seed + 1);
 
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.setPrecision(gaugeParam.cuda_prec, true);
@@ -142,7 +142,7 @@ TEST(hisq_force_unitarize, verify)
 
   double accuracy = prec == QUDA_DOUBLE_PRECISION ? 1e-10 : 1e-5;
   for (int dir = 0; dir < 4; ++dir) {
-    res[dir] = compare_floats(((char **)cpuReference->Gauge_p())[dir], ((char **)cpuResult->Gauge_p())[dir],
+    res[dir] = compare_floats(cpuReference->data<void *const *>()[dir], cpuResult->data<void *const *>()[dir],
                               cpuReference->Volume() * gauge_site_size, accuracy, gaugeParam.cpu_prec);
 
     quda::comm_allreduce_int(res[dir]);
diff --git a/tests/host_reference/domain_wall_dslash_reference.cpp b/tests/host_reference/domain_wall_dslash_reference.cpp
index c42b0e4a71..46d2620ce0 100644
--- a/tests/host_reference/domain_wall_dslash_reference.cpp
+++ b/tests/host_reference/domain_wall_dslash_reference.cpp
@@ -746,8 +746,8 @@ void mdw_eofa_m5inv(void *res, void *spinorField, int oddBit, int daggerBit, dou
 
 // this actually applies the preconditioned dslash, e.g., D_ee^{-1} D_eo or D_oo^{-1} D_oe
 #ifndef MULTI_GPU
-void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &,
-               double mferm)
+void dw_dslash(void *out, void *const *gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+               QudaGaugeParam &, double mferm)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
     dslashReference_4d_sgpu<QUDA_5D_PC>((double *)out, (double **)gauge, (double *)in, oddBit, daggerBit);
@@ -758,10 +758,10 @@ void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, Qud
   }
 }
 #else
-void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+void dw_dslash(void *out, void *const *gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
                QudaGaugeParam &gauge_param, double mferm)
 {
-  GaugeFieldParam gauge_field_param(gauge_param, gauge);
+  GaugeFieldParam gauge_field_param(gauge_param, (void **)gauge);
   gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
   cpuGaugeField cpu(gauge_field_param);
   void **ghostGauge = (void **)cpu.Ghost();
@@ -815,7 +815,7 @@ void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, Qud
 #endif
 
 #ifndef MULTI_GPU
-void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+void dslash_4_4d(void *out, void *const *gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
                  QudaGaugeParam &, double)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
@@ -825,10 +825,10 @@ void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, Q
   }
 }
 #else
-void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+void dslash_4_4d(void *out, void *const *gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
                  QudaGaugeParam &gauge_param, double)
 {
-  GaugeFieldParam gauge_field_param(gauge_param, gauge);
+  GaugeFieldParam gauge_field_param(gauge_param, (void **)gauge);
   gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
   cpuGaugeField cpu(gauge_field_param);
   void **ghostGauge = (void **)cpu.Ghost();
@@ -879,8 +879,8 @@ void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, Q
 }
 #endif
 
-void dw_dslash_5_4d(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &,
-                    double mferm, bool zero_initialize)
+void dw_dslash_5_4d(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+                    QudaGaugeParam &, double mferm, bool zero_initialize)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
     if (zero_initialize)
@@ -895,8 +895,8 @@ void dw_dslash_5_4d(void *out, void **, void *in, int oddBit, int daggerBit, Qud
   }
 }
 
-void dslash_5_inv(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &,
-                  double mferm, double *kappa)
+void dslash_5_inv(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+                  QudaGaugeParam &, double mferm, double *kappa)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
     dslashReference_5th_inv((double *)out, (double *)in, oddBit, daggerBit, mferm, kappa);
@@ -905,7 +905,7 @@ void dslash_5_inv(void *out, void **, void *in, int oddBit, int daggerBit, QudaP
   }
 }
 
-void mdw_dslash_5_inv(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+void mdw_dslash_5_inv(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision,
                       QudaGaugeParam &, double mferm, double _Complex *kappa)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
@@ -915,8 +915,8 @@ void mdw_dslash_5_inv(void *out, void **, void *in, int oddBit, int daggerBit, Q
   }
 }
 
-void mdw_dslash_5(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &,
-                  double mferm, double _Complex *kappa, bool zero_initialize)
+void mdw_dslash_5(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+                  QudaGaugeParam &, double mferm, double _Complex *kappa, bool zero_initialize)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
     if (zero_initialize)
@@ -935,7 +935,7 @@ void mdw_dslash_5(void *out, void **, void *in, int oddBit, int daggerBit, QudaP
   }
 }
 
-void mdw_dslash_4_pre(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+void mdw_dslash_4_pre(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision,
                       QudaGaugeParam &, double mferm, double _Complex *b5, double _Complex *c5, bool zero_initialize)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
@@ -960,7 +960,7 @@ void mdw_dslash_4_pre(void *out, void **, void *in, int oddBit, int daggerBit, Q
   }
 }
 
-void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
+void dw_mat(void *out, void *const *gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
             QudaGaugeParam &gauge_param, double mferm)
 {
 
@@ -976,7 +976,7 @@ void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, Qud
   xpay(in, -kappa, out, V5 * spinor_site_size, precision);
 }
 
-void dw_4d_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
+void dw_4d_mat(void *out, void *const *gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
                QudaGaugeParam &gauge_param, double mferm)
 {
 
@@ -995,7 +995,7 @@ void dw_4d_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit,
   xpay(in, -kappa, out, V5 * spinor_site_size, precision);
 }
 
-void mdw_mat(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, int dagger,
+void mdw_mat(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, int dagger,
              QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5)
 {
   void *tmp = safe_malloc(V5h * spinor_site_size * precision);
@@ -1042,9 +1042,9 @@ void mdw_mat(void *out, void **gauge, void *in, double _Complex *kappa_b, double
   host_free(tmp);
 }
 
-void mdw_eofa_mat(void *out, void **gauge, void *in, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param,
-                  double mferm, double m5, double b, double c, double mq1, double mq2, double mq3, int eofa_pm,
-                  double eofa_shift)
+void mdw_eofa_mat(void *out, void *const *gauge, void *in, int dagger, QudaPrecision precision,
+                  QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2,
+                  double mq3, int eofa_pm, double eofa_shift)
 {
   void *tmp = safe_malloc(V5h * spinor_site_size * precision);
 
@@ -1096,7 +1096,7 @@ void mdw_eofa_mat(void *out, void **gauge, void *in, int dagger, QudaPrecision p
   host_free(tmp);
 }
 //
-void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
+void dw_matdagmat(void *out, void *const *gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
                   QudaGaugeParam &gauge_param, double mferm)
 {
   void *tmp = safe_malloc(V5 * spinor_site_size * precision);
@@ -1108,7 +1108,7 @@ void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger_bi
   host_free(tmp);
 }
 
-void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit,
+void dw_matpc(void *out, void *const *gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit,
               QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
 {
   void *tmp = safe_malloc(V5h * spinor_site_size * precision);
@@ -1128,7 +1128,7 @@ void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType mat
   host_free(tmp);
 }
 
-void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit,
+void dw_4d_matpc(void *out, void *const *gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit,
                  QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
 {
   double kappa2 = -kappa * kappa;
@@ -1168,7 +1168,7 @@ void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType
   host_free(kappa5);
 }
 
-void mdw_matpc(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
+void mdw_matpc(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
                QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm,
                double _Complex *b5, double _Complex *c5)
 {
@@ -1240,9 +1240,9 @@ void mdw_matpc(void *out, void **gauge, void *in, double _Complex *kappa_b, doub
   host_free(kappa_mdwf);
 }
 
-void mdw_eofa_matpc(void *out, void **gauge, void *in, QudaMatPCType matpc_type, int dagger, QudaPrecision precision,
-                    QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2,
-                    double mq3, int eofa_pm, double eofa_shift)
+void mdw_eofa_matpc(void *out, void *const *gauge, void *in, QudaMatPCType matpc_type, int dagger,
+                    QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c,
+                    double mq1, double mq2, double mq3, int eofa_pm, double eofa_shift)
 {
   void *tmp = safe_malloc(V5h * spinor_site_size * precision);
 
@@ -1311,14 +1311,14 @@ void mdw_eofa_matpc(void *out, void **gauge, void *in, QudaMatPCType matpc_type,
   host_free(tmp);
 }
 
-void mdw_mdagm_local(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
+void mdw_mdagm_local(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
                      QudaMatPCType matpc_type, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm,
                      double _Complex *b5, double _Complex *c5)
 {
   lat_dim_t R;
   for (int d = 0; d < 4; d++) { R[d] = comm_dim_partitioned(d) ? 2 : 0; }
 
-  cpuGaugeField *padded_gauge = createExtendedGauge(gauge, gauge_param, R);
+  cpuGaugeField *padded_gauge = createExtendedGauge((void **)gauge, gauge_param, R);
 
   int padded_V = 1;
   int W[4];
@@ -1357,7 +1357,7 @@ void mdw_mdagm_local(void *out, void **gauge, void *in, double _Complex *kappa_b
   QudaGaugeParam padded_gauge_param(gauge_param);
   for (int d = 0; d < 4; d++) { padded_gauge_param.X[d] += 2 * R[d]; }
 
-  void **padded_gauge_p = (void **)(padded_gauge->Gauge_p());
+  auto padded_gauge_p = padded_gauge->data<void *const *>();
 
   // Extend these global variables then restore them
   int V5_old = V5;
@@ -1458,7 +1458,7 @@ void MatPCDag(sFloat *outEven, gFloat **gauge, sFloat *inEven, sFloat kappa,
 }
 */
 
-void matpc(void *, void **, void *, double, QudaMatPCType, int, QudaPrecision, QudaPrecision, double)
+void matpc(void *, void *const *, void *, double, QudaMatPCType, int, QudaPrecision, QudaPrecision, double)
 {
   /*
     if (!dagger_bit) {
@@ -1513,7 +1513,7 @@ void MatPCDagMatPC(sFloat *out, gFloat **gauge, sFloat *in, sFloat kappa,
 }
 */
 // Wrapper to templates that handles different precisions.
-void matdagmat(void *, void **, void *, double, QudaPrecision, QudaPrecision, double)
+void matdagmat(void *, void *const *, void *, double, QudaPrecision, QudaPrecision, double)
 {
   /*
     if (sPrecision == QUDA_DOUBLE_PRECISION) {
@@ -1533,7 +1533,7 @@ void matdagmat(void *, void **, void *, double, QudaPrecision, QudaPrecision, do
 }
 
 // Wrapper to templates that handles different precisions.
-void matpcdagmatpc(void *, void **, void *, double, QudaPrecision, QudaPrecision, double, QudaMatPCType)
+void matpcdagmatpc(void *, void *const *, void *, double, QudaPrecision, QudaPrecision, double, QudaMatPCType)
 {
   /*
     if (sPrecision == QUDA_DOUBLE_PRECISION) {
diff --git a/tests/host_reference/domain_wall_dslash_reference.h b/tests/host_reference/domain_wall_dslash_reference.h
index 3751fe88f4..4e6ff1edfb 100644
--- a/tests/host_reference/domain_wall_dslash_reference.h
+++ b/tests/host_reference/domain_wall_dslash_reference.h
@@ -8,51 +8,51 @@
 extern "C" {
 #endif
 
-void dw_dslash(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
+void dw_dslash(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
                QudaGaugeParam &param, double mferm);
 
-void dslash_4_4d(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
+void dslash_4_4d(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
                  QudaGaugeParam &param, double mferm);
 
-void dw_dslash_5_4d(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
-                    QudaGaugeParam &param, double mferm, bool zero_initialize);
+void dw_dslash_5_4d(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger,
+                    QudaPrecision precision, QudaGaugeParam &param, double mferm, bool zero_initialize);
 
-void dslash_5_inv(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
+void dslash_5_inv(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
                   QudaGaugeParam &param, double mferm, double *kappa);
 
-void mdw_dslash_5_inv(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
-                      QudaGaugeParam &param, double mferm, double _Complex *kappa);
+void mdw_dslash_5_inv(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger,
+                      QudaPrecision precision, QudaGaugeParam &param, double mferm, double _Complex *kappa);
 
-void mdw_dslash_5(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
+void mdw_dslash_5(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
                   QudaGaugeParam &param, double mferm, double _Complex *kappa, bool zero_initialize);
 
-void mdw_dslash_4_pre(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
-                      QudaGaugeParam &param, double mferm, double _Complex *b5, double _Complex *c5,
-                      bool zero_initialize);
+void mdw_dslash_4_pre(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger,
+                      QudaPrecision precision, QudaGaugeParam &param, double mferm, double _Complex *b5,
+                      double _Complex *c5, bool zero_initialize);
 
-void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger, QudaPrecision precision, QudaGaugeParam &param,
-            double mferm);
+void dw_mat(void *out, void *const *gauge, void *in, double kappa, int dagger, QudaPrecision precision,
+            QudaGaugeParam &param, double mferm);
 
-void dw_4d_mat(void *out, void **gauge, void *in, double kappa, int dagger, QudaPrecision precision,
+void dw_4d_mat(void *out, void *const *gauge, void *in, double kappa, int dagger, QudaPrecision precision,
                QudaGaugeParam &param, double mferm);
 
-void mdw_mat(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, int dagger,
+void mdw_mat(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, int dagger,
              QudaPrecision precision, QudaGaugeParam &param, double mferm, double _Complex *b5, double _Complex *c5);
 
-void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger, QudaPrecision precision,
+void dw_matdagmat(void *out, void *const *gauge, void *in, double kappa, int dagger, QudaPrecision precision,
                   QudaGaugeParam &param, double mferm);
 
-void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger,
+void dw_matpc(void *out, void *const *gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger,
               QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm);
 
-void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger,
+void dw_4d_matpc(void *out, void *const *gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger,
                  QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm);
 
-void mdw_matpc(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
+void mdw_matpc(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
                QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm,
                double _Complex *b5, double _Complex *c5);
 
-void mdw_mdagm_local(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
+void mdw_mdagm_local(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
                      QudaMatPCType matpc_type, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm,
                      double _Complex *b5, double _Complex *c5);
 void mdw_eofa_m5(void *res, void *spinorField, int oddBit, int daggerBit, double mferm, double m5, double b, double c,
@@ -61,13 +61,13 @@ void mdw_eofa_m5(void *res, void *spinorField, int oddBit, int daggerBit, double
 void mdw_eofa_m5inv(void *res, void *spinorField, int oddBit, int daggerBit, double mferm, double m5, double b, double c,
                     double mq1, double mq2, double mq3, int eofa_pm, double eofa_shift, QudaPrecision precision);
 
-void mdw_eofa_mat(void *out, void **gauge, void *in, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param,
-                  double mferm, double m5, double b, double c, double mq1, double mq2, double mq3, int eofa_pm,
-                  double eofa_shift);
+void mdw_eofa_mat(void *out, void *const *gauge, void *in, int dagger, QudaPrecision precision,
+                  QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2,
+                  double mq3, int eofa_pm, double eofa_shift);
 
-void mdw_eofa_matpc(void *out, void **gauge, void *in, QudaMatPCType matpc_type, int dagger, QudaPrecision precision,
-                    QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2,
-                    double mq3, int eofa_pm, double eofa_shift);
+void mdw_eofa_matpc(void *out, void *const *gauge, void *in, QudaMatPCType matpc_type, int dagger,
+                    QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c,
+                    double mq1, double mq2, double mq3, int eofa_pm, double eofa_shift);
 
 #ifdef __cplusplus
 }
diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp
index ef1ce87b77..ffe8cc4494 100644
--- a/tests/host_reference/gauge_force_reference.cpp
+++ b/tests/host_reference/gauge_force_reference.cpp
@@ -369,7 +369,7 @@ static void update_gauge(su3_matrix *gauge, int dir, su3_matrix **sitelink, su3_
 /* This function only computes one direction @dir
  *
  */
-void gauge_force_reference_dir(void *refMom, int dir, double eb3, void **sitelink, void **sitelink_ex,
+void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *sitelink, void *const *sitelink_ex,
                                QudaPrecision prec, int **path_dir, int *length, void *loop_coeff, int num_paths,
                                const lattice_t &lat, bool compute_force)
 {
@@ -405,8 +405,8 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, void **sitelin
   host_free(staple);
 }
 
-void gauge_force_reference(void *refMom, double eb3, void **sitelink, QudaPrecision prec, int ***path_dir, int *length,
-                           void *loop_coeff, int num_paths, bool compute_force)
+void gauge_force_reference(void *refMom, double eb3, void *const *const sitelink, QudaPrecision prec, int ***path_dir,
+                           int *length, void *loop_coeff, int num_paths, bool compute_force)
 {
   // created extended field
   quda::lat_dim_t R;
@@ -420,7 +420,7 @@ void gauge_force_reference(void *refMom, double eb3, void **sitelink, QudaPrecis
   lattice_t lat(*qdp_ex);
 
   for (int dir = 0; dir < 4; dir++) {
-    gauge_force_reference_dir(refMom, dir, eb3, sitelink, (void **)qdp_ex->Gauge_p(), prec, path_dir[dir], length,
+    gauge_force_reference_dir(refMom, dir, eb3, sitelink, qdp_ex->data<void *const *>(), prec, path_dir[dir], length,
                               loop_coeff, num_paths, lat, compute_force);
   }
 
diff --git a/tests/host_reference/gauge_force_reference.h b/tests/host_reference/gauge_force_reference.h
index 4bf04f3f52..44106e5427 100644
--- a/tests/host_reference/gauge_force_reference.h
+++ b/tests/host_reference/gauge_force_reference.h
@@ -1,4 +1,4 @@
 #pragma once
 
-void gauge_force_reference(void *refMom, double eb3, void **sitelink, QudaPrecision prec, int ***path_dir, int *length,
-                           void *loop_coeff, int num_paths, bool compute_force);
+void gauge_force_reference(void *refMom, double eb3, void *const *sitelink, QudaPrecision prec, int ***path_dir,
+                           int *length, void *loop_coeff, int num_paths, bool compute_force);
diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp
index d4cb82e0c8..f3f080bed8 100644
--- a/tests/host_reference/hisq_force_reference.cpp
+++ b/tests/host_reference/hisq_force_reference.cpp
@@ -1266,12 +1266,12 @@ void hisqStaplesForceCPU(const double *path_coeff, const QudaGaugeParam &param,
   act_path_coeff.lepage = path_coeff[5];
 
   if (param.cpu_prec == QUDA_DOUBLE_PRECISION) {
-    doHisqStaplesForceCPU<double>(param.X, act_path_coeff, (double *)oprod.Gauge_p(), (double *)link.Gauge_p(),
-                                  (double **)tempmat, (double *)newOprod->Gauge_p());
+    doHisqStaplesForceCPU<double>(param.X, act_path_coeff, oprod.data<double *>(), link.data<double *>(),
+                                  (double **)tempmat, newOprod->data<double *>());
 
   } else if (param.cpu_prec == QUDA_SINGLE_PRECISION) {
-    doHisqStaplesForceCPU<float>(param.X, act_path_coeff, (float *)oprod.Gauge_p(), (float *)link.Gauge_p(),
-                                 (float **)tempmat, (float *)newOprod->Gauge_p());
+    doHisqStaplesForceCPU<float>(param.X, act_path_coeff, oprod.data<float *>(), link.data<float *>(),
+                                 (float **)tempmat, newOprod->data<float *>());
   } else {
     errorQuda("Unsupported precision");
   }
@@ -1350,11 +1350,11 @@ void hisqLongLinkForceCPU(double coeff, const QudaGaugeParam &param, quda::cpuGa
 {
   for (int sig = 0; sig < 4; ++sig) {
     if (param.cpu_prec == QUDA_SINGLE_PRECISION) {
-      computeLongLinkField<float>(param.X, (float *)oprod.Gauge_p(), (float *)link.Gauge_p(), sig, coeff,
-                                  (float *)newOprod->Gauge_p());
+      computeLongLinkField<float>(param.X, (float *)oprod.data<float *>(), link.data<float *>(), sig, coeff,
+                                  newOprod->data<float *>());
     } else if (param.cpu_prec == QUDA_DOUBLE_PRECISION) {
-      computeLongLinkField<double>(param.X, (double *)oprod.Gauge_p(), (double *)link.Gauge_p(), sig, coeff,
-                                   (double *)newOprod->Gauge_p());
+      computeLongLinkField<double>(param.X, oprod.data<double *>(), link.data<double *>(), sig, coeff,
+                                   newOprod->data<double *>());
     } else {
       errorQuda("Unrecognised precision\n");
     }
@@ -1405,10 +1405,9 @@ void hisqCompleteForceCPU(const QudaGaugeParam &param, quda::cpuGaugeField &opro
 {
   for (int sig = 0; sig < 4; ++sig) {
     if (param.cpu_prec == QUDA_SINGLE_PRECISION) {
-      completeForceField<float>(param.X, (float *)oprod.Gauge_p(), (float *)link.Gauge_p(), sig, (float *)mom->Gauge_p());
+      completeForceField<float>(param.X, oprod.data<float *>(), link.data<float *>(), sig, mom->data<float *>());
     } else if (param.cpu_prec == QUDA_DOUBLE_PRECISION) {
-      completeForceField<double>(param.X, (double *)oprod.Gauge_p(), (double *)link.Gauge_p(), sig,
-                                 (double *)mom->Gauge_p());
+      completeForceField<double>(param.X, oprod.data<double *>(), link.data<double *>(), sig, mom->data<double *>());
     } else {
       errorQuda("Unrecognised precision\n");
     }
diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
index 95c104be99..40860dc308 100644
--- a/tests/host_reference/staggered_dslash_reference.cpp
+++ b/tests/host_reference/staggered_dslash_reference.cpp
@@ -126,8 +126,8 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink,
   }   // right-hand-side
 }
 
-void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink,
-                     void **ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit,
+void staggeredDslash(ColorSpinorField &out, void *const *fatlink, void *const *longlink, void *const *ghost_fatlink,
+                     void *const *ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit,
                      QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type)
 {
   const int nSrc = in.X(4);
@@ -144,8 +144,8 @@ void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, voi
 
   in.exchangeGhost(otherparity, nFace, daggerBit);
 
-  void **fwd_nbr_spinor = in.fwdGhostFaceBuffer;
-  void **back_nbr_spinor = in.backGhostFaceBuffer;
+  auto fwd_nbr_spinor = in.fwdGhostFaceBuffer;
+  auto back_nbr_spinor = in.backGhostFaceBuffer;
 
   if (sPrecision == QUDA_DOUBLE_PRECISION) {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
@@ -170,8 +170,8 @@ void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, voi
   }
 }
 
-void staggeredMatDagMat(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink,
-                        void **ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit,
+void staggeredMatDagMat(ColorSpinorField &out, void *const *fatlink, void *const *longlink, void *const *ghost_fatlink,
+                        void *const *ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit,
                         QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity,
                         QudaDslashType dslash_type)
 {
diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h
index 54d40fdc0d..2d47138dc0 100644
--- a/tests/host_reference/staggered_dslash_reference.h
+++ b/tests/host_reference/staggered_dslash_reference.h
@@ -16,11 +16,11 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink,
                               gFloat **ghostLonglink, sFloat *spinorField, sFloat **fwd_nbr_spinor,
                               sFloat **back_nbr_spinor, int oddBit, int daggerBit, int nSrc, QudaDslashType dslash_type);
 
-void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink,
-                     void **ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit,
+void staggeredDslash(ColorSpinorField &out, void *const *fatlink, void *const *longlink, void *const *ghost_fatlink,
+                     void *const *ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit,
                      QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type);
 
-void staggeredMatDagMat(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink,
-                        void **ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit,
+void staggeredMatDagMat(ColorSpinorField &out, void *const *fatlink, void *const *longlink, void *const *ghost_fatlink,
+                        void *const *ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit,
                         QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity,
                         QudaDslashType dslash_type);
diff --git a/tests/multigrid_evolve_test.cpp b/tests/multigrid_evolve_test.cpp
index d8c72f19fc..2fd02228a0 100644
--- a/tests/multigrid_evolve_test.cpp
+++ b/tests/multigrid_evolve_test.cpp
@@ -270,7 +270,7 @@ int main(int argc, char **argv)
     // load the gauge field from gauge
     gauge_param.gauge_order = gauge->Order();
     gauge_param.location = QUDA_CUDA_FIELD_LOCATION;
-    loadGaugeQuda(gauge->Gauge_p(), &gauge_param);
+    loadGaugeQuda(gauge->data(), &gauge_param);
     gaugeObservablesQuda(&obs_param);
 
     // Demonstrate MG evolution on an evolving gauge field
@@ -318,7 +318,7 @@ int main(int argc, char **argv)
 
       // Copy into regular field
       copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
-      loadGaugeQuda(gauge->Gauge_p(), &gauge_param);
+      loadGaugeQuda(gauge->data(), &gauge_param);
 
       if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
         constructHostCloverField(clover, clover_inv, inv_param);
@@ -384,7 +384,7 @@ int main(int argc, char **argv)
     // copy into regular field
     copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
-    loadGaugeQuda(gauge->Gauge_p(), &gauge_param);
+    loadGaugeQuda(gauge->data(), &gauge_param);
     // Recompute Gauge Observables
     gaugeObservablesQuda(&obs_param);
 
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 831351a36d..c6379c3342 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -71,7 +71,8 @@ struct StaggeredDslashTestWrapper {
   // In the HISQ case, we include building fat/long links in this unit test
   void *qdp_fatlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr};
   void *qdp_longlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr};
-  void **ghost_fatlink_cpu, **ghost_longlink_cpu;
+  void *const *ghost_fatlink_cpu;
+  void *const *ghost_longlink_cpu;
 
   QudaParity parity = QUDA_EVEN_PARITY;
 
diff --git a/tests/unitarize_link_test.cpp b/tests/unitarize_link_test.cpp
index a0322e397b..1c4849ba4a 100644
--- a/tests/unitarize_link_test.cpp
+++ b/tests/unitarize_link_test.cpp
@@ -42,8 +42,8 @@ TEST(unitarization, verify)
   unitarizeLinksCPU(*cpuULink, *cpuFatLink);
   cudaULink->saveCPUField(*cudaResult);
 
-  int res = compare_floats(cudaResult->Gauge_p(), cpuULink->Gauge_p(), 4 * cudaResult->Volume() * gauge_site_size,
-                           unittol, cpu_prec);
+  int res = compare_floats(cudaResult->data(), cpuULink->data(), 4 * cudaResult->Volume() * gauge_site_size, unittol,
+                           cpu_prec);
 
 #ifdef MULTI_GPU
   quda::comm_allreduce_int(res);
diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp
index ce188dbdcc..4df1882297 100644
--- a/tests/utils/host_utils.cpp
+++ b/tests/utils/host_utils.cpp
@@ -1272,7 +1272,7 @@ void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precisi
     checkGauge((float **)oldG, (float **)newG, epsilon);
 }
 
-void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
+void createSiteLinkCPU(void *const *link, QudaPrecision precision, int phase)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
     constructUnitaryGaugeField((double **)link);
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index 569cae8643..4d9b284e0a 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -154,7 +154,7 @@ int fullLatticeIndex(int i, int oddBit);
 int fullLatticeIndex(int dim[], int index, int oddBit);
 int getOddBit(int X);
 
-void createSiteLinkCPU(void **link, QudaPrecision precision, int phase);
+void createSiteLinkCPU(void *const *const link, QudaPrecision precision, int phase);
 void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision);
 void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision,
                      QudaGaugeParam *param);
diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp
index 118b849d17..3750fe05bc 100644
--- a/tests/utils/staggered_host_utils.cpp
+++ b/tests/utils/staggered_host_utils.cpp
@@ -490,7 +490,7 @@ void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, vo
   unitarizeLinksCPU(*cpuWLink, *cpuVLink);
 
   // Copy back into "w_reflink"
-  reorderMILCtoQDP(w_reflink, cpuWLink->Gauge_p(), V, gauge_site_size, prec, prec);
+  reorderMILCtoQDP(w_reflink, cpuWLink->data(), V, gauge_site_size, prec, prec);
 
   // Clean up cpuGaugeFields, we don't need them anymore.
   delete cpuVLink;

From 53b7517b3e7ae34650339170e23f43f0fe3d8e50 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 21 Jul 2022 14:44:15 -0700
Subject: [PATCH 02/60] Improve error reporting when vol_string exceeds max
 size

---
 lib/lattice_field.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp
index a2a9340dd6..c657a89e3b 100644
--- a/lib/lattice_field.cpp
+++ b/lib/lattice_field.cpp
@@ -557,7 +557,8 @@ namespace quda {
     vol_ss << x[0];
     for (int d = 1; d < nDim; d++) vol_ss << "x" << x[d];
     vol_string = vol_ss.str();
-    if (vol_string.size() >= TuneKey::volume_n) errorQuda("Vol string too large %lu", vol_string.size());
+    if (vol_string.size() >= TuneKey::volume_n)
+      errorQuda("Vol string %s (size = %lu) larger than maximum %d", vol_string.c_str(), vol_string.size(), TuneKey::volume_n);
   }
 
   void LatticeField::checkField(const LatticeField &a) const {

From c3fb2eb4959cf4cd6bde07bacb2ba144a463ed61 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 8 May 2023 11:31:43 -0700
Subject: [PATCH 03/60] Significant rework of memory allocation to facilitate
 gauge field unification.  Introduced new memory allocation wrapper quda_ptr,
 which is deployed for gauge field allocations.  Still a WIP

---
 include/enum_quda.h                           |   5 +-
 include/enum_quda_fortran.h                   |   5 -
 include/gauge_field.h                         |  80 +++---
 include/gauge_field_order.h                   |  26 +-
 include/lattice_field.h                       |   6 +-
 include/malloc_quda.h                         |  78 ++++++
 include/quda_api.h                            |  21 +-
 lib/coarse_op_preconditioned.cu               |   3 +-
 lib/color_spinor_field.cpp                    |   8 +-
 lib/cpu_gauge_field.cpp                       | 163 +++---------
 lib/cuda_gauge_field.cpp                      | 136 +---------
 lib/gauge_field.cpp                           | 249 ++++++++++++------
 lib/interface_quda.cpp                        |   2 +-
 lib/lattice_field.cpp                         |  10 +-
 lib/targets/cuda/malloc.cpp                   | 143 ++++++++++
 lib/targets/cuda/quda_api.cpp                 |  21 ++
 tests/covdev_test.cpp                         |   7 +-
 tests/gauge_force_test.cpp                    |  13 +-
 tests/hisq_paths_force_test.cpp               |   2 +-
 tests/hisq_unitarize_force_test.cpp           |  12 +-
 tests/host_reference/covdev_reference.cpp     |  41 +--
 tests/host_reference/covdev_reference.h       |  12 +-
 .../domain_wall_dslash_reference.cpp          |   6 +-
 tests/host_reference/dslash_reference.cpp     |  10 +-
 tests/host_reference/dslash_reference.h       |   6 +-
 .../host_reference/gauge_force_reference.cpp  |   7 +-
 tests/host_reference/gauge_force_reference.h  |   4 +-
 .../wilson_dslash_reference.cpp               |   3 +-
 tests/multigrid_evolve_test.cpp               |  39 ++-
 tests/staggered_dslash_test_utils.h           |   8 +-
 tests/staggered_invert_test.cpp               |   6 +-
 tests/utils/host_utils.cpp                    |   6 +
 tests/utils/host_utils.h                      |   1 +
 tests/utils/misc.cpp                          |   4 +-
 34 files changed, 658 insertions(+), 485 deletions(-)

diff --git a/include/enum_quda.h b/include/enum_quda.h
index 62f580e50d..665cffbf91 100644
--- a/include/enum_quda.h
+++ b/include/enum_quda.h
@@ -10,8 +10,11 @@ typedef enum qudaError_t { QUDA_SUCCESS = 0, QUDA_ERROR = 1, QUDA_ERROR_UNINITIA
 
 typedef enum QudaMemoryType_s {
   QUDA_MEMORY_DEVICE,
-  QUDA_MEMORY_PINNED,
+  QUDA_MEMORY_DEVICE_PINNED,
+  QUDA_MEMORY_HOST,
+  QUDA_MEMORY_HOST_PINNED,
   QUDA_MEMORY_MAPPED,
+  QUDA_MEMORY_MANAGED,
   QUDA_MEMORY_INVALID = QUDA_INVALID_ENUM
 } QudaMemoryType;
 
diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h
index 5e17a9df8f..21da3c138b 100644
--- a/include/enum_quda_fortran.h
+++ b/include/enum_quda_fortran.h
@@ -17,11 +17,6 @@
 #define QUDA_ERROR 1
 #define QUDA_ERROR_UNINITIALIZED 2
 
-#define QUDA_MEMORY_DEVICE 0
-#define QUDA_MEMORY_PINNED 1
-#define QUDA_MEMORY_MAPPED 2
-#define QUDA_MEMORY_INVALID QUDA_INVALID_ENUM
-
 #define QUDA_SU3_LINKS      0
 #define QUDA_GENERAL_LINKS  1
 #define QUDA_THREE_LINKS    2
diff --git a/include/gauge_field.h b/include/gauge_field.h
index 7e484bd3a0..155f68b958 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -195,9 +195,8 @@ namespace quda {
   class GaugeField : public LatticeField {
 
   protected:
-    void *gauge; /** The gauge field allocation */
-    void *gauge_h; /** Mapped-memory pointer when allocating on the host */
-    void **gauge_qdp; /** Array of pointers to each subset (QDP order) */
+    quda_ptr gauge; /** The gauge field allocation */
+    array<quda_ptr, 8> gauge_array; /** Array of pointers to each subset (e.g., QDP or QDPJIT order) */
       size_t bytes;        // bytes allocated per full field
       size_t phase_offset; // offset in bytes to gauge phases - useful to keep track of texture alignment
       size_t phase_bytes;  // bytes needed to store the phases
@@ -221,7 +220,7 @@ namespace quda {
 
       QudaFieldCreate create; // used to determine the type of field created
 
-      mutable void *ghost[2 * QUDA_MAX_DIM]; // stores the ghost zone of the gauge field (non-native fields only)
+      mutable array<quda_ptr, 2 * QUDA_MAX_DIM> ghost; // stores the ghost zone of the gauge field (non-native fields only)
 
       mutable int ghostFace[QUDA_MAX_DIM]; // the size of each face
 
@@ -273,6 +272,11 @@ namespace quda {
       */
       void setTuningString();
 
+    /**
+       @brief Initialize the padded region to 0
+     */
+    void zeroPad();
+
   public:
     GaugeField(const GaugeFieldParam &param);
     virtual ~GaugeField();
@@ -372,28 +376,55 @@ namespace quda {
 
     /**
        @brief Return base pointer to the gauge field allocation.
-       @tparam T Optional type to cast the pointer to.
+       @tparam T Optional type to cast the pointer to (default is void*).
        @return Base pointer to the gauge field allocation
      */
-    template <typename T = void*> auto data() const
+    template <typename T = void*>
+    std::enable_if_t<std::is_pointer_v<T> && !std::is_pointer_v<typename std::remove_pointer<T>::type>, T> data() const
     {
-      static_assert(std::is_pointer_v<T>, "data() requires a pointer cast type");
-
-      using U = typename std::remove_pointer<T>::type;
-      if constexpr (std::is_pointer_v<U>) {
-        if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order);
-        return reinterpret_cast<T>(gauge_qdp);
-      } else {
-        if (is_pointer_array(order) && !std::is_same_v<T, void*>) errorQuda("Non dim-array ordered field requested but order is %d", order);
-        return reinterpret_cast<T>(gauge);
-      }
+      if (is_pointer_array(order))
+        errorQuda("Non dim-array ordered field requested but order is %d", order);
+      return reinterpret_cast<T>(gauge.data());
+    }
+
+    /**
+       @brief Return base pointer to the gauge field allocation
+       specified by the array index.  This is for geometry-array
+       ordered fields, e.g., QDP or QDPJIT.
+
+       @tparam T Optional type to cast the pointer to (default is void*)
+       @param[in] d Array index, must be less than the field geometry
+       @return Base pointer to the gauge field allocation
+     */
+    template <typename T = void*> auto data(unsigned int d) const
+    {
+      static_assert(std::is_pointer_v<T> && !std::is_pointer_v<typename std::remove_pointer<T>::type>, "data() requires a pointer cast type");
+      if (d >= (unsigned)geometry) errorQuda("Invalid array index %u for geometry %d field", d, geometry);
+      if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order);
+      return reinterpret_cast<T>(gauge_array[d].data());
+    }
+
+    /**
+       @brief Return array of pointers to the per-dimension gauge field allocation(s).
+       This is for geometry-array ordered fields, e.g., QDP or QDPJIT.
+       @tparam T Optional type to cast each pointer to (default is void*).
+       @return Array of pointers to the gauge field allocations; entries at
+       index >= geometry are left null
+     */
+    template <typename T = void*>
+    std::enable_if_t<std::is_pointer_v<T> && !std::is_pointer_v<typename std::remove_pointer<T>::type>, array<T, QUDA_MAX_DIM>> data_array() const
+    {
+      if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order);
+      array<T, QUDA_MAX_DIM> u = {};
+      for (auto d = 0; d < geometry; d++) u[d] = static_cast<T>(gauge_array[d].data());
+      return u;
     }
 
     virtual int full_dim(int d) const { return x[d]; }
 
-    auto Ghost() const {
+    auto& Ghost() const {
       if ( isNative() ) errorQuda("No ghost zone pointer for quda-native gauge fields");
-      return (void * const *)ghost;
+      return ghost;
     }
 
     /**
@@ -486,16 +517,8 @@ namespace quda {
 
   class cudaGaugeField : public GaugeField {
 
-  private:
-
-    /**
-       @brief Initialize the padded region to 0
-     */
-    void zeroPad();
-
   public:
     cudaGaugeField(const GaugeFieldParam &);
-    virtual ~cudaGaugeField();
 
     /**
        @brief Exchange the ghost and store store in the padded region
@@ -621,8 +644,6 @@ namespace quda {
       @param[in] the host buffer to copy from.
     */
     virtual void copy_from_buffer(void *buffer);
-
-    void setGauge(void* _gauge); //only allowed when create== QUDA_REFERENCE_FIELD_CREATE
   };
 
   class cpuGaugeField : public GaugeField {
@@ -640,7 +661,6 @@ namespace quda {
        extended.
     */
     cpuGaugeField(const GaugeFieldParam &param);
-    virtual ~cpuGaugeField();
 
     /**
        @brief Exchange the ghost and store store in the padded region
@@ -695,8 +715,6 @@ namespace quda {
       @param[in] the host buffer to copy from.
     */
     virtual void copy_from_buffer(void *buffer);
-
-    void setGauge(void** _gauge); //only allowed when create== QUDA_REFERENCE_FIELD_CREATE
   };
 
   /**
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 0fd3e944d5..3b9db8648b 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -356,7 +356,7 @@ namespace quda {
         scale_inv(static_cast<Float>(1.0))
       {
         for (int d = 0; d < U.Geometry(); d++)
-          u[d] = gauge_ ? static_cast<complex<storeFloat> **>(gauge_)[d] : U.data<complex<storeFloat> *const *>()[d];
+          u[d] = gauge_ ? static_cast<complex<storeFloat> **>(gauge_)[d] : U.data<complex<storeFloat> *>(d);
         resetScale(U.Scale());
       }
 
@@ -427,12 +427,12 @@ namespace quda {
       {
         for (int d=0; d<4; d++) {
 	  ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
-	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d]));
+	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d].data()));
 	  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
 
 	  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
 	    ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
-	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4]));
+	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4].data()));
 	  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
 	}
 
@@ -548,12 +548,12 @@ namespace quda {
       {
         for (int d=0; d<4; d++) {
 	  ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
-	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d]));
+	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d].data()));
 	  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
 
 	  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
 	    ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
-	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4]));
+	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4].data()));
 	  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
 	}
 
@@ -1753,8 +1753,8 @@ namespace quda {
         using store_t = Float;
         using real = typename mapper<Float>::type;
         using complex = complex<real>;
-        Float *ghost[QUDA_MAX_DIM];
-        int faceVolumeCB[QUDA_MAX_DIM];
+        Float *ghost[QUDA_MAX_DIM] = {};
+        int faceVolumeCB[QUDA_MAX_DIM] = {};
         const int volumeCB;
         const int stride;
         const int geometry;
@@ -1769,9 +1769,11 @@ namespace quda {
           if (geometry == QUDA_COARSE_GEOMETRY)
             errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone");
 
-          for (int i = 0; i < 4; i++) {
-            ghost[i] = (ghost_) ? ghost_[i] : (Float *)(u.Ghost()[i]);
-            faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth
+          if (u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) {
+            for (int i = 0; i < 4; i++) {
+              ghost[i] = (ghost_) ? ghost_[i] : (Float *)(u.Ghost()[i].data());
+              faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth
+            }
           }
         }
 
@@ -1831,7 +1833,7 @@ namespace quda {
     QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
       : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
     {
-      for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data<Float *const *>()[i];
+      for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data<Float *>(i);
     }
 
         __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const
@@ -1877,7 +1879,7 @@ namespace quda {
     QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
       : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
     {
-      for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data<Float *const *>()[i];
+      for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data<Float *>(i);
     }
 
         __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const
diff --git a/include/lattice_field.h b/include/lattice_field.h
index 1079da4553..005e09871b 100644
--- a/include/lattice_field.h
+++ b/include/lattice_field.h
@@ -72,7 +72,7 @@ namespace quda {
 
     QudaSiteSubset siteSubset = QUDA_INVALID_SITE_SUBSET;
 
-    QudaMemoryType mem_type = QUDA_MEMORY_DEVICE;
+    QudaMemoryType mem_type = QUDA_MEMORY_INVALID;
 
     /** The type of ghost exchange to be done with this field */
     QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
@@ -104,7 +104,7 @@ namespace quda {
       nDim(nDim),
       pad(pad),
       siteSubset(QUDA_FULL_SITE_SUBSET),
-      mem_type(QUDA_MEMORY_DEVICE),
+      mem_type(location == QUDA_CUDA_FIELD_LOCATION ? QUDA_MEMORY_DEVICE : QUDA_MEMORY_HOST),
       ghostExchange(ghostExchange),
       scale(1.0)
     {
@@ -128,7 +128,7 @@ namespace quda {
       nDim(4),
       pad(0),
       siteSubset(QUDA_FULL_SITE_SUBSET),
-      mem_type(QUDA_MEMORY_DEVICE),
+      mem_type(QUDA_MEMORY_HOST),
       ghostExchange(QUDA_GHOST_EXCHANGE_NO),
       scale(param.scale)
     {
diff --git a/include/malloc_quda.h b/include/malloc_quda.h
index 8df59bbf56..d1a7de9161 100644
--- a/include/malloc_quda.h
+++ b/include/malloc_quda.h
@@ -114,6 +114,9 @@ namespace quda {
 #define register_pinned(ptr, bytes) quda::register_pinned_(__func__, quda::file_name(__FILE__), __LINE__, ptr, bytes)
 #define unregister_pinned(size) quda::unregister_pinned_(__func__, quda::file_name(__FILE__), __LINE__, ptr)
 
+#define quda_malloc(size) quda::quda_malloc_(__func__, quda::file_name(__FILE__), __LINE__, size)
+#define quda_free(ptr) quda::quda_free_(__func__, quda::file_name(__FILE__), __LINE__, ptr)
+
 namespace quda {
 
   namespace pool {
@@ -169,3 +172,78 @@ namespace quda {
 #define pool_device_free(ptr) quda::pool::device_free_(__func__, __FILE__, __LINE__, ptr)
 #define pool_pinned_malloc(size) quda::pool::pinned_malloc_(__func__, __FILE__, __LINE__, size)
 #define pool_pinned_free(ptr) quda::pool::pinned_free_(__func__, __FILE__, __LINE__, ptr)
+
+namespace quda {
+
+  /**
+     Object that stores a memory allocation with separate views for
+     host and device.  Depending on the underlying memory type, only
+     one of the two views may be defined:
+
+     type                       defined views
+     QUDA_MEMORY_DEVICE         device only
+     QUDA_MEMORY_DEVICE_PINNED  device only
+     QUDA_MEMORY_HOST           host only
+     QUDA_MEMORY_HOST_PINNED    both
+     QUDA_MEMORY_MAPPED         both (pinned to host)
+     QUDA_MEMORY_MANAGED        both
+   */
+  class quda_ptr {
+    QudaMemoryType type = QUDA_MEMORY_INVALID;
+    size_t size = 0;
+    bool pool = false;
+    void *device = nullptr;
+    void *host = nullptr;
+
+  public:
+    quda_ptr() = default;
+
+    quda_ptr &operator=(quda_ptr &&);
+
+    /**
+       @brief Constructor for quda_ptr
+       @param[in] type The memory type of the allocation
+       @param[in] size The size of the allocation
+       @param[in] pool Whether the allocation should be in the memory pool (default is true)
+    */
+    quda_ptr(QudaMemoryType type, size_t size, bool pool = true);
+
+    /**
+       @brief Constructor for quda_ptr where we are wrapping a non-owned pointer
+       @param[in] ptr Raw base pointer
+       @param[in] type The memory type of the allocation
+    */
+    quda_ptr(void *ptr, QudaMemoryType type);
+
+    /**
+       @brief Destructor for the quda_ptr
+    */
+    virtual ~quda_ptr();
+
+    /**
+       @return Returns true if allocation is visible to the device
+    */
+    bool is_device() const;
+
+    /**
+       @return Returns true if allocation is visible to the host
+    */
+    bool is_host() const;
+
+    /**
+       Return view of the pointer.  For mapped memory we return the device view.
+     */
+    void *data() const;
+
+    /**
+       Return the device view of the pointer
+     */
+    void *data_device() const;
+
+    /**
+       Return the host view of the pointer
+     */
+    void *data_host() const;
+  };
+
+}
diff --git a/include/quda_api.h b/include/quda_api.h
index 45c226ba19..ea475c43f6 100644
--- a/include/quda_api.h
+++ b/include/quda_api.h
@@ -3,6 +3,7 @@
 #include <quda_define.h>
 #include <string>
 #include <enum_quda.h>
+#include <malloc_quda.h>
 
 /**
    @file quda_api.h
@@ -63,6 +64,14 @@ namespace quda
   void qudaMemcpyP2PAsync_(void *dst, const void *src, size_t count, const qudaStream_t &stream, const char *func,
                            const char *file, const char *line);
 
+  /**
+     @brief Heterogeneous memset function
+     @param[out] ptr Heterogeneous pointer
+     @param[in] value Value to set for each byte of specified memory
+     @param[in] count Size in bytes to set
+   */
+  void qudaMemset_(quda_ptr &ptr, int value, size_t count, const char *func, const char *file, const char *line);
+
   /**
      @brief Wrapper around cudaMemset or driver API equivalent
      @param[out] ptr Starting address pointer
@@ -72,15 +81,15 @@ namespace quda
   void qudaMemset_(void *ptr, int value, size_t count, const char *func, const char *file, const char *line);
 
   /**
-     @brief Wrapper around cudaMemset2D or driver API equivalent
-     @param[out] ptr Starting address pointer
+     @brief Heterogeneous memset2d function
+     @param[out] ptr Heterogeneous pointer
+     @param[in] offset Offset shift in bytes from the base pointer
      @param[in] Pitch in bytes
      @param[in] value Value to set for each byte of specified memory
      @param[in] width Width in bytes
      @param[in] height Height in bytes
    */
-  void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func,
-                     const char *file, const char *line);
+  void qudaMemset2D_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height, const char *func, const char *file, const char *line);
 
   /**
      @brief Wrapper around cudaMemsetAsync or driver API equivalent
@@ -224,8 +233,8 @@ namespace quda
 #define qudaMemset(ptr, value, count)                                                                                  \
   ::quda::qudaMemset_(ptr, value, count, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
 
-#define qudaMemset2D(ptr, pitch, value, width, height)                                                                 \
-  ::quda::qudaMemset2D_(ptr, pitch, value, width, height, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
+#define qudaMemset2D(ptr, offset, pitch, value, width, height)          \
+  ::quda::qudaMemset2D_(ptr, offset, pitch, value, width, height, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
 
 #define qudaMemsetAsync(ptr, value, count, stream)                                                                     \
   ::quda::qudaMemsetAsync_(ptr, value, count, stream, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
diff --git a/lib/coarse_op_preconditioned.cu b/lib/coarse_op_preconditioned.cu
index 1a2dbda501..ab8ee88f7f 100644
--- a/lib/coarse_op_preconditioned.cu
+++ b/lib/coarse_op_preconditioned.cu
@@ -187,8 +187,7 @@ namespace quda
     } else if (X.Location() == QUDA_CPU_FIELD_LOCATION && X.Order() == QUDA_QDP_GAUGE_ORDER) {
       const cpuGaugeField *X_h = static_cast<const cpuGaugeField*>(&X);
       cpuGaugeField *Xinv_h = static_cast<cpuGaugeField*>(&Xinv);
-      blas::flops += invert(Xinv_h->data<void *const *>()[0], X_h->data<void *const *>()[0], n, X_h->Volume(),
-                            X.Precision(), X.Location());
+      blas::flops += invert(Xinv_h->data<void *>(0), X_h->data<void *>(0), n, X_h->Volume(), X.Precision(), X.Location());
     } else {
       errorQuda("Unsupported location=%d and order=%d", X.Location(), X.Order());
     }
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 4bf7457584..56b6631832 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -908,9 +908,7 @@ namespace quda
     coarseParam.setPrecision(new_precision);
 
     // set where we allocate the field
-    coarseParam.mem_type = (new_mem_type != QUDA_MEMORY_INVALID) ?
-      new_mem_type :
-      (new_location == QUDA_CUDA_FIELD_LOCATION ? QUDA_MEMORY_DEVICE : QUDA_MEMORY_PINNED);
+    coarseParam.mem_type = new_mem_type;
 
     return new ColorSpinorField(coarseParam);
   }
@@ -941,9 +939,7 @@ namespace quda
     }
 
     // set where we allocate the field
-    fineParam.mem_type = (new_mem_type != QUDA_MEMORY_INVALID) ?
-      new_mem_type :
-      (new_location == QUDA_CUDA_FIELD_LOCATION ? QUDA_MEMORY_DEVICE : QUDA_MEMORY_PINNED);
+    fineParam.mem_type = new_mem_type;
 
     return new ColorSpinorField(fineParam);
   }
diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp
index f3063d5d32..604bf04c13 100644
--- a/lib/cpu_gauge_field.cpp
+++ b/lib/cpu_gauge_field.cpp
@@ -10,112 +10,16 @@ namespace quda {
   cpuGaugeField::cpuGaugeField(const GaugeFieldParam &param) :
     GaugeField(param)
   {
-    if (precision == QUDA_HALF_PRECISION) {
-      errorQuda("CPU fields do not support half precision");
-    }
-    if (precision == QUDA_QUARTER_PRECISION) {
-      errorQuda("CPU fields do not support quarter precision");
-    }
-    if (pad != 0) {
-      errorQuda("CPU fields do not support non-zero padding");
-    }
-    if (reconstruct != QUDA_RECONSTRUCT_NO && reconstruct != QUDA_RECONSTRUCT_10) {
-      errorQuda("Reconstruction type %d not supported", reconstruct);
-    }
-    if (reconstruct == QUDA_RECONSTRUCT_10 && link_type != QUDA_ASQTAD_MOM_LINKS) {
-      errorQuda("10-reconstruction only supported with momentum links");
-    }
-
-    // compute the correct bytes size for these padded field orders
-    if (order == QUDA_TIFR_PADDED_GAUGE_ORDER) {
-      bytes = site_dim * (x[0] * x[1] * (x[2] + 4) * x[3]) * nInternal * precision;
-    } else if (order == QUDA_BQCD_GAUGE_ORDER) {
-      bytes = site_dim * (x[0] + 4) * (x[1] + 2) * (x[2] + 2) * (x[3] + 2) * nInternal * precision;
-    } else if (order == QUDA_MILC_SITE_GAUGE_ORDER) {
-      bytes = volume * site_size;
-    }
-
-    if (order == QUDA_QDP_GAUGE_ORDER) {
-      gauge = safe_malloc(site_dim * sizeof(void *));
-      size_t nbytes = volume * nInternal * precision;
-      gauge_qdp = reinterpret_cast<void **>(gauge);
-      for (int d = 0; d < site_dim; d++) {
-        if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
-          gauge_qdp[d] = nbytes ? safe_malloc(nbytes) : nullptr;
-          if (create == QUDA_ZERO_FIELD_CREATE && nbytes) memset(gauge_qdp[d], 0, nbytes);
-        } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
-          gauge_qdp[d] = ((void **)param.gauge)[d];
-        } else {
-          errorQuda("Unsupported creation type %d", create);
-        }
+    // exchange the boundaries if a non-trivial field
+    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD)
+      if (create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) {
+        exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
       }
 
-    } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER  ||
-	       order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_TIFR_GAUGE_ORDER ||
-	       order == QUDA_TIFR_PADDED_GAUGE_ORDER || order == QUDA_MILC_SITE_GAUGE_ORDER) {
-
-      if (order == QUDA_MILC_SITE_GAUGE_ORDER && create != QUDA_REFERENCE_FIELD_CREATE) {
-	errorQuda("MILC site gauge order only supported for reference fields");
-      }
-
-      if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
-        gauge = bytes ? (void **)safe_malloc(bytes) : nullptr;
-        if (create == QUDA_ZERO_FIELD_CREATE && bytes) memset(gauge, 0, bytes);
-      } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
-        gauge = param.gauge;
-      } else {
-	errorQuda("Unsupported creation type %d", create);
-      }
-
-    } else {
-      errorQuda("Unsupported gauge order type %d", order);
-    }
-  
-    // no need to exchange data if this is a momentum field
-    if (link_type != QUDA_ASQTAD_MOM_LINKS) {
-      // Ghost zone is always 2-dimensional    
-      for (int i=0; i<nDim; i++) {
-	size_t nbytes = nFace * surface[i] * nInternal * precision;
-	ghost[i] = nbytes ? safe_malloc(nbytes) : nullptr;
-	ghost[i+4] = (nbytes && geometry == QUDA_COARSE_GEOMETRY) ? safe_malloc(nbytes) : nullptr;
-      }
-
-      if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
-	// exchange the boundaries if a non-trivial field
-	if (create != QUDA_NULL_FIELD_CREATE && create != QUDA_ZERO_FIELD_CREATE &&
-	    (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY) )
-	  exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
-      }
-    }
-
     // compute the fat link max now in case it is needed later (i.e., for half precision)
     if (param.compute_fat_link_max) fat_link_max = this->abs_max();
   }
 
-
-  cpuGaugeField::~cpuGaugeField()
-  {
-    if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
-      if (order == QUDA_QDP_GAUGE_ORDER) {
-        for (int d = 0; d < site_dim; d++) {
-          if (gauge_qdp[d]) host_free(gauge_qdp[d]);
-        }
-      }
-      if (gauge) host_free(gauge);
-    } else { // QUDA_REFERENCE_FIELD_CREATE 
-      if (order == QUDA_QDP_GAUGE_ORDER){
-	if (gauge) host_free(gauge);
-      }
-    }
-  
-    if (link_type != QUDA_ASQTAD_MOM_LINKS) {
-      for (int i=0; i<nDim; i++) {
-	if (ghost[i]) host_free(ghost[i]);
-	if (ghost[i+4] && geometry == QUDA_COARSE_GEOMETRY) host_free(ghost[i+4]);
-      }
-    }
-  }
-
   // This does the exchange of the gauge field ghost zone and places it
   // into the ghost array.
   void cpuGaugeField::exchangeGhost(QudaLinkDirection link_direction) {
@@ -125,27 +29,30 @@ namespace quda {
     if ( (link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS) && geometry != QUDA_COARSE_GEOMETRY)
       errorQuda("Cannot request exchange of forward links on non-coarse geometry");
 
-    void *send[2*QUDA_MAX_DIM];
+    void *send[2 * QUDA_MAX_DIM];
     for (int d=0; d<nDim; d++) {
-      send[d] = safe_malloc(nFace*surface[d]*nInternal*precision);
-      if (geometry == QUDA_COARSE_GEOMETRY) send[d+4] = safe_malloc(nFace*surface[d]*nInternal*precision);
+      send[d] = safe_malloc(nFace * surface[d] * nInternal * precision);
+      if (geometry == QUDA_COARSE_GEOMETRY) send[d+4] = safe_malloc(nFace * surface[d] * nInternal * precision);
     }
 
+    void *ghost_[2 * QUDA_MAX_DIM];
+    for (auto i = 0; i < geometry; i++) ghost_[i] = ghost[i].data();
+
+    // get the links into contiguous buffers
     if (link_direction == QUDA_LINK_BACKWARDS || link_direction == QUDA_LINK_BIDIRECTIONAL) {
-      // get the links into contiguous buffers
       extractGaugeGhost(*this, send, true);
 
       // communicate between nodes
-      exchange(ghost, send, QUDA_FORWARDS);
+      exchange(ghost_, send, QUDA_FORWARDS);
     }
 
     // repeat if requested and links are bi-directional
     if (link_direction == QUDA_LINK_FORWARDS || link_direction == QUDA_LINK_BIDIRECTIONAL) {
       extractGaugeGhost(*this, send, true, nDim);
-      exchange(ghost+nDim, send+nDim, QUDA_FORWARDS);
+      exchange(ghost_+nDim, send+nDim, QUDA_FORWARDS);
     }
 
-    for (int d=0; d<geometry; d++) host_free(send[d]);
+    for (int d = 0; d < geometry; d++) host_free(send[d]);
   }
 
   // This does the opposite of exchangeGhost and sends back the ghost
@@ -158,16 +65,19 @@ namespace quda {
     if (link_direction != QUDA_LINK_BACKWARDS)
       errorQuda("link_direction = %d not supported", link_direction);
 
-    void *recv[2*QUDA_MAX_DIM];
+    void *recv[QUDA_MAX_DIM];
     for (int d=0; d<nDim; d++) recv[d] = safe_malloc(nFace*surface[d]*nInternal*precision);
 
+    void *ghost_[] = {ghost[0].data(), ghost[1].data(), ghost[2].data(), ghost[3].data(),
+                      ghost[4].data(), ghost[5].data(), ghost[6].data(), ghost[7].data()};
+
     // communicate between nodes
-    exchange(recv, ghost, QUDA_BACKWARDS);
+    exchange(recv, ghost_, QUDA_BACKWARDS);
 
     // get the links into contiguous buffers
     extractGaugeGhost(*this, recv, false);
 
-    for (int d=0; d<nDim; d++) host_free(recv[d]);
+    for (int d = 0; d < nDim; d++) host_free(recv[d]); // only the first nDim entries of recv were allocated above
   }
 
   void cpuGaugeField::exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill)
@@ -195,22 +105,22 @@ namespace quda {
 	MsgHandle *mh_recv_fwd;
 	MsgHandle *mh_send_fwd;
 	MsgHandle *mh_send_back;
-	
+
 	mh_recv_back = comm_declare_receive_relative(recv[d], d, -1, bytes[d]);
 	mh_recv_fwd  = comm_declare_receive_relative(((char*)recv[d])+bytes[d], d, +1, bytes[d]);
 	mh_send_back = comm_declare_send_relative(send[d], d, -1, bytes[d]);
 	mh_send_fwd  = comm_declare_send_relative(((char*)send[d])+bytes[d], d, +1, bytes[d]);
-	
+
 	comm_start(mh_recv_back);
 	comm_start(mh_recv_fwd);
 	comm_start(mh_send_fwd);
 	comm_start(mh_send_back);
-	
+
 	comm_wait(mh_send_fwd);
 	comm_wait(mh_send_back);
 	comm_wait(mh_recv_back);
 	comm_wait(mh_recv_fwd);
-	
+
 	comm_free(mh_send_fwd);
 	comm_free(mh_send_back);
 	comm_free(mh_recv_back);
@@ -218,7 +128,7 @@ namespace quda {
       } else {
 	memcpy(static_cast<char*>(recv[d])+bytes[d], send[d], bytes[d]);
 	memcpy(recv[d], static_cast<char*>(send[d])+bytes[d], bytes[d]);
-      }      
+      }
 
       // inject back into the gauge field
       extractExtendedGaugeGhost(*this, d, R, recv, false);
@@ -286,15 +196,15 @@ namespace quda {
 
 	if (order == QUDA_QDP_GAUGE_ORDER) {
 	  for (int d=0; d<geometry; d++) {
-            qudaMemcpy(((void **)gauge)[d], ((void **)buffer)[d], bytes / geometry, qudaMemcpyDeviceToHost);
+            qudaMemcpy(gauge_array[d].data(), ((void **)buffer)[d], bytes / geometry, qudaMemcpyDeviceToHost);
           }
 	} else {
-          qudaMemcpy(gauge, buffer, bytes, qudaMemcpyHostToDevice);
+          qudaMemcpy(gauge.data(), buffer, bytes, qudaMemcpyDeviceToHost); // same direction as the QDP branch above
         }
 
 	if (order > 4 && ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
 	  for (int d=0; d<geometry; d++)
-            qudaMemcpy(Ghost()[d], ghost_buffer[d], ghost_bytes[d], qudaMemcpyDeviceToHost);
+            qudaMemcpy(Ghost()[d].data(), ghost_buffer[d], ghost_bytes[d], qudaMemcpyDeviceToHost);
 
         free_gauge_buffer(buffer, order, geometry);
 	if (nFace > 0) free_ghost_buffer(ghost_buffer, order, geometry);
@@ -314,21 +224,12 @@ namespace quda {
     }
   }
 
-  void cpuGaugeField::setGauge(void **gauge_)
-  {
-    if(create != QUDA_REFERENCE_FIELD_CREATE) {
-      errorQuda("Setting gauge pointer is only allowed when create="
-		"QUDA_REFERENCE_FIELD_CREATE type\n");
-    }
-    gauge = gauge_;
-  }
-
   void cpuGaugeField::copy_to_buffer(void *buffer) const
   {
     if (is_pointer_array(order)) {
       char *dst_buffer = reinterpret_cast<char *>(buffer);
-      for (int d = 0; d < geometry; d++) {
-        std::memcpy(&dst_buffer[d * bytes / geometry], data<void *const *>()[d], bytes / geometry);
+      for (int d = 0; d < site_dim; d++) {
+        std::memcpy(&dst_buffer[d * bytes / site_dim], gauge_array[d].data(), bytes / site_dim);
       }
     } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
                || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
@@ -343,8 +244,8 @@ namespace quda {
   {
     if (is_pointer_array(order)) {
       const char *dst_buffer = reinterpret_cast<const char *>(buffer);
-      for (int d = 0; d < geometry; d++) {
-        std::memcpy(data<void *const *>()[d], &dst_buffer[d * bytes / geometry], Bytes() / geometry);
+      for (int d = 0; d < site_dim; d++) {
+        std::memcpy(gauge_array[d].data(), &dst_buffer[d * bytes / site_dim], bytes / site_dim);
       }
     } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
                || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
diff --git a/lib/cuda_gauge_field.cpp b/lib/cuda_gauge_field.cpp
index bb0299d027..ae21213770 100644
--- a/lib/cuda_gauge_field.cpp
+++ b/lib/cuda_gauge_field.cpp
@@ -9,112 +9,11 @@ namespace quda {
 
   cudaGaugeField::cudaGaugeField(const GaugeFieldParam &param) : GaugeField(param)
   {
-    if ((order == QUDA_QDP_GAUGE_ORDER || order == QUDA_QDPJIT_GAUGE_ORDER) &&
-        create != QUDA_REFERENCE_FIELD_CREATE) {
-      errorQuda("QDP ordering only supported for reference fields");
-    }
-
-    if (order == QUDA_QDP_GAUGE_ORDER ||
-	order == QUDA_TIFR_GAUGE_ORDER || order == QUDA_TIFR_PADDED_GAUGE_ORDER ||
-	order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_CPS_WILSON_GAUGE_ORDER)
-      errorQuda("Field ordering %d presently disabled for this type", order);
-
-#ifdef MULTI_GPU
-    if (link_type != QUDA_ASQTAD_MOM_LINKS &&
-	ghostExchange == QUDA_GHOST_EXCHANGE_PAD &&
-	isNative()) {
-      bool pad_check = true;
-      for (int i=0; i<nDim; i++) {
-	// when we have coarse links we need to double the pad since we're storing forwards and backwards links
-	int minimum_pad = nFace*surfaceCB[i] * (geometry == QUDA_COARSE_GEOMETRY ? 2 : 1);
-	if (pad < minimum_pad) pad_check = false;
-	if (!pad_check)
-	  errorQuda("cudaGaugeField being constructed with insufficient padding in dim %d (%d < %d)\n", i, pad, minimum_pad);
-      }
-    }
-#endif
-
-    if (create != QUDA_NULL_FIELD_CREATE &&
-        create != QUDA_ZERO_FIELD_CREATE &&
-        create != QUDA_REFERENCE_FIELD_CREATE){
-      errorQuda("ERROR: create type(%d) not supported yet\n", create);
-    }
-
-    if (create != QUDA_REFERENCE_FIELD_CREATE) {
-      switch(mem_type) {
-      case QUDA_MEMORY_DEVICE: gauge = bytes ? pool_device_malloc(bytes) : nullptr; break;
-      case QUDA_MEMORY_MAPPED:
-        gauge_h = bytes ? mapped_malloc(bytes) : nullptr;
-        gauge = bytes ? get_mapped_device_pointer(gauge_h) : nullptr; // set the matching device pointer
-        break;
-      default:
-	errorQuda("Unsupported memory type %d", mem_type);
-      }
-      if (create == QUDA_ZERO_FIELD_CREATE && bytes) qudaMemset(gauge, 0, bytes);
-    } else {
-      gauge = param.gauge;
-    }
-
-    if ( !isNative() ) {
-      for (int i=0; i<nDim; i++) {
-        size_t nbytes = nFace * surface[i] * nInternal * precision;
-        ghost[i] = nbytes ? pool_device_malloc(nbytes) : nullptr;
-	ghost[i+4] = (nbytes && geometry == QUDA_COARSE_GEOMETRY) ? pool_device_malloc(nbytes) : nullptr;
-      }
-    }
-
-    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
-      if (create == QUDA_REFERENCE_FIELD_CREATE) exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
-    }
-
-    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
-      if (isNative()) {
-        if (create != QUDA_ZERO_FIELD_CREATE) zeroPad();
-      } else {
-        for (int i = 0; i < nDim; i++) {
-          size_t nbytes = nFace * surface[i] * nInternal * precision;
-          qudaMemset(ghost[i], 0, nbytes);
-          if (nbytes && geometry == QUDA_COARSE_GEOMETRY) qudaMemset(ghost[i + 4], 0, nbytes);
-        }
+    // exchange the boundaries if a non-trivial field
+    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD)
+      if (create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) {
+        exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
       }
-    }
-  }
-
-  void cudaGaugeField::zeroPad() {
-    size_t pad_bytes = (stride - volumeCB) * precision * order;
-    int Npad = (geometry * (reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : nColor * nColor * 2)) / order;
-
-    size_t pitch = stride*order*precision;
-    if (pad_bytes) {
-      qudaMemset2D(static_cast<char *>(gauge) + 0 * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes,
-                   Npad);
-      qudaMemset2D(static_cast<char *>(gauge) + 1 * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes,
-                   Npad);
-    }
-  }
-
-  cudaGaugeField::~cudaGaugeField()
-  {
-    if (create != QUDA_REFERENCE_FIELD_CREATE) {
-      switch(mem_type) {
-      case QUDA_MEMORY_DEVICE:
-        if (gauge) pool_device_free(gauge);
-        break;
-      case QUDA_MEMORY_MAPPED:
-        if (gauge_h) host_free(gauge_h);
-        break;
-      default:
-        errorQuda("Unsupported memory type %d", mem_type);
-      }
-    }
-
-    if ( !isNative() ) {
-      for (int i=0; i<nDim; i++) {
-        if (ghost[i]) pool_device_free(ghost[i]);
-        if (ghost[i + 4] && geometry == QUDA_COARSE_GEOMETRY) pool_device_free(ghost[i + 4]);
-      }
-    }
-
   }
 
   // This does the exchange of the forwards boundary gauge field ghost zone and places
@@ -197,7 +96,7 @@ namespace quda {
       } else {
 	// copy from receive buffer into ghost array
 	for (int dim=0; dim<nDim; dim++)
-          qudaMemcpy(ghost[dim + link_dir * nDim], recv_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
+          qudaMemcpy(ghost[dim + link_dir * nDim].data(), recv_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
       }
 
       bufferIndex = 1-bufferIndex;
@@ -244,7 +143,7 @@ namespace quda {
 	copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, send_d, 0, 1 + 2*link_dir);
       } else { // copy from receive buffer into ghost array
         for (int dim = 0; dim < nDim; dim++)
-          qudaMemcpy(send_d[dim], ghost[dim + link_dir * nDim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
+          qudaMemcpy(send_d[dim], ghost[dim + link_dir * nDim].data(), ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
       }
       qudaDeviceSynchronize(); // need to synchronize before issueing copies in different streams - could replace with event post and wait
 
@@ -456,15 +355,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMMS);
   }
 
-  void cudaGaugeField::setGauge(void *gauge_)
-  {
-    if(create != QUDA_REFERENCE_FIELD_CREATE) {
-      errorQuda("Setting gauge pointer is only allowed when create="
-          "QUDA_REFERENCE_FIELD_CREATE type\n");
-    }
-    gauge = gauge_;
-  }
-
   void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
     if (order == QUDA_QDP_GAUGE_ORDER) {
       void **buffer = new void*[geometry];
@@ -544,7 +434,7 @@ namespace quda {
           if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
 	}
 
-        qudaMemcpy(gauge, buffer, bytes, qudaMemcpyDefault);
+        qudaMemcpy(gauge.data(), buffer, bytes, qudaMemcpyDefault);
         pool_pinned_free(buffer);
       } else { // else on the GPU
 
@@ -555,7 +445,7 @@ namespace quda {
           void *src_d = get_mapped_device_pointer(src.data());
 
           if (src.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
-            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, src_d);
+            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, data(), src_d);
           } else {
             errorQuda("Ghost copy not supported here");
           }
@@ -569,7 +459,7 @@ namespace quda {
 
 	  if (src.Order() == QUDA_QDP_GAUGE_ORDER) {
 	    for (int d=0; d<geometry; d++) {
-              qudaMemcpy(((void **)buffer)[d], src.data<void *const *>()[d], src.Bytes() / geometry, qudaMemcpyDefault);
+              qudaMemcpy(((void **)buffer)[d], src.data(d), src.Bytes() / geometry, qudaMemcpyDefault);
             }
           } else {
             qudaMemcpy(buffer, src.data(), src.Bytes(), qudaMemcpyDefault);
@@ -578,7 +468,7 @@ namespace quda {
           if (src.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
               && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
             for (int d = 0; d < geometry; d++)
-              qudaMemcpy(ghost_buffer[d], src.Ghost()[d], ghost_bytes[d], qudaMemcpyDefault);
+              qudaMemcpy(ghost_buffer[d], src.Ghost()[d].data(), ghost_bytes[d], qudaMemcpyDefault);
 
           if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
             copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer);
@@ -652,7 +542,7 @@ namespace quda {
 
         if (cpu.Order() == QUDA_QDP_GAUGE_ORDER) {
           for (int d = 0; d < geometry; d++)
-            qudaMemcpy((cpu.data<void *const *>())[d], ((void **)buffer)[d], cpu.Bytes() / geometry, qudaMemcpyDefault);
+            qudaMemcpy(cpu.data(d), ((void **)buffer)[d], cpu.Bytes() / geometry, qudaMemcpyDefault);
         } else {
           qudaMemcpy(cpu.data(), buffer, cpu.Bytes(), qudaMemcpyDefault);
         }
@@ -660,7 +550,7 @@ namespace quda {
         if (cpu.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
             && cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
           for (int d = 0; d < geometry; d++)
-            qudaMemcpy(cpu.Ghost()[d], ghost_buffer[d], ghost_bytes[d], qudaMemcpyDefault);
+            qudaMemcpy(cpu.Ghost()[d].data(), ghost_buffer[d], ghost_bytes[d], qudaMemcpyDefault);
 
         free_gauge_buffer(buffer, cpu.Order(), cpu.Geometry());
         if (nFace > 0) free_ghost_buffer(ghost_buffer, cpu.Order(), geometry);
@@ -668,7 +558,7 @@ namespace quda {
     } else if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do copy then host-side reorder
 
       void *buffer = pool_pinned_malloc(bytes);
-      qudaMemcpy(buffer, gauge, bytes, qudaMemcpyDefault);
+      qudaMemcpy(buffer, gauge.data(), bytes, qudaMemcpyDefault);
 
       if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
         copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, nullptr, buffer);
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index ea17cb4610..caeddfa298 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -29,9 +29,8 @@ namespace quda {
 
   GaugeField::GaugeField(const GaugeFieldParam &param) :
     LatticeField(param),
-    gauge(nullptr),
-    gauge_h(nullptr),
-    gauge_qdp {},
+    gauge(),
+    gauge_array {},
     bytes(0),
     phase_offset(0),
     phase_bytes(0),
@@ -88,43 +87,152 @@ namespace quda {
       errorQuda("Cannot request a 12/8 reconstruct type without SU(3) link type");
     }
 
-    if (reconstruct == QUDA_RECONSTRUCT_9 || reconstruct == QUDA_RECONSTRUCT_13) {
-      // Need to adjust the phase alignment as well.
-      int half_phase_bytes
-        = (length / (2 * reconstruct)) * precision; // number of bytes needed to store phases for a single parity
-      int half_gauge_bytes = (length / 2) * precision
-        - half_phase_bytes; // number of bytes needed to store the gauge field for a single parity excluding the phases
-      // Adjust the alignments for the gauge and phase separately
-      half_phase_bytes = ((half_phase_bytes + (512-1))/512)*512;
-      half_gauge_bytes = ((half_gauge_bytes + (512-1))/512)*512;
-    
-      phase_offset = half_gauge_bytes;
-      phase_bytes = half_phase_bytes*2;
-      bytes = (half_gauge_bytes + half_phase_bytes)*2;      
+    if (reconstruct == QUDA_RECONSTRUCT_10 && link_type != QUDA_ASQTAD_MOM_LINKS) {
+      errorQuda("10-reconstruction only supported with momentum links");
+    }
+
+    if (create != QUDA_NULL_FIELD_CREATE && create != QUDA_ZERO_FIELD_CREATE && create != QUDA_REFERENCE_FIELD_CREATE) {
+      errorQuda("ERROR: create type(%d) not supported yet\n", create);
+    }
+
+    switch (geometry) {
+    case QUDA_SCALAR_GEOMETRY: site_dim = 1; break;
+    case QUDA_VECTOR_GEOMETRY: site_dim = nDim; break;
+    case QUDA_TENSOR_GEOMETRY: site_dim = nDim * (nDim - 1) / 2; break;
+    case QUDA_COARSE_GEOMETRY: site_dim = 2 * nDim; break;
+    case QUDA_KDINVERSE_GEOMETRY: site_dim = 1 << nDim; break;
+    default: errorQuda("Unknown geometry type %d", geometry);
+    }
+
+    if (isNative()) {
+      if (reconstruct == QUDA_RECONSTRUCT_9 || reconstruct == QUDA_RECONSTRUCT_13) {
+        // Need to adjust the phase alignment as well.
+        int half_phase_bytes
+          = (length / (2 * reconstruct)) * precision; // bytes needed to store phases for a single parity
+        int half_gauge_bytes = (length / 2) * precision
+          - half_phase_bytes; // bytes needed to store the gauge field for a single parity excluding the phases
+        // Adjust the alignments for the gauge and phase separately
+        half_phase_bytes = ALIGNMENT_ADJUST(half_phase_bytes);
+        half_gauge_bytes = ALIGNMENT_ADJUST(half_gauge_bytes);
+        phase_offset = half_gauge_bytes;
+        phase_bytes = half_phase_bytes * 2;
+        bytes = (half_gauge_bytes + half_phase_bytes) * 2;
+      } else {
+        bytes = length * precision;
+        bytes = 2 * ALIGNMENT_ADJUST(bytes / 2);
+      }
     } else {
-      bytes = length * precision;
-      if (isNative()) bytes = 2*ALIGNMENT_ADJUST(bytes/2);
+      // compute the correct bytes size for these padded field orders
+      if (order == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+        bytes = site_dim * (x[0] * x[1] * (x[2] + 4) * x[3]) * nInternal * precision;
+      } else if (order == QUDA_BQCD_GAUGE_ORDER) {
+        bytes = site_dim * (x[0] + 4) * (x[1] + 2) * (x[2] + 2) * (x[3] + 2) * nInternal * precision;
+      } else if (order == QUDA_MILC_SITE_GAUGE_ORDER) {
+        bytes = volume * site_size;
+      } else {
+        bytes = length * precision;
+      }
     }
+
     total_bytes = bytes;
 
-    if (geometry == QUDA_SCALAR_GEOMETRY)
-      site_dim = 1;
-    else if (geometry == QUDA_VECTOR_GEOMETRY)
-      site_dim = nDim;
-    else if (geometry == QUDA_TENSOR_GEOMETRY)
-      site_dim = nDim * (nDim - 1) / 2;
-    else if (geometry == QUDA_COARSE_GEOMETRY)
-      site_dim = 2 * nDim;
-    else if (geometry == QUDA_KDINVERSE_GEOMETRY)
-      site_dim = 1 << nDim;
-    else
-      errorQuda("Unknown geometry type %d", geometry);
+    if (isNative() && ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
+      bool pad_check = true;
+      for (int i = 0; i < nDim; i++) {
+	// when we have coarse links we need to double the pad since we're storing forwards and backwards links
+	int minimum_pad = comm_dim_partitioned(i) ? nFace*surfaceCB[i] * (geometry == QUDA_COARSE_GEOMETRY ? 2 : 1) : 0;
+	if (pad < minimum_pad) pad_check = false;
+	if (!pad_check) errorQuda("GaugeField being constructed with insufficient padding in dim %d (%d < %d)", i, pad, minimum_pad);
+      }
+    }
+
+    if (isNative()) {
+      if (create != QUDA_REFERENCE_FIELD_CREATE) {
+        gauge = std::move(quda_ptr(mem_type, bytes));
+        if (create == QUDA_ZERO_FIELD_CREATE) qudaMemset(gauge, 0, bytes);
+      } else {
+        gauge = std::move(quda_ptr(param.gauge, mem_type));
+      }
+    } else if (is_pointer_array(order)) {
+
+      size_t nbytes = volume * nInternal * precision;
+      for (int d = 0; d < site_dim; d++) {
+        if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
+          gauge_array[d] = std::move(quda_ptr(mem_type, nbytes));
+          if (create == QUDA_ZERO_FIELD_CREATE) qudaMemset(gauge_array[d], 0, nbytes);
+        } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
+          gauge_array[d] = std::move(quda_ptr(static_cast<void **>(param.gauge)[d], mem_type));
+        } else {
+          errorQuda("Unsupported creation type %d", create);
+        }
+      }
+
+    } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER  ||
+	       order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_TIFR_GAUGE_ORDER ||
+	       order == QUDA_TIFR_PADDED_GAUGE_ORDER || order == QUDA_MILC_SITE_GAUGE_ORDER) {
+      // does not support device
+
+      if (order == QUDA_MILC_SITE_GAUGE_ORDER && create != QUDA_REFERENCE_FIELD_CREATE) {
+	errorQuda("MILC site gauge order only supported for reference fields");
+      }
+
+      if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
+        gauge = std::move(quda_ptr(mem_type, bytes));
+        if (create == QUDA_ZERO_FIELD_CREATE) qudaMemset(gauge, 0, bytes);
+      } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
+        gauge = std::move(quda_ptr(param.gauge, mem_type));
+      } else {
+	errorQuda("Unsupported creation type %d", create);
+      }
+
+    } else {
+      errorQuda("Unsupported gauge order type %d", order);
+    }
+
+    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
+      if (!isNative()) {
+        for (int i=0; i<nDim; i++) {
+          size_t nbytes = nFace * surface[i] * nInternal * precision;
+          ghost[i] = std::move(quda_ptr(mem_type, nbytes));
+          if (geometry == QUDA_COARSE_GEOMETRY) ghost[i+4] = std::move(quda_ptr(mem_type, nbytes));
+
+          qudaMemset(ghost[i], 0, nbytes);
+          if (geometry == QUDA_COARSE_GEOMETRY) qudaMemset(ghost[i + 4], 0, nbytes);
+        }
+      } else {
+        if (create != QUDA_ZERO_FIELD_CREATE) zeroPad();
+      }
+    }
 
     setTuningString();
   }
 
-  GaugeField::~GaugeField() {
+  GaugeField::~GaugeField() { }
 
+  void GaugeField::zeroPad()
+  {
+    if (!isNative()) return;
+    size_t pad_bytes = (stride - volumeCB) * precision * order;
+    int Npad = (geometry * (reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : nColor * nColor * 2)) / order;
+
+    size_t pitch = stride * order * precision;
+    if (pad_bytes) {
+      for (int parity = 0; parity < 2; parity++) {
+        qudaMemset2D(gauge, parity * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad);
+      }
+    }
+#if 0
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      for (int parity = 0; parity < 2; parity++) {
+        qudaMemset2D(data<char *>() + parity * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad);
+      }
+    } else {
+      for (int parity = 0; parity < 2; parity++)
+          for (int p = 0; p < Npad; p++)
+            memset(data<char *>() + parity * (bytes / 2) + (volumeCB + p * stride) * order * precision, 0, pad_bytes);
+      }
+    }
+#endif
   }
 
   void GaugeField::setTuningString() {
@@ -194,7 +302,8 @@ namespace quda {
     staggeredPhaseApplied = false;
   }
 
-  void GaugeField::exchange(void **ghost_link, void **link_sendbuf, QudaDirection dir) const {
+  void GaugeField::exchange(void **ghost_link, void **link_sendbuf, QudaDirection dir) const
+  {
     MsgHandle *mh_send[4];
     MsgHandle *mh_recv[4];
     size_t bytes[4];
@@ -219,16 +328,8 @@ namespace quda {
 	  if (no_comms_fill) memcpy(ghost_link[i], link_sendbuf[i], bytes[i]);
 	}
       }
-    } else { // FIXME for CUDA field copy back to the CPU
-      for (int i=0; i<nDimComms; i++) {
-	if (comm_dim_partitioned(i)) {
-	  send[i] = pool_pinned_malloc(bytes[i]);
-	  receive[i] = pool_pinned_malloc(bytes[i]);
-          qudaMemcpy(send[i], link_sendbuf[i], bytes[i], qudaMemcpyDeviceToHost);
-        } else {
-          if (no_comms_fill) qudaMemcpy(ghost_link[i], link_sendbuf[i], bytes[i], qudaMemcpyDeviceToDevice);
-        }
-      }
+    } else {
+      errorQuda("Not supported");
     }
 
     for (int i=0; i<nDimComms; i++) {
@@ -296,7 +397,7 @@ namespace quda {
     output << "nColor = " << param.nColor << std::endl;
     output << "nFace = " << param.nFace << std::endl;
     output << "reconstruct = " << param.reconstruct << std::endl;
-    int nInternal = (param.reconstruct != QUDA_RECONSTRUCT_NO ? 
+    int nInternal = (param.reconstruct != QUDA_RECONSTRUCT_NO ?
 		     param.reconstruct : param.nColor * param.nColor * 2);
     output << "nInternal = " << nInternal << std::endl;
     output << "order = " << param.order << std::endl;
@@ -315,14 +416,10 @@ namespace quda {
 
   void GaugeField::zero()
   {
-    if (location == QUDA_CUDA_FIELD_LOCATION) {
+    if (!is_pointer_array(order)) { // pointer-array orders keep their data in gauge_array, not gauge
       qudaMemset(gauge, 0, bytes);
     } else {
-      if (order != QUDA_QDP_GAUGE_ORDER) {
-        memset(gauge, 0, bytes);
-      } else {
-        for (int g = 0; g < geometry; g++) memset(gauge_qdp[g], 0, volume * nInternal * precision);
-      }
+      for (int d = 0; d < site_dim; d++) qudaMemset(gauge_array[d], 0, volume * nInternal * precision);
     }
   }
 
@@ -450,13 +547,13 @@ namespace quda {
   void GaugeField::prefetch(QudaFieldLocation mem_space, qudaStream_t stream) const
   {
     if (location == QUDA_CUDA_FIELD_LOCATION && is_prefetch_enabled() && mem_type == QUDA_MEMORY_DEVICE) {
-      if (gauge) qudaMemPrefetchAsync(gauge, bytes, mem_space, stream);
+      if (gauge.data()) qudaMemPrefetchAsync(gauge.data(), bytes, mem_space, stream);
       if (!isNative()) {
         for (int i = 0; i < nDim; i++) {
           size_t nbytes = nFace * surface[i] * nInternal * precision;
-          if (ghost[i] && nbytes) qudaMemPrefetchAsync(ghost[i], nbytes, mem_space, stream);
-          if (ghost[i + 4] && nbytes && geometry == QUDA_COARSE_GEOMETRY)
-            qudaMemPrefetchAsync(ghost[i + 4], nbytes, mem_space, stream);
+          if (ghost[i].data() && nbytes) qudaMemPrefetchAsync(ghost[i].data(), nbytes, mem_space, stream);
+          if (ghost[i + 4].data() && nbytes && geometry == QUDA_COARSE_GEOMETRY)
+            qudaMemPrefetchAsync(ghost[i + 4].data(), nbytes, mem_space, stream);
         }
       }
     }
@@ -466,21 +563,16 @@ namespace quda {
   {
     if (backed_up) errorQuda("Gauge field already backed up");
 
-    if (location == QUDA_CUDA_FIELD_LOCATION) {
-      backup_h = new char[bytes];
-      qudaMemcpy(backup_h, gauge, bytes, qudaMemcpyDefault);
-    } else {
-      if (order == QUDA_QDP_GAUGE_ORDER) {
-        char **buffer = new char *[geometry];
-        for (int d = 0; d < geometry; d++) {
-          buffer[d] = new char[bytes / geometry];
-          memcpy(buffer[d], gauge_qdp[d], bytes / geometry);
-        }
-        backup_h = reinterpret_cast<char *>(buffer);
-      } else {
-        backup_h = new char[bytes];
-        memcpy(backup_h, gauge, bytes);
+    if (order == QUDA_QDP_GAUGE_ORDER) {
+      char **buffer = new char *[geometry];
+      for (int d = 0; d < geometry; d++) {
+        buffer[d] = new char[bytes / geometry];
+        qudaMemcpy(buffer[d], gauge_array[d].data(), bytes / geometry, qudaMemcpyDefault);
       }
+      backup_h = reinterpret_cast<char *>(buffer);
+    } else {
+      backup_h = new char[bytes];
+      qudaMemcpy(backup_h, gauge.data(), bytes, qudaMemcpyDefault);
     }
 
     backed_up = true;
@@ -490,21 +582,16 @@ namespace quda {
   {
     if (!backed_up) errorQuda("Cannot restore since not backed up");
 
-    if (location == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemcpy(gauge, backup_h, bytes, qudaMemcpyDefault);
-      delete[] backup_h;
-    } else {
-      if (order == QUDA_QDP_GAUGE_ORDER) {
-        char **buffer = reinterpret_cast<char **>(backup_h);
-        for (int d = 0; d < geometry; d++) {
-          memcpy(gauge_qdp[d], buffer[d], bytes / geometry);
-          delete[] buffer[d];
-        }
-        delete[] buffer;
-      } else {
-        memcpy(gauge, backup_h, bytes);
-        delete[] backup_h;
+    if (order == QUDA_QDP_GAUGE_ORDER) {
+      char **buffer = reinterpret_cast<char **>(backup_h);
+      for (int d = 0; d < geometry; d++) {
+        qudaMemcpy(gauge_array[d].data(), buffer[d], bytes / geometry, qudaMemcpyDefault);
+        delete[] buffer[d];
       }
+      delete[] buffer;
+    } else {
+      qudaMemcpy(gauge.data(), backup_h, bytes, qudaMemcpyDefault);
+      delete[] backup_h;
     }
     backed_up = false;
   }
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 65464e3f6e..6a8cc64e54 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -2662,7 +2662,7 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
     //but if not sufficient device memory, then the user may choose mapped type of memory
     ritzParam.mem_type = eig_param.mem_type_ritz;
   } else { //host location
-    ritzParam.mem_type = QUDA_MEMORY_PINNED;
+    ritzParam.mem_type = QUDA_MEMORY_HOST_PINNED;
   }
 
   int ritzVolume = 1;
diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp
index c657a89e3b..af108d17ff 100644
--- a/lib/lattice_field.cpp
+++ b/lib/lattice_field.cpp
@@ -183,8 +183,16 @@ namespace quda {
     // for 5-dimensional fields, we only communicate in the space-time dimensions
     nDimComms = nDim == 5 ? 4 : nDim;
 
+    // if the memory location isn't set, use field location to set it
     mem_type = param.mem_type;
-
+    if (mem_type == QUDA_MEMORY_INVALID) {
+      mem_type = location == QUDA_CUDA_FIELD_LOCATION ? QUDA_MEMORY_DEVICE : QUDA_MEMORY_HOST;
+      logQuda(QUDA_DEBUG_VERBOSE, "setting default memory type mem_type %d\n", mem_type);
+    } else if (mem_type == QUDA_MEMORY_DEVICE && location == QUDA_CPU_FIELD_LOCATION) {
+      mem_type = QUDA_MEMORY_HOST;
+    } else if (mem_type == QUDA_MEMORY_HOST && location == QUDA_CUDA_FIELD_LOCATION) {
+      mem_type = QUDA_MEMORY_DEVICE;
+    }
     setTuningString();
   }
 
diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp
index 82ec226a6d..1a486b98e8 100644
--- a/lib/targets/cuda/malloc.cpp
+++ b/lib/targets/cuda/malloc.cpp
@@ -788,4 +788,147 @@ namespace quda
 
   } // namespace pool
 
+
+  quda_ptr::quda_ptr(QudaMemoryType type, size_t size, bool pool) :
+    type(type),
+    size(size),
+    pool(pool)
+  {
+    if (pool && (type != QUDA_MEMORY_DEVICE && type != QUDA_MEMORY_HOST_PINNED && type != QUDA_MEMORY_HOST))
+      errorQuda("Memory pool not available for memory type %d", type);
+
+    if (size > 0) {
+      switch (type) {
+      case QUDA_MEMORY_DEVICE:
+        device = pool ? pool_device_malloc(size) : device_malloc(size);
+        break;
+      case QUDA_MEMORY_DEVICE_PINNED:
+        device = device_pinned_malloc(size);
+        break;
+      case QUDA_MEMORY_HOST:
+        host = safe_malloc(size);
+        break;
+      case QUDA_MEMORY_HOST_PINNED:
+        host = pool ? pool_pinned_malloc(size) : pinned_malloc(size);
+        break;
+      case QUDA_MEMORY_MAPPED:
+        host = mapped_malloc(size);
+        device = get_mapped_device_pointer(host);
+        break;
+      case QUDA_MEMORY_MANAGED:
+        host = managed_malloc(size);
+        device = host;
+        break;
+      default: errorQuda("Unknown memory type %d", type);
+      }
+    }
+  }
+
+  quda_ptr::quda_ptr(void *ptr, QudaMemoryType type) :
+    type(type)
+  {
+    switch (type) {
+    case QUDA_MEMORY_DEVICE:
+    case QUDA_MEMORY_DEVICE_PINNED:
+      device = ptr;
+      host = nullptr;
+      break;
+    case QUDA_MEMORY_HOST:
+    case QUDA_MEMORY_HOST_PINNED:
+      device = nullptr;
+      host = ptr;
+      break;
+    case QUDA_MEMORY_MANAGED:
+      device = ptr;
+      host = ptr;
+      break;
+    default: errorQuda("Unsupported memory type %d", type);
+    }
+  }
+
+  quda_ptr& quda_ptr::operator=(quda_ptr &&other)
+  {
+    if (&other == this) return *this; // self-assignment is a no-op
+    if (size > 0) errorQuda("Cannot move into a quda_ptr that already owns memory"); // would leak it otherwise
+    type = std::exchange(other.type, QUDA_MEMORY_INVALID);
+    size = std::exchange(other.size, 0); // other is left with size 0 so its destructor frees nothing
+    pool = std::exchange(other.pool, false);
+    device = std::exchange(other.device, nullptr);
+    host = std::exchange(other.host, nullptr);
+    return *this;
+  }
+
+  quda_ptr::~quda_ptr()
+  {
+    if (size > 0) { // a non-zero size marks memory obtained by the allocating constructor
+      switch (type) {
+      case QUDA_MEMORY_DEVICE:        pool ? pool_device_free(device) : device_free(device); break;
+      case QUDA_MEMORY_DEVICE_PINNED: device_pinned_free(device); break;
+      case QUDA_MEMORY_HOST:          host_free(host); break;
+      case QUDA_MEMORY_HOST_PINNED:   pool ? pool_pinned_free(host) : host_free(host); break;
+      case QUDA_MEMORY_MAPPED:        host_free(host); break;
+      case QUDA_MEMORY_MANAGED:       managed_free(host); break; // pairs with managed_malloc in the constructor
+      default: errorQuda("Unknown memory type %d", type);
+      }
+    }
+    device = nullptr;
+    host = nullptr;
+  }
+
+  bool quda_ptr::is_device() const
+  {
+    switch (type) {
+    case QUDA_MEMORY_DEVICE:
+    case QUDA_MEMORY_DEVICE_PINNED:
+    case QUDA_MEMORY_MAPPED:
+    case QUDA_MEMORY_MANAGED:
+      return true;
+    default: return false;
+    }
+  }
+
+  bool quda_ptr::is_host() const
+  {
+    switch (type) {
+    case QUDA_MEMORY_HOST:
+    case QUDA_MEMORY_HOST_PINNED:
+    case QUDA_MEMORY_MANAGED:
+      return true;
+    default: return false;
+    }
+  }
+
+  void *quda_ptr::data() const
+  {
+    void *ptr = nullptr;
+
+    switch (type) {
+    case QUDA_MEMORY_DEVICE:
+    case QUDA_MEMORY_DEVICE_PINNED:
+    case QUDA_MEMORY_MAPPED:
+    case QUDA_MEMORY_MANAGED:
+      ptr = device;
+      break;
+    case QUDA_MEMORY_HOST:
+    case QUDA_MEMORY_HOST_PINNED:
+      ptr = host;
+      break;
+    default: errorQuda("Unknown memory type %d", type);
+    }
+
+    return ptr;
+  }
+
+  void *quda_ptr::data_device() const
+  {
+    if (!device) errorQuda("Device view not defined");
+    return device;
+  }
+
+  void *quda_ptr::data_host() const
+  {
+    if (!host) errorQuda("Host view not defined");
+    return host;
+  }
+
 } // namespace quda
diff --git a/lib/targets/cuda/quda_api.cpp b/lib/targets/cuda/quda_api.cpp
index 1af28417d8..856aa44e2d 100644
--- a/lib/targets/cuda/quda_api.cpp
+++ b/lib/targets/cuda/quda_api.cpp
@@ -376,6 +376,16 @@ namespace quda
     QudaMem set(ptr, value, count, device::get_default_stream(), false, func, file, line);
   }
 
+  void qudaMemset_(quda_ptr &ptr, int value, size_t count, const char *func, const char *file, const char *line)
+  {
+    if (count == 0) return;
+    if (ptr.is_device()) {
+      QudaMem set(ptr.data(), value, count, device::get_default_stream(), false, func, file, line);
+    } else {
+      memset(ptr.data(), value, count);
+    }
+  }
+
   void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
                         const char *file, const char *line)
   {
@@ -390,6 +400,17 @@ namespace quda
     set_runtime_error(error, __func__, func, file, line);
   }
 
+  void qudaMemset2D_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height, const char *func,
+                     const char *file, const char *line)
+  {
+    if (ptr.is_device()) {
+      cudaError_t error = cudaMemset2D(static_cast<char*>(ptr.data()) + offset, pitch, value, width, height);
+      set_runtime_error(error, __func__, func, file, line);
+    } else {
+      for (auto i = 0u; i < height; i++) memset(static_cast<char*>(ptr.data()) + offset + i * pitch, value, width);
+    }
+  }
+
   void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream,
                           const char *func, const char *file, const char *line)
   {
diff --git a/tests/covdev_test.cpp b/tests/covdev_test.cpp
index 50473151a1..a59f26bf28 100644
--- a/tests/covdev_test.cpp
+++ b/tests/covdev_test.cpp
@@ -34,8 +34,6 @@ std::unique_ptr<ColorSpinorField> tmp;
 
 void *links[4];
 
-void **ghostLink;
-
 QudaParity parity = QUDA_EVEN_PARITY;
 
 GaugeCovDev *dirac;
@@ -97,7 +95,6 @@ void init(int argc, char **argv)
   GaugeFieldParam cpuParam(gauge_param, links);
   cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
   cpuLink = new cpuGaugeField(cpuParam);
-  ghostLink = cpuLink->Ghost();
 
   printfQuda("Links sending...");
   loadGaugeQuda(links, &gauge_param);
@@ -166,9 +163,9 @@ void covdevRef(int mu)
   // compare to dslash reference implementation
   printfQuda("Calculating reference implementation...");
 #ifdef MULTI_GPU
-  mat_mg4dir(*spinorRef, links, ghostLink, *spinor, dagger, mu, inv_param.cpu_prec, gauge_param.cpu_prec);
+  mat_mg4dir(*spinorRef, *cpuLink, *spinor, dagger, mu, inv_param.cpu_prec, gauge_param.cpu_prec);
 #else
-  mat(spinorRef->V(), links, spinor->V(), dagger, mu, inv_param.cpu_prec, gauge_param.cpu_prec);
+  mat(spinorRef->V(), *cpuLink, spinor->V(), dagger, mu, inv_param.cpu_prec, gauge_param.cpu_prec);
 #endif
   printfQuda("done.\n");
 }
diff --git a/tests/gauge_force_test.cpp b/tests/gauge_force_test.cpp
index 60dfa94cd0..64ba5c1048 100644
--- a/tests/gauge_force_test.cpp
+++ b/tests/gauge_force_test.cpp
@@ -111,12 +111,13 @@ void gauge_force_test(bool compute_force = true)
   }
 
   quda::GaugeFieldParam param(gauge_param);
+  param.location = QUDA_CPU_FIELD_LOCATION;
   param.create = QUDA_NULL_FIELD_CREATE;
   param.order = QUDA_QDP_GAUGE_ORDER;
   auto U_qdp = new quda::cpuGaugeField(param);
 
   // fills the gauge field with random numbers
-  createSiteLinkCPU(U_qdp->data<void *const *>(), gauge_param.cpu_prec, 0);
+  createSiteLinkCPU(*U_qdp, gauge_param.cpu_prec, 0);
 
   param.order = QUDA_MILC_GAUGE_ORDER;
   auto U_milc = new quda::cpuGaugeField(param);
@@ -140,13 +141,17 @@ void gauge_force_test(bool compute_force = true)
   }
   void *mom = nullptr;
   void *sitelink = nullptr;
+  void *sitelink_array[QUDA_MAX_DIM];
+  void *mom_array[QUDA_MAX_DIM];
 
   if (gauge_order == QUDA_MILC_GAUGE_ORDER) {
     sitelink = U_milc->data();
     mom = Mom_milc->data();
   } else if (gauge_order == QUDA_QDP_GAUGE_ORDER) {
-    sitelink = U_qdp->data();
-    mom = Mom_qdp->data();
+    for (int d = 0; d < 4; d++) sitelink_array[d] = U_qdp->data(d);
+    sitelink = reinterpret_cast<void*>(sitelink_array);
+    for (int d = 0; d < 4; d++) mom_array[d] = Mom_qdp->data(d);
+    mom = reinterpret_cast<void*>(mom_array);
   } else {
     errorQuda("Unsupported gauge order %d", gauge_order);
   }
@@ -183,7 +188,7 @@ void gauge_force_test(bool compute_force = true)
   void *refmom = Mom_ref_milc->data();
   int *check_out = compute_force ? &force_check : &path_check;
   if (verify_results) {
-    gauge_force_reference(refmom, eb3, U_qdp->data<void *const *>(), gauge_param.cpu_prec, input_path_buf, length,
+    gauge_force_reference(refmom, eb3, *U_qdp, gauge_param.cpu_prec, input_path_buf, length,
                           loop_coeff, num_paths, compute_force);
     *check_out
       = compare_floats(Mom_milc->data(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec);
diff --git a/tests/hisq_paths_force_test.cpp b/tests/hisq_paths_force_test.cpp
index e19d874e31..58e5ba97b1 100644
--- a/tests/hisq_paths_force_test.cpp
+++ b/tests/hisq_paths_force_test.cpp
@@ -169,7 +169,7 @@ static void hisq_force_init()
   cpuGauge_ex = new cpuGaugeField(gParam_ex);
 
   if (gauge_order == QUDA_QDP_GAUGE_ORDER) {
-    createSiteLinkCPU(cpuGauge->data<void *const *>(), qudaGaugeParam.cpu_prec, 1);
+    createSiteLinkCPU(*cpuGauge, qudaGaugeParam.cpu_prec, 1);
   } else {
     errorQuda("Unsupported gauge order %d", gauge_order);
   }
diff --git a/tests/hisq_unitarize_force_test.cpp b/tests/hisq_unitarize_force_test.cpp
index f6d68a9553..7a9d19255c 100644
--- a/tests/hisq_unitarize_force_test.cpp
+++ b/tests/hisq_unitarize_force_test.cpp
@@ -26,7 +26,7 @@ quda::cpuGaugeField *cpuReference = NULL;
 static QudaGaugeParam gaugeParam;
 
 // Create a field of links that are not su3_matrices
-void createNoisyLinkCPU(void *const *field, QudaPrecision prec, int seed)
+void createNoisyLinkCPU(quda::GaugeField &field, QudaPrecision prec, int seed)
 {
   createSiteLinkCPU(field, prec, 0);
 
@@ -34,10 +34,10 @@ void createNoisyLinkCPU(void *const *field, QudaPrecision prec, int seed)
   for (int dir = 0; dir < 4; ++dir) {
     for (int i = 0; i < V * 18; ++i) {
       if (prec == QUDA_DOUBLE_PRECISION) {
-        double *ptr = ((double **)field)[dir] + i;
+        double *ptr = field.data<double*>(dir) + i;
         *ptr += (rand() - RAND_MAX / 2.0) / (20.0 * RAND_MAX);
       } else if (prec == QUDA_SINGLE_PRECISION) {
-        float *ptr = ((float **)field)[dir] + i;
+        float *ptr = field.data<float *>(dir) + i;
         *ptr += (rand() - RAND_MAX / 2.0) / (20.0 * RAND_MAX);
       }
     }
@@ -77,8 +77,8 @@ static void hisq_force_init()
   seed += quda::comm_rank();
 #endif
 
-  createNoisyLinkCPU(cpuFatLink->data<void *const *>(), gaugeParam.cpu_prec, seed);
-  createNoisyLinkCPU(cpuOprod->data<void *const *>(), gaugeParam.cpu_prec, seed + 1);
+  createNoisyLinkCPU(*cpuFatLink, gaugeParam.cpu_prec, seed);
+  createNoisyLinkCPU(*cpuOprod, gaugeParam.cpu_prec, seed + 1);
 
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.setPrecision(gaugeParam.cuda_prec, true);
@@ -142,7 +142,7 @@ TEST(hisq_force_unitarize, verify)
 
   double accuracy = prec == QUDA_DOUBLE_PRECISION ? 1e-10 : 1e-5;
   for (int dir = 0; dir < 4; ++dir) {
-    res[dir] = compare_floats(cpuReference->data<void *const *>()[dir], cpuResult->data<void *const *>()[dir],
+    res[dir] = compare_floats(cpuReference->data<void *>(dir), cpuResult->data<void *>(dir),
                               cpuReference->Volume() * gauge_site_size, accuracy, gaugeParam.cpu_prec);
 
     quda::comm_allreduce_int(res[dir]);
diff --git a/tests/host_reference/covdev_reference.cpp b/tests/host_reference/covdev_reference.cpp
index 081b19142c..a8c178af00 100644
--- a/tests/host_reference/covdev_reference.cpp
+++ b/tests/host_reference/covdev_reference.cpp
@@ -193,7 +193,7 @@ void covdevReference_mg4dir(sFloat *res, gFloat **link, gFloat **ghostLink, cons
   } // 4-d volume
 }
 
-void covdev_dslash_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int oddBit,
+void covdev_dslash_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int oddBit,
                           int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
 {
   QudaParity otherparity = QUDA_INVALID_PARITY;
@@ -208,32 +208,38 @@ void covdev_dslash_mg4dir(ColorSpinorField &out, void **link, void **ghostLink,
 
   in.exchangeGhost(otherparity, nFace, daggerBit);
 
+  void *data[4] = {link.data(0), link.data(1), link.data(2), link.data(3)};
+  void *ghostLink[4] = {link.Ghost()[0].data(), link.Ghost()[1].data(), link.Ghost()[2].data(), link.Ghost()[3].data()};
+
   if (sPrecision == QUDA_DOUBLE_PRECISION) {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      covdevReference_mg4dir((double *)out.V(), (double **)link, (double **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((double *)out.V(), reinterpret_cast<double**>(data), (double **)ghostLink, in, oddBit, daggerBit, mu);
     } else {
-      covdevReference_mg4dir((double *)out.V(), (float **)link, (float **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((double *)out.V(), reinterpret_cast<float**>(data), (float **)ghostLink, in, oddBit, daggerBit, mu);
     }
   } else {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      covdevReference_mg4dir((float *)out.V(), (double **)link, (double **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((float *)out.V(), reinterpret_cast<double**>(data), (double **)ghostLink, in, oddBit, daggerBit, mu);
     } else {
-      covdevReference_mg4dir((float *)out.V(), (float **)link, (float **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((float *)out.V(), reinterpret_cast<float**>(data), (float **)ghostLink, in, oddBit, daggerBit, mu);
     }
   }
 }
 
 template <typename sFloat, typename gFloat>
-void Mat_mg4dir(ColorSpinorField &out, gFloat **link, gFloat **ghostLink, const ColorSpinorField &in, int daggerBit,
-                int mu)
+void Mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu)
 {
+  void *data[4] = {link.data(0), link.data(1), link.data(2), link.data(3)};
+  void *ghostLink[4] = {link.Ghost()[0].data(), link.Ghost()[1].data(), link.Ghost()[2].data(), link.Ghost()[3].data()};
+
   const int nFace = 1;
   {
     auto &inEven = in.Even();
     auto &outOdd = out.Odd();
 
     inEven.exchangeGhost(QUDA_EVEN_PARITY, nFace, daggerBit);
-    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outOdd.V()), link, ghostLink, in.Even(), 1, daggerBit, mu);
+    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outOdd.V()), reinterpret_cast<gFloat**>(data),
+                           reinterpret_cast<gFloat**>(ghostLink), in.Even(), 1, daggerBit, mu);
   }
 
   {
@@ -241,29 +247,30 @@ void Mat_mg4dir(ColorSpinorField &out, gFloat **link, gFloat **ghostLink, const
     auto &outEven = out.Even();
 
     inOdd.exchangeGhost(QUDA_ODD_PARITY, nFace, daggerBit);
-    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outEven.V()), link, ghostLink, in.Odd(), 0, daggerBit, mu);
+    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outEven.V()), reinterpret_cast<gFloat**>(data),
+                           reinterpret_cast<gFloat**>(ghostLink), in.Odd(), 0, daggerBit, mu);
   }
 }
 
-void mat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int dagger_bit,
+void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit,
                 int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
 {
   if (sPrecision == QUDA_DOUBLE_PRECISION) {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      Mat_mg4dir<double, double>(out, (double **)link, (double **)ghostLink, in, dagger_bit, mu);
+      Mat_mg4dir<double, double>(out, link, in, dagger_bit, mu);
     } else {
-      Mat_mg4dir<double, float>(out, (float **)link, (float **)ghostLink, in, dagger_bit, mu);
+      Mat_mg4dir<double, float>(out, link, in, dagger_bit, mu);
     }
   } else {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      Mat_mg4dir<float, double>(out, (double **)link, (double **)ghostLink, in, dagger_bit, mu);
+      Mat_mg4dir<float, double>(out, link, in, dagger_bit, mu);
     } else {
-      Mat_mg4dir<float, float>(out, (float **)link, (float **)ghostLink, in, dagger_bit, mu);
+      Mat_mg4dir<float, float>(out, link, in, dagger_bit, mu);
     }
   }
 }
 
-void matdagmat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int dagger_bit,
+void matdagmat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit,
                       int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp,
                       QudaParity parity)
 {
@@ -279,9 +286,9 @@ void matdagmat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, cons
     errorQuda("full parity not supported");
   }
 
-  covdev_dslash_mg4dir(tmp, link, ghostLink, in, otherparity, dagger_bit, mu, sPrecision, gPrecision);
+  covdev_dslash_mg4dir(tmp, link, in, otherparity, dagger_bit, mu, sPrecision, gPrecision);
 
-  covdev_dslash_mg4dir(out, link, ghostLink, tmp, parity, dagger_bit, mu, sPrecision, gPrecision);
+  covdev_dslash_mg4dir(out, link, tmp, parity, dagger_bit, mu, sPrecision, gPrecision);
 }
 
 #endif
diff --git a/tests/host_reference/covdev_reference.h b/tests/host_reference/covdev_reference.h
index 19b1809cf0..c2045773ed 100644
--- a/tests/host_reference/covdev_reference.h
+++ b/tests/host_reference/covdev_reference.h
@@ -6,18 +6,18 @@ using namespace quda;
 
 void setDims(int *);
 
-void covdev_dslash(void *res, void **link, void *spinorField, int oddBit, int daggerBit, int mu,
+void covdev_dslash(void *res, const GaugeField &link, void *spinorField, int oddBit, int daggerBit, int mu,
                    QudaPrecision sPrecision, QudaPrecision gPrecision);
-void covdev_dslash_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int oddBit,
+void covdev_dslash_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int oddBit,
                           int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision);
 
-void mat(void *out, void **link, void *in, int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision);
+void mat(void *out, const GaugeField &link, void *in, int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision);
 
-void matdagmat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision,
+void matdagmat(void *out, const GaugeField &link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision,
                QudaPrecision gPrecision, void *tmp, QudaParity parity);
 
-void mat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int daggerBit, int mu,
+void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu,
                 QudaPrecision sPrecision, QudaPrecision gPrecision);
-void matdagmat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int dagger_bit,
+void matdagmat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit,
                       int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp,
                       QudaParity parity);
diff --git a/tests/host_reference/domain_wall_dslash_reference.cpp b/tests/host_reference/domain_wall_dslash_reference.cpp
index 46d2620ce0..5fba06fe30 100644
--- a/tests/host_reference/domain_wall_dslash_reference.cpp
+++ b/tests/host_reference/domain_wall_dslash_reference.cpp
@@ -764,7 +764,7 @@ void dw_dslash(void *out, void *const *gauge, void *in, int oddBit, int daggerBi
   GaugeFieldParam gauge_field_param(gauge_param, (void **)gauge);
   gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
   cpuGaugeField cpu(gauge_field_param);
-  void **ghostGauge = (void **)cpu.Ghost();
+  void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()};
 
   // Get spinor ghost fields
   // First wrap the input spinor into a ColorSpinorField
@@ -831,7 +831,7 @@ void dslash_4_4d(void *out, void *const *gauge, void *in, int oddBit, int dagger
   GaugeFieldParam gauge_field_param(gauge_param, (void **)gauge);
   gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
   cpuGaugeField cpu(gauge_field_param);
-  void **ghostGauge = (void **)cpu.Ghost();
+  void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()};
 
   // Get spinor ghost fields
   // First wrap the input spinor into a ColorSpinorField
@@ -1357,7 +1357,7 @@ void mdw_mdagm_local(void *out, void *const *gauge, void *in, double _Complex *k
   QudaGaugeParam padded_gauge_param(gauge_param);
   for (int d = 0; d < 4; d++) { padded_gauge_param.X[d] += 2 * R[d]; }
 
-  auto padded_gauge_p = padded_gauge->data<void *const *>();
+  void *padded_gauge_p[] = {padded_gauge->data(0), padded_gauge->data(1), padded_gauge->data(2), padded_gauge->data(3)};
 
   // Extend these global variables then restore them
   int V5_old = V5;
diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index eeb1a56bd4..907a857824 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -743,10 +743,14 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou
 }
 
 double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, double mass, void *qdp_fatlink[], void *qdp_longlink[],
-                                void **ghost_fatlink, void **ghost_longlink, QudaGaugeParam &gauge_param,
-                                QudaInvertParam &inv_param, int shift)
+                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fatlink, quda::GaugeField &longlink,
+                                QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, int shift)
 {
+  void *qdp_fatlink[] = {fatlink.data(0), fatlink.data(1), fatlink.data(2), fatlink.data(3)};
+  void *qdp_longlink[] = {longlink.data(0), longlink.data(1), longlink.data(2), longlink.data(3)};
+  void *ghost_fatlink[] = {fatlink.Ghost()[0].data(), fatlink.Ghost()[1].data(), fatlink.Ghost()[2].data(), fatlink.Ghost()[3].data()};
+  void *ghost_longlink[] = {longlink.Ghost()[0].data(), longlink.Ghost()[1].data(), longlink.Ghost()[2].data(), longlink.Ghost()[3].data()};
+
   switch (test_type) {
   case 0: // full parity solution, full parity system
   case 1: // full parity solution, solving EVEN EVEN prec system
diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h
index f124e99f24..48188d9a1e 100644
--- a/tests/host_reference/dslash_reference.h
+++ b/tests/host_reference/dslash_reference.h
@@ -2,6 +2,7 @@
 
 #include <host_utils.h>
 #include <comm_quda.h>
+#include <gauge_field.h>
 
 template <typename Float> static inline void sum(Float *dst, Float *a, Float *b, int cnt)
 {
@@ -107,9 +108,8 @@ double verifyWilsonTypeInversion(void *spinorOut, void **spinorOutMulti, void *s
                                  void *clover_inv);
 
 double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, double mass, void *qdp_fatlink[], void *qdp_longlink[],
-                                void **ghost_fatlink, void **ghost_longlink, QudaGaugeParam &gauge_param,
-                                QudaInvertParam &inv_param, int shift);
+                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fatlink, quda::GaugeField &longlink,
+                                QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, int shift);
 
 // i represents a "half index" into an even or odd "half lattice".
 // when oddBit={0,1} the half lattice is {even,odd}.
diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp
index ffe8cc4494..4d12185981 100644
--- a/tests/host_reference/gauge_force_reference.cpp
+++ b/tests/host_reference/gauge_force_reference.cpp
@@ -405,9 +405,11 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *s
   host_free(staple);
 }
 
-void gauge_force_reference(void *refMom, double eb3, void *const *const sitelink, QudaPrecision prec, int ***path_dir,
+void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, QudaPrecision prec, int ***path_dir,
                            int *length, void *loop_coeff, int num_paths, bool compute_force)
 {
+  void *sitelink[] = {u.data(0), u.data(1), u.data(2), u.data(3)};
+
   // created extended field
   quda::lat_dim_t R;
   for (int d = 0; d < 4; d++) R[d] = 2 * quda::comm_dim_partitioned(d);
@@ -419,8 +421,9 @@ void gauge_force_reference(void *refMom, double eb3, void *const *const sitelink
   auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R);
   lattice_t lat(*qdp_ex);
 
+  void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)};
   for (int dir = 0; dir < 4; dir++) {
-    gauge_force_reference_dir(refMom, dir, eb3, sitelink, qdp_ex->data<void *const *>(), prec, path_dir[dir], length,
+    gauge_force_reference_dir(refMom, dir, eb3, sitelink, sitelink_ex, prec, path_dir[dir], length,
                               loop_coeff, num_paths, lat, compute_force);
   }
 
diff --git a/tests/host_reference/gauge_force_reference.h b/tests/host_reference/gauge_force_reference.h
index 44106e5427..adaeaacdda 100644
--- a/tests/host_reference/gauge_force_reference.h
+++ b/tests/host_reference/gauge_force_reference.h
@@ -1,4 +1,6 @@
 #pragma once
 
-void gauge_force_reference(void *refMom, double eb3, void *const *sitelink, QudaPrecision prec, int ***path_dir,
+#include <gauge_field.h>
+
+void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, QudaPrecision prec, int ***path_dir,
                            int *length, void *loop_coeff, int num_paths, bool compute_force);
diff --git a/tests/host_reference/wilson_dslash_reference.cpp b/tests/host_reference/wilson_dslash_reference.cpp
index 3a766e570c..fbe5aa241d 100644
--- a/tests/host_reference/wilson_dslash_reference.cpp
+++ b/tests/host_reference/wilson_dslash_reference.cpp
@@ -191,8 +191,9 @@ void wil_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, Qu
 
   GaugeFieldParam gauge_field_param(gauge_param, gauge);
   gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
+  gauge_field_param.location = QUDA_CPU_FIELD_LOCATION;
   cpuGaugeField cpu(gauge_field_param);
-  void **ghostGauge = (void **)cpu.Ghost();
+  void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()};
 
   // Get spinor ghost fields
   // First wrap the input spinor into a ColorSpinorField
diff --git a/tests/multigrid_evolve_test.cpp b/tests/multigrid_evolve_test.cpp
index 2fd02228a0..2436ddabf7 100644
--- a/tests/multigrid_evolve_test.cpp
+++ b/tests/multigrid_evolve_test.cpp
@@ -35,13 +35,13 @@ void setReunitarizationConsts()
   setUnitarizeLinksConstants(unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error);
 }
 
-void CallUnitarizeLinks(quda::cudaGaugeField *cudaInGauge)
+void CallUnitarizeLinks(quda::GaugeField &gauge)
 {
   using namespace quda;
   int *num_failures_dev = (int *)device_malloc(sizeof(int));
   int num_failures;
   qudaMemset(num_failures_dev, 0, sizeof(int));
-  unitarizeLinks(*cudaInGauge, num_failures_dev);
+  unitarizeLinks(gauge, num_failures_dev);
 
   qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), qudaMemcpyDeviceToHost);
   if (num_failures > 0) errorQuda("Error in the unitarization\n");
@@ -219,12 +219,13 @@ int main(int argc, char **argv)
   {
     using namespace quda;
     GaugeFieldParam gParam(gauge_param);
+    gParam.location = QUDA_CUDA_FIELD_LOCATION;
     gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
     gParam.create = QUDA_NULL_FIELD_CREATE;
     gParam.link_type = gauge_param.type;
     gParam.reconstruct = gauge_param.reconstruct;
     gParam.setPrecision(gParam.Precision(), true);
-    cudaGaugeField *gauge = new cudaGaugeField(gParam);
+    cudaGaugeField gauge(gParam);
 
     int pad = 0;
     lat_dim_t y;
@@ -239,15 +240,15 @@ int main(int argc, char **argv)
     gParamEx.siteSubset = QUDA_FULL_SITE_SUBSET;
     gParamEx.t_boundary = gParam.t_boundary;
     gParamEx.nFace = 1;
-    for (int dir = 0; dir < 4; ++dir) gParamEx.r[dir] = R[dir];
-    cudaGaugeField *gaugeEx = new cudaGaugeField(gParamEx);
+    gParamEx.r = R;
+    cudaGaugeField gaugeEx(gParamEx);
 
     QudaGaugeObservableParam obs_param = newQudaGaugeObservableParam();
     obs_param.compute_plaquette = QUDA_BOOLEAN_TRUE;
     obs_param.compute_qcharge = QUDA_BOOLEAN_TRUE;
 
     // CURAND random generator initialization
-    RNG *randstates = new RNG(*gauge, 1234);
+    RNG randstates(gauge, 1234);
     int nsteps = 10;
     int nhbsteps = 1;
     int novrsteps = 1;
@@ -255,22 +256,22 @@ int main(int argc, char **argv)
     double beta_value = 6.2;
 
     if (link_recon != QUDA_RECONSTRUCT_8 && coldstart)
-      InitGaugeField(*gaugeEx);
+      InitGaugeField(gaugeEx);
     else
-      InitGaugeField(*gaugeEx, *randstates);
+      InitGaugeField(gaugeEx, randstates);
     // Reunitarization setup
     setReunitarizationConsts();
 
     // Do a series of Heatbath updates
-    Monte(*gaugeEx, *randstates, beta_value, 100 * nhbsteps, 100 * novrsteps);
+    Monte(gaugeEx, randstates, beta_value, 100 * nhbsteps, 100 * novrsteps);
 
     // Copy into regular field
-    copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+    copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
     // load the gauge field from gauge
-    gauge_param.gauge_order = gauge->Order();
+    gauge_param.gauge_order = gauge.Order();
     gauge_param.location = QUDA_CUDA_FIELD_LOCATION;
-    loadGaugeQuda(gauge->data(), &gauge_param);
+    loadGaugeQuda(gauge.data(), &gauge_param);
     gaugeObservablesQuda(&obs_param);
 
     // Demonstrate MG evolution on an evolving gauge field
@@ -311,14 +312,14 @@ int main(int argc, char **argv)
 
     for (int step = 1; step < nsteps; ++step) {
       freeGaugeQuda();
-      Monte(*gaugeEx, *randstates, beta_value, nhbsteps, novrsteps);
+      Monte(gaugeEx, randstates, beta_value, nhbsteps, novrsteps);
 
       // Reunitarize gauge links
       CallUnitarizeLinks(gaugeEx);
 
       // Copy into regular field
-      copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
-      loadGaugeQuda(gauge->data(), &gauge_param);
+      copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+      loadGaugeQuda(gauge.data(), &gauge_param);
 
       if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
         constructHostCloverField(clover, clover_inv, inv_param);
@@ -382,9 +383,9 @@ int main(int argc, char **argv)
     CallUnitarizeLinks(gaugeEx);
 
     // copy into regular field
-    copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+    copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
-    loadGaugeQuda(gauge->data(), &gauge_param);
+    loadGaugeQuda(gauge.data(), &gauge_param);
     // Recompute Gauge Observables
     gaugeObservablesQuda(&obs_param);
 
@@ -447,12 +448,8 @@ int main(int argc, char **argv)
     // free the multigrid solver
     if (inv_multigrid) destroyMultigridQuda(mg_preconditioner);
 
-    delete gauge;
-    delete gaugeEx;
     // Release all temporary memory used for data exchange between GPUs in multi-GPU mode
     PGaugeExchangeFree();
-
-    delete randstates;
   }
 
   // stop the timer
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index c6379c3342..3756eac6b8 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -71,8 +71,8 @@ struct StaggeredDslashTestWrapper {
   // In the HISQ case, we include building fat/long links in this unit test
   void *qdp_fatlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr};
   void *qdp_longlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *const *ghost_fatlink_cpu;
-  void *const *ghost_longlink_cpu;
+  void *ghost_fatlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr};  // fat-link ghost pointers, filled from cpuFat->Ghost()
+  void *ghost_longlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr}; // long-link ghost pointers, filled from cpuLong->Ghost()
 
   QudaParity parity = QUDA_EVEN_PARITY;
 
@@ -225,14 +225,14 @@ struct StaggeredDslashTestWrapper {
     GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink_cpu);
     cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
     cpuFat = new cpuGaugeField(cpuFatParam);
-    ghost_fatlink_cpu = cpuFat->Ghost();
+    for (int i = 0; i < 4; i++) ghost_fatlink_cpu[i] = cpuFat->Ghost()[i].data();
 
     if (dslash_type == QUDA_ASQTAD_DSLASH) {
       gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
       GaugeFieldParam cpuLongParam(gauge_param, milc_longlink_cpu);
       cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
       cpuLong = new cpuGaugeField(cpuLongParam);
-      ghost_longlink_cpu = cpuLong ? cpuLong->Ghost() : nullptr;
+      for (int i = 0; i < 4; i++) ghost_longlink_cpu[i] = cpuLong ? cpuLong->Ghost()[i].data() : nullptr;
     }
 #endif
 
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 12b67cec13..87c574d974 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -364,8 +364,7 @@ int main(int argc, char **argv)
 
     for (int k = 0; k < Nsrc; k++) {
       if (verify_results)
-        verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, qdp_fatlink, qdp_longlink, (void **)cpuFat->Ghost(),
-                                 (void **)cpuLong->Ghost(), gauge_param, inv_param, 0);
+        verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, *cpuFat, *cpuLong, gauge_param, inv_param, 0);
     }
     break;
 
@@ -405,8 +404,7 @@ int main(int argc, char **argv)
 
       for (int i = 0; i < multishift; i++) {
         printfQuda("%dth solution: mass=%f, ", i, masses[i]);
-        verifyStaggeredInversion(*tmp, *ref, *in[k], *qudaOutArray[i], masses[i], qdp_fatlink, qdp_longlink,
-                                 (void **)cpuFat->Ghost(), (void **)cpuLong->Ghost(), gauge_param, inv_param, i);
+        verifyStaggeredInversion(*tmp, *ref, *in[k], *qudaOutArray[i], masses[i], *cpuFat, *cpuLong, gauge_param, inv_param, i);
       }
     }
 
diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp
index 4df1882297..f6d9dd4074 100644
--- a/tests/utils/host_utils.cpp
+++ b/tests/utils/host_utils.cpp
@@ -1376,6 +1376,12 @@ void createSiteLinkCPU(void *const *link, QudaPrecision precision, int phase)
   return;
 }
 
+void createSiteLinkCPU(quda::GaugeField &u, QudaPrecision precision, int phase)
+{
+  void *link[] = {u.data(0), u.data(1), u.data(2), u.data(3)};
+  createSiteLinkCPU(link, precision, phase);
+}
+
 template <typename Float> int compareLink(Float **linkA, Float **linkB, int len)
 {
   const int fail_check = 16;
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index c6da599fcc..a804c46f80 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -155,6 +155,7 @@ int fullLatticeIndex(int dim[], int index, int oddBit);
 int getOddBit(int X);
 
 void createSiteLinkCPU(void *const *const link, QudaPrecision precision, int phase);
+void createSiteLinkCPU(quda::GaugeField &u, QudaPrecision precision, int phase);
 void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision);
 void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision,
                      QudaGaugeParam *param);
diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp
index 61f5c9ef2e..6de4e900d0 100644
--- a/tests/utils/misc.cpp
+++ b/tests/utils/misc.cpp
@@ -339,7 +339,9 @@ const char *get_memory_type_str(QudaMemoryType type)
 
   switch (type) {
   case QUDA_MEMORY_DEVICE: s = "device"; break;
-  case QUDA_MEMORY_PINNED: s = "pinned"; break;
+  case QUDA_MEMORY_DEVICE_PINNED: s = "device_pinned"; break;
+  case QUDA_MEMORY_HOST: s = "host"; break;
+  case QUDA_MEMORY_HOST_PINNED: s = "host_pinned"; break;
   case QUDA_MEMORY_MAPPED: s = "mapped"; break;
   default: fprintf(stderr, "Error: invalid memory type\n"); exit(1);
   }

From 8e0207ef56c977cc2f3ec95684bfabdd9e1961be Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 11 May 2023 09:36:46 -0700
Subject: [PATCH 04/60] Move gauge field exchange functions to GaugeField from
 cpu/cuda children

---
 include/gauge_field.h    | 212 ++++++-----------
 lib/cpu_gauge_field.cpp  | 160 -------------
 lib/cuda_gauge_field.cpp | 346 ---------------------------
 lib/gauge_field.cpp      | 488 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 559 insertions(+), 647 deletions(-)

diff --git a/include/gauge_field.h b/include/gauge_field.h
index 0d70441f3f..cfc38855f4 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -273,8 +273,64 @@ namespace quda {
     GaugeField(const GaugeFieldParam &param);
     virtual ~GaugeField();
 
-    virtual void exchangeGhost(QudaLinkDirection = QUDA_LINK_BACKWARDS) = 0;
-    virtual void injectGhost(QudaLinkDirection = QUDA_LINK_BACKWARDS) = 0;
+    /**
+       @brief Create the communication handlers and buffers
+       @param[in] R The thickness of the extended region in each dimension
+       @param[in] no_comms_fill Do local exchange to fill out the extended
+       region in non-partitioned dimensions
+       @param[in] bidir Whether to allocate communication buffers to
+       allow for simultaneous bi-directional exchange.  If false, then
+       the forwards and backwards buffers will alias (saving memory).
+    */
+    void createComms(const lat_dim_t &R, bool no_comms_fill, bool bidir = true);
+
+    /**
+       @brief Allocate the ghost buffers
+       @param[in] R The thickness of the extended region in each dimension
+       @param[in] no_comms_fill Do local exchange to fill out the extended
+       region in non-partitioned dimensions
+       @param[in] bidir Is this a bi-directional exchange - if not
+       then we alias the forwards and backwards offsets
+    */
+    void allocateGhostBuffer(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const;
+
+    /**
+       @brief Start the receive communicators
+       @param[in] dim The communication dimension
+       @param[in] dir The communication direction (0=backwards, 1=forwards)
+    */
+    void recvStart(int dim, int dir);
+
+    /**
+       @brief Start the sending communicators
+       @param[in] dim The communication dimension
+       @param[in] dir The communication direction (0=backwards, 1=forwards)
+       @param[in] stream_p Reference to the stream to post the
+       communication in (if 0, then use null stream)
+    */
+    void sendStart(int dim, int dir, const qudaStream_t &stream_p);
+
+    /**
+       @brief Wait for communication to complete
+       @param[in] dim The communication dimension
+       @param[in] dir The communication direction (0=backwards, 1=forwards)
+    */
+    void commsComplete(int dim, int dir);
+
+    /**
+       @brief Exchange the ghost and store it in the padded region
+       @param[in] link_direction Which links are we exchanging: this
+       flag only applies to bi-directional coarse-link fields
+     */
+    void exchangeGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS);
+
+    /**
+       @brief The opposite of exchangeGhost: take the ghost zone on x,
+       send to node x-1, and inject back into the field
+       @param[in] link_direction Which links are we injecting: this
+       flag only applies to bi-directional coarse-link fields
+     */
+    void injectGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS);
 
     size_t Length() const { return length; }
     int Ncolor() const { return nColor; }
@@ -323,7 +379,7 @@ namespace quda {
        @param no_comms_fill Do local exchange to fill out the extended
        region in non-partitioned dimensions
     */
-    virtual void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false) = 0;
+    void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false);
 
     /**
        @brief This routine will populate the border / halo region
@@ -334,7 +390,7 @@ namespace quda {
        @param no_comms_fill Do local exchange to fill out the extended
        region in non-partitioned dimensions
     */
-    virtual void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false) = 0;
+    void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false);
 
     void checkField(const LatticeField &) const;
 
@@ -505,91 +561,25 @@ namespace quda {
        @brief Restores the GaugeField
     */
     void restore() const;
-  };
-
-  class cudaGaugeField : public GaugeField {
-
-  public:
-    cudaGaugeField(const GaugeFieldParam &);
-
-    /**
-       @brief Exchange the ghost and store store in the padded region
-       @param[in] link_direction Which links are we exchanging: this
-       flag only applies to bi-directional coarse-link fields
-     */
-    void exchangeGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS);
-
-    /**
-       @brief The opposite of exchangeGhost: take the ghost zone on x,
-       send to node x-1, and inject back into the field
-       @param[in] link_direction Which links are we injecting: this
-       flag only applies to bi-directional coarse-link fields
-     */
-    void injectGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS);
-
-    /**
-       @brief Create the communication handlers and buffers
-       @param[in] R The thickness of the extended region in each dimension
-       @param[in] no_comms_fill Do local exchange to fill out the extended
-       region in non-partitioned dimensions
-       @param[in] bidir Whether to allocate communication buffers to
-       allow for simultaneous bi-directional exchange.  If false, then
-       the forwards and backwards buffers will alias (saving memory).
-    */
-    void createComms(const lat_dim_t &R, bool no_comms_fill, bool bidir = true);
-
-    /**
-       @brief Allocate the ghost buffers
-       @param[in] R The thickness of the extended region in each dimension
-       @param[in] no_comms_fill Do local exchange to fill out the extended
-       @param[in] bidir Is this a bi-directional exchange - if not
-       then we alias the fowards and backwards offsetss
-       region in non-partitioned dimensions
-    */
-    void allocateGhostBuffer(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const;
 
     /**
-       @brief Start the receive communicators
-       @param[in] dim The communication dimension
-       @param[in] dir The communication direction (0=backwards, 1=forwards)
-    */
-    void recvStart(int dim, int dir);
-
-    /**
-       @brief Start the sending communicators
-       @param[in] dim The communication dimension
-       @param[in] dir The communication direction (0=backwards, 1=forwards)
-       @param[in] stream_p Pointer to CUDA stream to post the
-       communication in (if 0, then use null stream)
+      @brief Copy all contents of the field to a host buffer.
+      @param[out] buffer The host buffer to copy to.
     */
-    void sendStart(int dim, int dir, const qudaStream_t &stream_p);
+    void copy_to_buffer(void *buffer) const;
 
     /**
-       @brief Wait for communication to complete
-       @param[in] dim The communication dimension
-       @param[in] dir The communication direction (0=backwards, 1=forwards)
+      @brief Copy all contents of the field from a host buffer to this field.
+      @param[in] buffer The host buffer to copy from.
     */
-    void commsComplete(int dim, int dir);
+    void copy_from_buffer(void *buffer);
+  };
 
-    /**
-       @brief This does routine will populate the border / halo region of a
-       gauge field that has been created using copyExtendedGauge.
-       @param R The thickness of the extended region in each dimension
-       @param no_comms_fill Do local exchange to fill out the extended
-       region in non-partitioned dimensions
-    */
-    void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false);
+  class cudaGaugeField : public GaugeField
+  {
 
-    /**
-       @brief This does routine will populate the border / halo region
-       of a gauge field that has been created using copyExtendedGauge.
-       Overloaded variant that will start and stop a comms profile.
-       @param R The thickness of the extended region in each dimension
-       @param profile TimeProfile intance which will record the time taken
-       @param no_comms_fill Do local exchange to fill out the extended
-       region in non-partitioned dimensions
-    */
-    void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false);
+  public:
+    cudaGaugeField(const GaugeFieldParam &);
 
     /**
      * Generic gauge field copy
@@ -624,18 +614,6 @@ namespace quda {
        @param[in] profile Time profile to record the transfer
     */
     void saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const;
-
-    /**
-      @brief Copy all contents of the field to a host buffer.
-      @param[in] the host buffer to copy to.
-    */
-    virtual void copy_to_buffer(void *buffer) const;
-
-    /**
-      @brief Copy all contents of the field from a host buffer to this field.
-      @param[in] the host buffer to copy from.
-    */
-    virtual void copy_from_buffer(void *buffer);
   };
 
   class cpuGaugeField : public GaugeField {
@@ -654,59 +632,11 @@ namespace quda {
     */
     cpuGaugeField(const GaugeFieldParam &param);
 
-    /**
-       @brief Exchange the ghost and store store in the padded region
-       @param[in] link_direction Which links are we extracting: this
-       flag only applies to bi-directional coarse-link fields
-     */
-    void exchangeGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS);
-
-    /**
-       @brief The opposite of exchangeGhost: take the ghost zone on x,
-       send to node x-1, and inject back into the field
-       @param[in] link_direction Which links are we injecting: this
-       flag only applies to bi-directional coarse-link fields
-     */
-    void injectGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS);
-
-    /**
-       @brief This does routine will populate the border / halo region of a
-       gauge field that has been created using copyExtendedGauge.
-
-       @param R The thickness of the extended region in each dimension
-       @param no_comms_fill Do local exchange to fill out the extended
-       region in non-partitioned dimenions
-    */
-    void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false);
-
-    /**
-       @brief This does routine will populate the border / halo region
-       of a gauge field that has been created using copyExtendedGauge.
-       Overloaded variant that will start and stop a comms profile.
-       @param R The thickness of the extended region in each dimension
-       @param profile TimeProfile intance which will record the time taken
-       @param no_comms_fill Do local exchange to fill out the extended
-       region in non-partitioned dimensions
-    */
-    void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false);
-
     /**
      * Generic gauge field copy
      * @param[in] src Source from which we are copying
      */
     void copy(const GaugeField &src);
-
-    /**
-      @brief Copy all contents of the field to a host buffer.
-      @param[in] the host buffer to copy to.
-    */
-    virtual void copy_to_buffer(void *buffer) const;
-
-    /**
-      @brief Copy all contents of the field from a host buffer to this field.
-      @param[in] the host buffer to copy from.
-    */
-    virtual void copy_from_buffer(void *buffer);
   };
 
   /**
diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp
index 604bf04c13..a6eb000b58 100644
--- a/lib/cpu_gauge_field.cpp
+++ b/lib/cpu_gauge_field.cpp
@@ -20,134 +20,6 @@ namespace quda {
     if (param.compute_fat_link_max) fat_link_max = this->abs_max();
   }
 
-  // This does the exchange of the gauge field ghost zone and places it
-  // into the ghost array.
-  void cpuGaugeField::exchangeGhost(QudaLinkDirection link_direction) {
-    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY)
-      errorQuda("Cannot exchange for %d geometry gauge field", geometry);
-
-    if ( (link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS) && geometry != QUDA_COARSE_GEOMETRY)
-      errorQuda("Cannot request exchange of forward links on non-coarse geometry");
-
-    void *send[2 * QUDA_MAX_DIM];
-    for (int d=0; d<nDim; d++) {
-      send[d] = safe_malloc(nFace * surface[d] * nInternal * precision);
-      if (geometry == QUDA_COARSE_GEOMETRY) send[d+4] = safe_malloc(nFace * surface[d] * nInternal * precision);
-    }
-
-    void *ghost_[2 * QUDA_MAX_DIM];
-    for (auto i = 0; i < geometry; i++) ghost_[i] = ghost[i].data();
-
-    // get the links into contiguous buffers
-    if (link_direction == QUDA_LINK_BACKWARDS || link_direction == QUDA_LINK_BIDIRECTIONAL) {
-      extractGaugeGhost(*this, send, true);
-
-      // communicate between nodes
-      exchange(ghost_, send, QUDA_FORWARDS);
-    }
-
-    // repeat if requested and links are bi-directional
-    if (link_direction == QUDA_LINK_FORWARDS || link_direction == QUDA_LINK_BIDIRECTIONAL) {
-      extractGaugeGhost(*this, send, true, nDim);
-      exchange(ghost_+nDim, send+nDim, QUDA_FORWARDS);
-    }
-
-    for (int d = 0; d < geometry; d++) host_free(send[d]);
-  }
-
-  // This does the opposite of exchangeGhost and sends back the ghost
-  // zone to the node from which it came and injects it back into the
-  // field
-  void cpuGaugeField::injectGhost(QudaLinkDirection link_direction) {
-    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY)
-      errorQuda("Cannot exchange for %d geometry gauge field", geometry);
-
-    if (link_direction != QUDA_LINK_BACKWARDS)
-      errorQuda("link_direction = %d not supported", link_direction);
-
-    void *recv[QUDA_MAX_DIM];
-    for (int d=0; d<nDim; d++) recv[d] = safe_malloc(nFace*surface[d]*nInternal*precision);
-
-    void *ghost_[] = {ghost[0].data(), ghost[1].data(), ghost[2].data(), ghost[3].data(),
-                      ghost[4].data(), ghost[5].data(), ghost[6].data(), ghost[7].data()};
-
-    // communicate between nodes
-    exchange(recv, ghost_, QUDA_BACKWARDS);
-
-    // get the links into contiguous buffers
-    extractGaugeGhost(*this, recv, false);
-
-    for (int d = 0; d < QUDA_MAX_DIM; d++) host_free(recv[d]);
-  }
-
-  void cpuGaugeField::exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill)
-  {
-
-    void *send[QUDA_MAX_DIM];
-    void *recv[QUDA_MAX_DIM];
-    size_t bytes[QUDA_MAX_DIM];
-    // store both parities and directions in each
-    for (int d=0; d<nDim; d++) {
-      if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d])) ) continue;
-      bytes[d] = surface[d] * R[d] * geometry * nInternal * precision;
-      send[d] = safe_malloc(2 * bytes[d]);
-      recv[d] = safe_malloc(2 * bytes[d]);
-    }
-
-    for (int d=0; d<nDim; d++) {
-      if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d])) ) continue;
-      //extract into a contiguous buffer
-      extractExtendedGaugeGhost(*this, d, R, send, true);
-
-      if (comm_dim_partitioned(d)) {
-	// do the exchange
-	MsgHandle *mh_recv_back;
-	MsgHandle *mh_recv_fwd;
-	MsgHandle *mh_send_fwd;
-	MsgHandle *mh_send_back;
-
-	mh_recv_back = comm_declare_receive_relative(recv[d], d, -1, bytes[d]);
-	mh_recv_fwd  = comm_declare_receive_relative(((char*)recv[d])+bytes[d], d, +1, bytes[d]);
-	mh_send_back = comm_declare_send_relative(send[d], d, -1, bytes[d]);
-	mh_send_fwd  = comm_declare_send_relative(((char*)send[d])+bytes[d], d, +1, bytes[d]);
-
-	comm_start(mh_recv_back);
-	comm_start(mh_recv_fwd);
-	comm_start(mh_send_fwd);
-	comm_start(mh_send_back);
-
-	comm_wait(mh_send_fwd);
-	comm_wait(mh_send_back);
-	comm_wait(mh_recv_back);
-	comm_wait(mh_recv_fwd);
-
-	comm_free(mh_send_fwd);
-	comm_free(mh_send_back);
-	comm_free(mh_recv_back);
-	comm_free(mh_recv_fwd);
-      } else {
-	memcpy(static_cast<char*>(recv[d])+bytes[d], send[d], bytes[d]);
-	memcpy(recv[d], static_cast<char*>(send[d])+bytes[d], bytes[d]);
-      }
-
-      // inject back into the gauge field
-      extractExtendedGaugeGhost(*this, d, R, recv, false);
-    }
-
-    for (int d=0; d<nDim; d++) {
-      if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d])) ) continue;
-      host_free(send[d]);
-      host_free(recv[d]);
-    }
-  }
-
-  void cpuGaugeField::exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill)
-  {
-    profile.TPSTART(QUDA_PROFILE_COMMS);
-    exchangeExtendedGhost(R, no_comms_fill);
-    profile.TPSTOP(QUDA_PROFILE_COMMS);
-  }
-
   // defined in cudaGaugeField
   void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry);
   void **create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry);
@@ -224,36 +96,4 @@ namespace quda {
     }
   }
 
-  void cpuGaugeField::copy_to_buffer(void *buffer) const
-  {
-    if (is_pointer_array(order)) {
-      char *dst_buffer = reinterpret_cast<char *>(buffer);
-      for (int d = 0; d < site_dim; d++) {
-        std::memcpy(&dst_buffer[d * bytes / site_dim], gauge_array[d].data(), bytes / site_dim);
-      }
-    } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
-               || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
-               || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
-      std::memcpy(buffer, data(), Bytes());
-    } else {
-      errorQuda("Unsupported order = %d", Order());
-    }
-  }
-
-  void cpuGaugeField::copy_from_buffer(void *buffer)
-  {
-    if (is_pointer_array(order)) {
-      const char *dst_buffer = reinterpret_cast<const char *>(buffer);
-      for (int d = 0; d < site_dim; d++) {
-        std::memcpy(gauge_array[d].data(), &dst_buffer[d * bytes / site_dim], bytes / site_dim);
-      }
-    } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
-               || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
-               || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
-      std::memcpy(data(), buffer, Bytes());
-    } else {
-      errorQuda("Unsupported order = %d", Order());
-    }
-  }
-
 } // namespace quda
diff --git a/lib/cuda_gauge_field.cpp b/lib/cuda_gauge_field.cpp
index ae21213770..3e5e60acdc 100644
--- a/lib/cuda_gauge_field.cpp
+++ b/lib/cuda_gauge_field.cpp
@@ -16,345 +16,6 @@ namespace quda {
       }
   }
 
-  // This does the exchange of the forwards boundary gauge field ghost zone and places
-  // it into the ghost array of the next node
-  void cudaGaugeField::exchangeGhost(QudaLinkDirection link_direction) {
-
-    if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange);
-    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) errorQuda("Invalid geometry=%d", geometry);
-    if ( (link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS) && geometry != QUDA_COARSE_GEOMETRY)
-      errorQuda("Cannot request exchange of forward links on non-coarse geometry");
-    if (nFace == 0) errorQuda("nFace = 0");
-
-    const int dir = 1; // sending forwards only
-    const lat_dim_t R = {nFace, nFace, nFace, nFace};
-    const bool no_comms_fill = true; // dslash kernels presently require this
-    const bool bidir = false; // communication is only ever done in one direction at once
-    createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge
-
-    // loop over backwards and forwards links
-    const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS};
-    for (int link_dir = 0; link_dir<2; link_dir++) {
-      if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;
-
-      void *send_d[2*QUDA_MAX_DIM] = { };
-      void *recv_d[2*QUDA_MAX_DIM] = { };
-
-      size_t offset = 0;
-      for (int d=0; d<nDim; d++) {
-        recv_d[d] = static_cast<char *>(ghost_recv_buffer_d[bufferIndex]) + offset;
-        if (bidir) offset += ghost_face_bytes_aligned[d];
-        send_d[d] = static_cast<char *>(ghost_send_buffer_d[bufferIndex]) + offset;
-        offset += ghost_face_bytes_aligned[d];
-      }
-
-      extractGaugeGhost(*this, send_d, true, link_dir*nDim); // get the links into contiguous buffers
-      qudaDeviceSynchronize(); // synchronize before issuing mem copies in different streams - could replace with event post and wait
-
-      // issue receive preposts and host-to-device copies if needed
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim)) continue;
-	recvStart(dim, dir); // prepost the receive
-	if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
-          qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
-                          ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(2 * dim + dir));
-        }
-      }
-
-      // if gdr enabled then synchronize
-      if (comm_gdr_enabled()) qudaDeviceSynchronize();
-
-      // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim)) continue;
-        if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled())
-          qudaStreamSynchronize(device::get_stream(2 * dim + dir));
-        sendStart(dim, dir, device::get_stream(2 * dim + dir)); // start sending
-      }
-
-      // complete communication and issue host-to-device copies if needed
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim)) continue;
-	commsComplete(dim, dir);
-	if (!comm_peer2peer_enabled(1-dir,dim) && !comm_gdr_enabled()) {
-          qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1 - dir], from_face_dim_dir_h[bufferIndex][dim][1 - dir],
-                          ghost_face_bytes[dim], qudaMemcpyHostToDevice, device::get_stream(2 * dim + dir));
-        }
-      }
-
-      qudaDeviceSynchronize(); // synchronize before issuing kernels / copies in default stream - could replace with event post and wait
-
-      // fill in the halos for non-partitioned dimensions
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim) && no_comms_fill) {
-          qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
-        }
-      }
-
-      if (isNative()) {
-	copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, 0, recv_d, 1 + 2*link_dir); // 1, 3
-      } else {
-	// copy from receive buffer into ghost array
-	for (int dim=0; dim<nDim; dim++)
-          qudaMemcpy(ghost[dim + link_dir * nDim].data(), recv_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
-      }
-
-      bufferIndex = 1-bufferIndex;
-    } // link_dir
-
-    qudaDeviceSynchronize();
-  }
-
-  // This does the opposite of exchangeGhost and sends back the ghost
-  // zone to the node from which it came and injects it back into the
-  // field
-  void cudaGaugeField::injectGhost(QudaLinkDirection link_direction)
-  {
-    if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange);
-    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) errorQuda("Invalid geometry=%d", geometry);
-    if (link_direction != QUDA_LINK_BACKWARDS) errorQuda("Invalid link_direction = %d", link_direction);
-    if (nFace == 0) errorQuda("nFace = 0");
-
-    const int dir = 0; // sending backwards only
-    const lat_dim_t R = {nFace, nFace, nFace, nFace};
-    const bool no_comms_fill = false; // injection never does no_comms_fill
-    const bool bidir = false; // communication is only ever done in one direction at once
-    createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge
-
-    // loop over backwards and forwards links (forwards links never sent but leave here just in case)
-    const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS};
-    for (int link_dir = 0; link_dir<2; link_dir++) {
-      if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;
-
-      void *send_d[2*QUDA_MAX_DIM] = { };
-      void *recv_d[2*QUDA_MAX_DIM] = { };
-
-      size_t offset = 0;
-      for (int d=0; d<nDim; d++) {
-	// send backwards is first half of each ghost_send_buffer
-        send_d[d] = static_cast<char *>(ghost_send_buffer_d[bufferIndex]) + offset;
-        if (bidir) offset += ghost_face_bytes_aligned[d];
-        // receive from forwards is the second half of each ghost_recv_buffer
-        recv_d[d] = static_cast<char *>(ghost_recv_buffer_d[bufferIndex]) + offset;
-        offset += ghost_face_bytes_aligned[d];
-      }
-
-      if (isNative()) { // copy from padded region in gauge field into send buffer
-	copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, send_d, 0, 1 + 2*link_dir);
-      } else { // copy from receive buffer into ghost array
-        for (int dim = 0; dim < nDim; dim++)
-          qudaMemcpy(send_d[dim], ghost[dim + link_dir * nDim].data(), ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
-      }
-      qudaDeviceSynchronize(); // need to synchronize before issueing copies in different streams - could replace with event post and wait
-
-      // issue receive preposts and host-to-device copies if needed
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim)) continue;
-	recvStart(dim, dir); // prepost the receive
-	if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
-          qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
-                          ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(2 * dim + dir));
-        }
-      }
-
-      // if gdr enabled then synchronize
-      if (comm_gdr_enabled()) qudaDeviceSynchronize();
-
-      // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim)) continue;
-        if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled())
-          qudaStreamSynchronize(device::get_stream(2 * dim + dir));
-        sendStart(dim, dir, device::get_stream(2 * dim + dir)); // start sending
-      }
-
-      // complete communication and issue host-to-device copies if needed
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim)) continue;
-	commsComplete(dim, dir);
-	if (!comm_peer2peer_enabled(1-dir,dim) && !comm_gdr_enabled()) {
-          qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1 - dir], from_face_dim_dir_h[bufferIndex][dim][1 - dir],
-                          ghost_face_bytes[dim], qudaMemcpyHostToDevice, device::get_stream(2 * dim + dir));
-        }
-      }
-
-      qudaDeviceSynchronize(); // synchronize before issuing kernel / copies in default stream - could replace with event post and wait
-
-      // fill in the halos for non-partitioned dimensions
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim) && no_comms_fill) {
-          qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
-        }
-      }
-
-      // get the links into contiguous buffers
-      extractGaugeGhost(*this, recv_d, false, link_dir*nDim);
-
-      bufferIndex = 1-bufferIndex;
-    } // link_dir
-
-    qudaDeviceSynchronize();
-  }
-
-  void cudaGaugeField::allocateGhostBuffer(const lat_dim_t &R, bool no_comms_fill, bool bidir) const
-  {
-    createGhostZone(R, no_comms_fill, bidir);
-    LatticeField::allocateGhostBuffer(ghost_bytes);
-  }
-
-  void cudaGaugeField::createComms(const lat_dim_t &R, bool no_comms_fill, bool bidir)
-  {
-    allocateGhostBuffer(R, no_comms_fill, bidir); // allocate the ghost buffer if not yet allocated
-
-    // ascertain if this instance needs it comms buffers to be updated
-    bool comms_reset = ghost_field_reset || // FIXME add send buffer check
-      (my_face_h[0] != ghost_pinned_send_buffer_h[0]) || (my_face_h[1] != ghost_pinned_send_buffer_h[1]) ||
-      (from_face_h[0] != ghost_pinned_recv_buffer_h[0]) || (from_face_h[1] != ghost_pinned_recv_buffer_h[1]) ||
-      ghost_bytes != ghost_bytes_old; // ghost buffer has been resized (e.g., bidir to unidir)
-
-    if (!initComms || comms_reset) LatticeField::createComms(no_comms_fill);
-
-    if (ghost_field_reset) destroyIPCComms();
-    createIPCComms();
-  }
-
-  void cudaGaugeField::recvStart(int dim, int dir)
-  {
-    if (!comm_dim_partitioned(dim)) return;
-
-    // receive from neighboring the processor
-    if (comm_peer2peer_enabled(1 - dir, dim)) {
-      comm_start(mh_recv_p2p[bufferIndex][dim][1 - dir]);
-    } else if (comm_gdr_enabled()) {
-      comm_start(mh_recv_rdma[bufferIndex][dim][1 - dir]);
-    } else {
-      comm_start(mh_recv[bufferIndex][dim][1 - dir]);
-    }
-  }
-
-  void cudaGaugeField::sendStart(int dim, int dir, const qudaStream_t &stream)
-  {
-    if (!comm_dim_partitioned(dim)) return;
-
-    if (!comm_peer2peer_enabled(dir,dim)) {
-      if (comm_gdr_enabled()) {
-        comm_start(mh_send_rdma[bufferIndex][dim][dir]);
-      } else {
-        comm_start(mh_send[bufferIndex][dim][dir]);
-      }
-    } else { // doing peer-to-peer
-
-      void *ghost_dst
-        = static_cast<char *>(ghost_remote_send_buffer_d[bufferIndex][dim][dir]) + ghost_offset[dim][(dir + 1) % 2];
-
-      qudaMemcpyP2PAsync(ghost_dst, my_face_dim_dir_d[bufferIndex][dim][dir], ghost_face_bytes[dim], stream);
-
-      // record the event
-      qudaEventRecord(ipcCopyEvent[bufferIndex][dim][dir], stream);
-      // send to the neighboring processor
-      comm_start(mh_send_p2p[bufferIndex][dim][dir]);
-    }
-  }
-
-  void cudaGaugeField::commsComplete(int dim, int dir)
-  {
-    if (!comm_dim_partitioned(dim)) return;
-
-    if (comm_peer2peer_enabled(1 - dir, dim)) {
-      comm_wait(mh_recv_p2p[bufferIndex][dim][1 - dir]);
-      qudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][dim][1 - dir]);
-    } else if (comm_gdr_enabled()) {
-      comm_wait(mh_recv_rdma[bufferIndex][dim][1 - dir]);
-    } else {
-      comm_wait(mh_recv[bufferIndex][dim][1 - dir]);
-    }
-
-    if (comm_peer2peer_enabled(dir, dim)) {
-      comm_wait(mh_send_p2p[bufferIndex][dim][dir]);
-      qudaEventSynchronize(ipcCopyEvent[bufferIndex][dim][dir]);
-    } else if (comm_gdr_enabled()) {
-      comm_wait(mh_send_rdma[bufferIndex][dim][dir]);
-    } else {
-      comm_wait(mh_send[bufferIndex][dim][dir]);
-    }
-  }
-
-  void cudaGaugeField::exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill)
-  {
-    const int b = bufferIndex;
-    void *send_d[QUDA_MAX_DIM], *recv_d[QUDA_MAX_DIM];
-
-    createComms(R, no_comms_fill);
-
-    size_t offset = 0;
-    for (int dim=0; dim<nDim; dim++) {
-      if ( !(comm_dim_partitioned(dim) || (no_comms_fill && R[dim])) ) continue;
-      send_d[dim] = static_cast<char*>(ghost_send_buffer_d[b]) + offset;
-      recv_d[dim] = static_cast<char*>(ghost_recv_buffer_d[b]) + offset;
-
-      // silence cuda-memcheck initcheck errors that arise since we
-      // have an oversized ghost buffer when doing the extended exchange
-      qudaMemsetAsync(send_d[dim], 0, 2 * ghost_face_bytes_aligned[dim], device::get_default_stream());
-      offset += 2 * ghost_face_bytes_aligned[dim]; // factor of two from fwd/back
-    }
-
-    for (int dim=0; dim<nDim; dim++) {
-      if ( !(comm_dim_partitioned(dim) || (no_comms_fill && R[dim])) ) continue;
-
-      //extract into a contiguous buffer
-      extractExtendedGaugeGhost(*this, dim, R, send_d, true);
-
-      if (comm_dim_partitioned(dim)) {
-        qudaDeviceSynchronize(); // synchronize before issuing mem copies in different streams - could replace with event post and wait
-
-        for (int dir=0; dir<2; dir++) recvStart(dim, dir);
-
-	for (int dir=0; dir<2; dir++) {
-	  // issue host-to-device copies if needed
-	  if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
-            qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
-                            ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(dir));
-          }
-        }
-
-        // if either direction is not peer-to-peer then we need to synchronize
-        if (!comm_peer2peer_enabled(0, dim) || !comm_peer2peer_enabled(1, dim)) qudaDeviceSynchronize();
-
-        for (int dir = 0; dir < 2; dir++) sendStart(dim, dir, device::get_stream(dir));
-        for (int dir = 0; dir < 2; dir++) commsComplete(dim, dir);
-
-        for (int dir = 0; dir < 2; dir++) {
-          // issue host-to-device copies if needed
-          if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) {
-            qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][dir], from_face_dim_dir_h[bufferIndex][dim][dir],
-                            ghost_face_bytes[dim], qudaMemcpyHostToDevice, device::get_stream(dir));
-          }
-        }
-
-      } else { // if just doing a local exchange to fill halo then need to swap faces
-        qudaMemcpy(from_face_dim_dir_d[b][dim][1], my_face_dim_dir_d[b][dim][0], ghost_face_bytes[dim],
-                   qudaMemcpyDeviceToDevice);
-        qudaMemcpy(from_face_dim_dir_d[b][dim][0], my_face_dim_dir_d[b][dim][1], ghost_face_bytes[dim],
-                   qudaMemcpyDeviceToDevice);
-      }
-
-      // inject back into the gauge field
-      // need to synchronize the copy streams before rejoining the compute stream - could replace with event post and wait
-      qudaDeviceSynchronize();
-      extractExtendedGaugeGhost(*this, dim, R, recv_d, false);
-    }
-
-    bufferIndex = 1-bufferIndex;
-    qudaDeviceSynchronize();
-  }
-
-  void cudaGaugeField::exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill)
-  {
-    profile.TPSTART(QUDA_PROFILE_COMMS);
-    exchangeExtendedGhost(R, no_comms_fill);
-    profile.TPSTOP(QUDA_PROFILE_COMMS);
-  }
-
   void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
     if (order == QUDA_QDP_GAUGE_ORDER) {
       void **buffer = new void*[geometry];
@@ -583,11 +244,4 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_D2H);
   }
 
-  void cudaGaugeField::copy_to_buffer(void *buffer) const
-  {
-    qudaMemcpy(buffer, data(), Bytes(), qudaMemcpyDeviceToHost);
-  }
-
-  void cudaGaugeField::copy_from_buffer(void *buffer) { qudaMemcpy(data(), buffer, Bytes(), qudaMemcpyHostToDevice); }
-
 } // namespace quda
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index caeddfa298..6e23168e87 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -302,6 +302,454 @@ namespace quda {
     staggeredPhaseApplied = false;
   }
 
+  void GaugeField::createComms(const lat_dim_t &R, bool no_comms_fill, bool bidir)
+  {
+    allocateGhostBuffer(R, no_comms_fill, bidir); // allocate the ghost buffer if not yet allocated
+
+    // ascertain if this instance needs its comms buffers to be updated
+    bool comms_reset = ghost_field_reset || // FIXME add send buffer check
+      (my_face_h[0] != ghost_pinned_send_buffer_h[0]) || (my_face_h[1] != ghost_pinned_send_buffer_h[1])
+      || (from_face_h[0] != ghost_pinned_recv_buffer_h[0]) || (from_face_h[1] != ghost_pinned_recv_buffer_h[1])
+      || ghost_bytes != ghost_bytes_old; // ghost buffer has been resized (e.g., bidir to unidir)
+
+    if (!initComms || comms_reset) LatticeField::createComms(no_comms_fill);
+
+    if (ghost_field_reset) destroyIPCComms();
+    createIPCComms();
+  }
+
+  void GaugeField::allocateGhostBuffer(const lat_dim_t &R, bool no_comms_fill, bool bidir) const
+  {
+    createGhostZone(R, no_comms_fill, bidir);
+    LatticeField::allocateGhostBuffer(ghost_bytes);
+  }
+
+  void GaugeField::recvStart(int dim, int dir)
+  {
+    if (!comm_dim_partitioned(dim)) return;
+
+    // receive from the neighboring processor
+    if (comm_peer2peer_enabled(1 - dir, dim)) {
+      comm_start(mh_recv_p2p[bufferIndex][dim][1 - dir]);
+    } else if (comm_gdr_enabled()) {
+      comm_start(mh_recv_rdma[bufferIndex][dim][1 - dir]);
+    } else {
+      comm_start(mh_recv[bufferIndex][dim][1 - dir]);
+    }
+  }
+
+  void GaugeField::sendStart(int dim, int dir, const qudaStream_t &stream)
+  {
+    if (!comm_dim_partitioned(dim)) return;
+
+    if (!comm_peer2peer_enabled(dir, dim)) {
+      if (comm_gdr_enabled()) {
+        comm_start(mh_send_rdma[bufferIndex][dim][dir]);
+      } else {
+        comm_start(mh_send[bufferIndex][dim][dir]);
+      }
+    } else { // doing peer-to-peer
+
+      void *ghost_dst
+        = static_cast<char *>(ghost_remote_send_buffer_d[bufferIndex][dim][dir]) + ghost_offset[dim][(dir + 1) % 2];
+
+      qudaMemcpyP2PAsync(ghost_dst, my_face_dim_dir_d[bufferIndex][dim][dir], ghost_face_bytes[dim], stream);
+
+      // record the event
+      qudaEventRecord(ipcCopyEvent[bufferIndex][dim][dir], stream);
+      // send to the neighboring processor
+      comm_start(mh_send_p2p[bufferIndex][dim][dir]);
+    }
+  }
+
+  void GaugeField::commsComplete(int dim, int dir)
+  {
+    if (!comm_dim_partitioned(dim)) return;
+
+    if (comm_peer2peer_enabled(1 - dir, dim)) {
+      comm_wait(mh_recv_p2p[bufferIndex][dim][1 - dir]);
+      qudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][dim][1 - dir]);
+    } else if (comm_gdr_enabled()) {
+      comm_wait(mh_recv_rdma[bufferIndex][dim][1 - dir]);
+    } else {
+      comm_wait(mh_recv[bufferIndex][dim][1 - dir]);
+    }
+
+    if (comm_peer2peer_enabled(dir, dim)) {
+      comm_wait(mh_send_p2p[bufferIndex][dim][dir]);
+      qudaEventSynchronize(ipcCopyEvent[bufferIndex][dim][dir]);
+    } else if (comm_gdr_enabled()) {
+      comm_wait(mh_send_rdma[bufferIndex][dim][dir]);
+    } else {
+      comm_wait(mh_send[bufferIndex][dim][dir]);
+    }
+  }
+
+  // This does the exchange of the forwards boundary gauge field ghost zone and places
+  // it into the ghost array of the next node
+  void GaugeField::exchangeGhost(QudaLinkDirection link_direction)
+  {
+    if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD)
+      errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange);
+    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY)
+      errorQuda("Invalid geometry=%d", geometry);
+    if ((link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS)
+        && geometry != QUDA_COARSE_GEOMETRY)
+      errorQuda("Cannot request exchange of forward links on non-coarse geometry");
+    if (nFace == 0) errorQuda("nFace = 0");
+
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      const int dir = 1; // sending forwards only
+      const lat_dim_t R = {nFace, nFace, nFace, nFace};
+      const bool no_comms_fill = true; // dslash kernels presently require this
+      const bool bidir = false;        // communication is only ever done in one direction at once
+      createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge
+
+      // loop over backwards and forwards links
+      const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS};
+      for (int link_dir = 0; link_dir < 2; link_dir++) {
+        if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;
+
+        void *send_d[2 * QUDA_MAX_DIM] = {};
+        void *recv_d[2 * QUDA_MAX_DIM] = {};
+
+        size_t offset = 0;
+        for (int d = 0; d < nDim; d++) {
+          recv_d[d] = static_cast<char *>(ghost_recv_buffer_d[bufferIndex]) + offset;
+          if (bidir) offset += ghost_face_bytes_aligned[d];
+          send_d[d] = static_cast<char *>(ghost_send_buffer_d[bufferIndex]) + offset;
+          offset += ghost_face_bytes_aligned[d];
+        }
+
+        extractGaugeGhost(*this, send_d, true, link_dir * nDim); // get the links into contiguous buffers
+        qudaDeviceSynchronize(); // synchronize before issuing mem copies in different streams - could replace with event post and wait
+
+        // issue receive preposts and device-to-host copies if needed
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim)) continue;
+          recvStart(dim, dir); // prepost the receive
+          if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) {
+            qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
+                            ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(2 * dim + dir));
+          }
+        }
+
+        // if gdr enabled then synchronize
+        if (comm_gdr_enabled()) qudaDeviceSynchronize();
+
+        // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim)) continue;
+          if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled())
+            qudaStreamSynchronize(device::get_stream(2 * dim + dir));
+          sendStart(dim, dir, device::get_stream(2 * dim + dir)); // start sending
+        }
+
+        // complete communication and issue host-to-device copies if needed
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim)) continue;
+          commsComplete(dim, dir);
+          if (!comm_peer2peer_enabled(1 - dir, dim) && !comm_gdr_enabled()) {
+            qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1 - dir],
+                            from_face_dim_dir_h[bufferIndex][dim][1 - dir], ghost_face_bytes[dim],
+                            qudaMemcpyHostToDevice, device::get_stream(2 * dim + dir));
+          }
+        }
+
+        qudaDeviceSynchronize(); // synchronize before issuing kernels / copies in default stream - could replace with event post and wait
+
+        // fill in the halos for non-partitioned dimensions
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim) && no_comms_fill) {
+            qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
+          }
+        }
+
+        if (isNative()) {
+          copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, 0, recv_d, 1 + 2 * link_dir); // 1, 3
+        } else {
+          // copy from receive buffer into ghost array
+          for (int dim = 0; dim < nDim; dim++)
+            qudaMemcpy(ghost[dim + link_dir * nDim].data(), recv_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
+        }
+
+        bufferIndex = 1 - bufferIndex;
+      } // link_dir
+
+      qudaDeviceSynchronize();
+    } else { // cpu field
+      void *send[2 * QUDA_MAX_DIM];
+      for (int d = 0; d < nDim; d++) {
+        send[d] = safe_malloc(nFace * surface[d] * nInternal * precision);
+        if (geometry == QUDA_COARSE_GEOMETRY) send[d + 4] = safe_malloc(nFace * surface[d] * nInternal * precision);
+      }
+
+      void *ghost_[2 * QUDA_MAX_DIM];
+      for (auto i = 0; i < geometry; i++) ghost_[i] = ghost[i].data();
+
+      // get the links into contiguous buffers
+      if (link_direction == QUDA_LINK_BACKWARDS || link_direction == QUDA_LINK_BIDIRECTIONAL) {
+        extractGaugeGhost(*this, send, true);
+
+        // communicate between nodes
+        exchange(ghost_, send, QUDA_FORWARDS);
+      }
+
+      // repeat if requested and links are bi-directional
+      if (link_direction == QUDA_LINK_FORWARDS || link_direction == QUDA_LINK_BIDIRECTIONAL) {
+        extractGaugeGhost(*this, send, true, nDim);
+        exchange(ghost_ + nDim, send + nDim, QUDA_FORWARDS);
+      }
+
+      for (int d = 0; d < geometry; d++) host_free(send[d]);
+    }
+  }
+
+  // Inverse of exchangeGhost: sends each ghost zone back to the node it came
+  // from and injects the received links into the local field.  Only
+  // QUDA_LINK_BACKWARDS is accepted; any other link_direction is an error.
+  void GaugeField::injectGhost(QudaLinkDirection link_direction)
+  {
+    if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD)
+      errorQuda("Cannot call injectGhost with ghostExchange=%d", ghostExchange);
+    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY)
+      errorQuda("Invalid geometry=%d", geometry);
+    if (link_direction != QUDA_LINK_BACKWARDS) errorQuda("Invalid link_direction = %d", link_direction);
+    if (nFace == 0) errorQuda("nFace = 0");
+
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      const int dir = 0; // sending backwards only
+      const lat_dim_t R = {nFace, nFace, nFace, nFace};
+      const bool no_comms_fill = false; // injection never does no_comms_fill
+      const bool bidir = false;         // communication is only ever done in one direction at once
+      createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge
+
+      // loop over backwards and forwards links (forwards links never sent but leave here just in case)
+      const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS};
+      for (int link_dir = 0; link_dir < 2; link_dir++) {
+        if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;
+
+        void *send_d[2 * QUDA_MAX_DIM] = {};
+        void *recv_d[2 * QUDA_MAX_DIM] = {};
+
+        size_t offset = 0;
+        for (int d = 0; d < nDim; d++) {
+          // send backwards is first half of each ghost_send_buffer
+          send_d[d] = static_cast<char *>(ghost_send_buffer_d[bufferIndex]) + offset;
+          if (bidir) offset += ghost_face_bytes_aligned[d];
+          // receive from forwards is the second half of each ghost_recv_buffer
+          recv_d[d] = static_cast<char *>(ghost_recv_buffer_d[bufferIndex]) + offset;
+          offset += ghost_face_bytes_aligned[d];
+        }
+
+        if (isNative()) { // copy from padded region in gauge field into send buffer
+          copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, send_d, 0, 1 + 2 * link_dir);
+        } else { // copy from ghost array into send buffer
+          for (int dim = 0; dim < nDim; dim++)
+            qudaMemcpy(send_d[dim], ghost[dim + link_dir * nDim].data(), ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
+        }
+        qudaDeviceSynchronize(); // need to synchronize before issuing copies in different streams - could replace with event post and wait
+
+        // issue receive preposts and device-to-host copies if needed
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim)) continue;
+          recvStart(dim, dir); // prepost the receive
+          if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) {
+            qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
+                            ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(2 * dim + dir));
+          }
+        }
+
+        // if gdr enabled then synchronize
+        if (comm_gdr_enabled()) qudaDeviceSynchronize();
+
+        // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim)) continue;
+          if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled())
+            qudaStreamSynchronize(device::get_stream(2 * dim + dir));
+          sendStart(dim, dir, device::get_stream(2 * dim + dir)); // start sending
+        }
+
+        // complete communication and issue host-to-device copies if needed
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim)) continue;
+          commsComplete(dim, dir);
+          if (!comm_peer2peer_enabled(1 - dir, dim) && !comm_gdr_enabled()) {
+            qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1 - dir],
+                            from_face_dim_dir_h[bufferIndex][dim][1 - dir], ghost_face_bytes[dim],
+                            qudaMemcpyHostToDevice, device::get_stream(2 * dim + dir));
+          }
+        }
+
+        qudaDeviceSynchronize(); // synchronize before issuing kernel / copies in default stream - could replace with event post and wait
+
+        // fill in the halos for non-partitioned dimensions
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim) && no_comms_fill) {
+            qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
+          }
+        }
+
+        // inject the received links back into the gauge field
+        extractGaugeGhost(*this, recv_d, false, link_dir * nDim);
+
+        bufferIndex = 1 - bufferIndex;
+      } // link_dir
+
+      qudaDeviceSynchronize();
+    } else {
+      void *recv[QUDA_MAX_DIM];
+      for (int d = 0; d < nDim; d++) recv[d] = safe_malloc(nFace * surface[d] * nInternal * precision);
+
+      void *ghost_[] = {ghost[0].data(), ghost[1].data(), ghost[2].data(), ghost[3].data(),
+                        ghost[4].data(), ghost[5].data(), ghost[6].data(), ghost[7].data()};
+
+      // communicate between nodes
+      exchange(recv, ghost_, QUDA_BACKWARDS);
+
+      // inject the received links back into the gauge field
+      extractGaugeGhost(*this, recv, false);
+
+      for (int d = 0; d < nDim; d++) host_free(recv[d]); // only the first nDim entries were allocated
+    }
+  }
+
+  void GaugeField::exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill)
+  {
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      const int b = bufferIndex;
+      void *send_d[QUDA_MAX_DIM], *recv_d[QUDA_MAX_DIM];
+
+      createComms(R, no_comms_fill);
+
+      size_t offset = 0;
+      for (int dim = 0; dim < nDim; dim++) {
+        if (!(comm_dim_partitioned(dim) || (no_comms_fill && R[dim]))) continue;
+        send_d[dim] = static_cast<char *>(ghost_send_buffer_d[b]) + offset;
+        recv_d[dim] = static_cast<char *>(ghost_recv_buffer_d[b]) + offset;
+
+        // silence cuda-memcheck initcheck errors that arise since we
+        // have an oversized ghost buffer when doing the extended exchange
+        qudaMemsetAsync(send_d[dim], 0, 2 * ghost_face_bytes_aligned[dim], device::get_default_stream());
+        offset += 2 * ghost_face_bytes_aligned[dim]; // factor of two from fwd/back
+      }
+
+      for (int dim = 0; dim < nDim; dim++) {
+        if (!(comm_dim_partitioned(dim) || (no_comms_fill && R[dim]))) continue;
+
+        // extract into a contiguous buffer
+        extractExtendedGaugeGhost(*this, dim, R, send_d, true);
+
+        if (comm_dim_partitioned(dim)) {
+          qudaDeviceSynchronize(); // synchronize before issuing mem copies in different streams - could replace with event post and wait
+
+          for (int dir = 0; dir < 2; dir++) recvStart(dim, dir);
+
+          for (int dir = 0; dir < 2; dir++) {
+            // issue device-to-host copies if needed
+            if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) {
+              qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
+                              ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(dir));
+            }
+          }
+
+          // if either direction is not peer-to-peer then we need to synchronize
+          if (!comm_peer2peer_enabled(0, dim) || !comm_peer2peer_enabled(1, dim)) qudaDeviceSynchronize();
+
+          for (int dir = 0; dir < 2; dir++) sendStart(dim, dir, device::get_stream(dir));
+          for (int dir = 0; dir < 2; dir++) commsComplete(dim, dir);
+
+          for (int dir = 0; dir < 2; dir++) {
+            // issue host-to-device copies if needed
+            if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) {
+              qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][dir], from_face_dim_dir_h[bufferIndex][dim][dir],
+                              ghost_face_bytes[dim], qudaMemcpyHostToDevice, device::get_stream(dir));
+            }
+          }
+
+        } else { // if just doing a local exchange to fill halo then need to swap faces
+          qudaMemcpy(from_face_dim_dir_d[b][dim][1], my_face_dim_dir_d[b][dim][0], ghost_face_bytes[dim],
+                     qudaMemcpyDeviceToDevice);
+          qudaMemcpy(from_face_dim_dir_d[b][dim][0], my_face_dim_dir_d[b][dim][1], ghost_face_bytes[dim],
+                     qudaMemcpyDeviceToDevice);
+        }
+
+        // inject back into the gauge field
+        // need to synchronize the copy streams before rejoining the compute stream - could replace with event post and wait
+        qudaDeviceSynchronize();
+        extractExtendedGaugeGhost(*this, dim, R, recv_d, false);
+      }
+
+      bufferIndex = 1 - bufferIndex;
+      qudaDeviceSynchronize();
+    } else {
+      void *send[QUDA_MAX_DIM];
+      void *recv[QUDA_MAX_DIM];
+      size_t bytes[QUDA_MAX_DIM];
+      // store both parities and directions in each
+      for (int d = 0; d < nDim; d++) {
+        if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d]))) continue;
+        bytes[d] = surface[d] * R[d] * geometry * nInternal * precision;
+        send[d] = safe_malloc(2 * bytes[d]);
+        recv[d] = safe_malloc(2 * bytes[d]);
+      }
+
+      for (int d = 0; d < nDim; d++) {
+        if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d]))) continue;
+        // extract into a contiguous buffer
+        extractExtendedGaugeGhost(*this, d, R, send, true);
+
+        if (comm_dim_partitioned(d)) {
+          // do the exchange
+          MsgHandle *mh_recv_back;
+          MsgHandle *mh_recv_fwd;
+          MsgHandle *mh_send_fwd;
+          MsgHandle *mh_send_back;
+
+          mh_recv_back = comm_declare_receive_relative(recv[d], d, -1, bytes[d]);
+          mh_recv_fwd = comm_declare_receive_relative(((char *)recv[d]) + bytes[d], d, +1, bytes[d]);
+          mh_send_back = comm_declare_send_relative(send[d], d, -1, bytes[d]);
+          mh_send_fwd = comm_declare_send_relative(((char *)send[d]) + bytes[d], d, +1, bytes[d]);
+
+          comm_start(mh_recv_back);
+          comm_start(mh_recv_fwd);
+          comm_start(mh_send_fwd);
+          comm_start(mh_send_back);
+
+          comm_wait(mh_send_fwd);
+          comm_wait(mh_send_back);
+          comm_wait(mh_recv_back);
+          comm_wait(mh_recv_fwd);
+
+          comm_free(mh_send_fwd);
+          comm_free(mh_send_back);
+          comm_free(mh_recv_back);
+          comm_free(mh_recv_fwd);
+        } else {
+          memcpy(static_cast<char *>(recv[d]) + bytes[d], send[d], bytes[d]);
+          memcpy(recv[d], static_cast<char *>(send[d]) + bytes[d], bytes[d]);
+        }
+
+        // inject back into the gauge field
+        extractExtendedGaugeGhost(*this, d, R, recv, false);
+      }
+
+      for (int d = 0; d < nDim; d++) {
+        if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d]))) continue;
+        host_free(send[d]);
+        host_free(recv[d]);
+      }
+    }
+  }
+
+  void GaugeField::exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill)
+  {
+    profile.TPSTART(QUDA_PROFILE_COMMS);
+    exchangeExtendedGhost(R, no_comms_fill);
+    profile.TPSTOP(QUDA_PROFILE_COMMS);
+  }
+
   void GaugeField::exchange(void **ghost_link, void **link_sendbuf, QudaDirection dir) const
   {
     MsgHandle *mh_send[4];
@@ -596,4 +1044,44 @@ namespace quda {
     backed_up = false;
   }
 
+  void GaugeField::copy_to_buffer(void *buffer) const
+  {
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      qudaMemcpy(buffer, data(), Bytes(), qudaMemcpyDeviceToHost);
+    } else {
+      if (is_pointer_array(order)) {
+        char *dst_buffer = reinterpret_cast<char *>(buffer);
+        for (int d = 0; d < site_dim; d++) {
+          std::memcpy(&dst_buffer[d * bytes / site_dim], gauge_array[d].data(), bytes / site_dim);
+        }
+      } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
+                 || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
+                 || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+        std::memcpy(buffer, data(), Bytes());
+      } else {
+        errorQuda("Unsupported order = %d", Order());
+      }
+    }
+  }
+
+  void GaugeField::copy_from_buffer(void *buffer)
+  {
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      qudaMemcpy(data(), buffer, Bytes(), qudaMemcpyHostToDevice);
+    } else {
+      if (is_pointer_array(order)) {
+        const char *dst_buffer = reinterpret_cast<const char *>(buffer);
+        for (int d = 0; d < site_dim; d++) {
+          std::memcpy(gauge_array[d].data(), &dst_buffer[d * bytes / site_dim], bytes / site_dim);
+        }
+      } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
+                 || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
+                 || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+        std::memcpy(data(), buffer, Bytes());
+      } else {
+        errorQuda("Unsupported order = %d", Order());
+      }
+    }
+  }
+
 } // namespace quda

From af7fb2c76f2e985df8bbdffc383e7bfb36b840a6 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 11 May 2023 13:28:46 -0700
Subject: [PATCH 05/60] Further steps towards gauge field unification
 (copy/load/save routines now unified)

---
 include/gauge_field.h       |  15 +--
 include/gauge_field_order.h |  84 +++++++-------
 lib/cpu_gauge_field.cpp     |  82 --------------
 lib/cuda_gauge_field.cpp    | 165 ++-------------------------
 lib/gauge_field.cpp         | 218 +++++++++++++++++++++++++++++++++++-
 5 files changed, 269 insertions(+), 295 deletions(-)

diff --git a/include/gauge_field.h b/include/gauge_field.h
index cfc38855f4..4dd2352484 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -496,7 +496,7 @@ namespace quda {
      * Generic gauge field copy
      * @param[in] src Source from which we are copying
      */
-    virtual void copy(const GaugeField &src) = 0;
+    void copy(const GaugeField &src);
 
     /**
        @brief Compute the L1 norm of the field
@@ -581,12 +581,6 @@ namespace quda {
   public:
     cudaGaugeField(const GaugeFieldParam &);
 
-    /**
-     * Generic gauge field copy
-     * @param[in] src Source from which we are copying
-     */
-    void copy(const GaugeField &src);
-
     /**
        @brief Download into this field from a CPU field
        @param[in] cpu The CPU field source
@@ -618,7 +612,6 @@ namespace quda {
 
   class cpuGaugeField : public GaugeField {
 
-    friend void cudaGaugeField::copy(const GaugeField &cpu);
     friend void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu);
     friend void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const;
 
@@ -631,12 +624,6 @@ namespace quda {
        extended.
     */
     cpuGaugeField(const GaugeFieldParam &param);
-
-    /**
-     * Generic gauge field copy
-     * @param[in] src Source from which we are copying
-     */
-    void copy(const GaugeField &src);
   };
 
   /**
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index c9afebde5d..451c8312c6 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -422,25 +422,26 @@ namespace quda {
     template <typename Float, int nColor, bool native_ghost, typename storeFloat>
     struct GhostAccessor<Float, nColor, QUDA_QDP_GAUGE_ORDER, native_ghost, storeFloat> {
       using wrapper = fieldorder_wrapper<Float, storeFloat>;
-      complex<storeFloat> *ghost[8];
-      unsigned int ghostOffset[8];
-      Float scale;
-      Float scale_inv;
+      complex<storeFloat> *ghost[8] = {};
+      unsigned int ghostOffset[8] = {};
+      Float scale = static_cast<Float>(1.0);
+      Float scale_inv = static_cast<Float>(1.0);
       static constexpr bool fixed = fixed_point<Float,storeFloat>();
 
-      GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) :
-        scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
+      GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr)
       {
-        for (int d=0; d<4; d++) {
-	  ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
-	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d].data()));
-	  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
+        if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) {
+          for (int d=0; d<4; d++) {
+            ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
+              static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d].data()));
+            ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
 
-	  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
-	    ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
-	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4].data()));
-	  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
-	}
+            ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
+              ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
+              static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4].data()));
+            ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
+          }
+        }
 
 	resetScale(U.Scale());
       }
@@ -543,25 +544,26 @@ namespace quda {
     template <typename Float, int nColor, bool native_ghost, typename storeFloat>
     struct GhostAccessor<Float, nColor, QUDA_MILC_GAUGE_ORDER, native_ghost, storeFloat> {
       using wrapper = fieldorder_wrapper<Float, storeFloat>;
-      complex<storeFloat> *ghost[8];
-      unsigned int ghostOffset[8];
-      Float scale;
-      Float scale_inv;
+      complex<storeFloat> *ghost[8] = {};
+      unsigned int ghostOffset[8] = {};
+      Float scale = static_cast<Float>(1.0);
+      Float scale_inv = static_cast<Float>(1.0);
       static constexpr bool fixed = fixed_point<Float,storeFloat>();
 
-      GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) :
-        scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
+      GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr)
       {
-        for (int d=0; d<4; d++) {
-	  ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
-	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d].data()));
-	  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
+        if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) {
+          for (int d=0; d<4; d++) {
+            ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
+              static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d].data()));
+            ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
 
-	  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
-	    ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
-	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4].data()));
-	  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
-	}
+            ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
+              ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
+              static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4].data()));
+            ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
+          }
+        }
 
 	resetScale(U.Scale());
       }
@@ -674,26 +676,26 @@ namespace quda {
     template <typename Float, int nColor, bool native_ghost, typename storeFloat>
     struct GhostAccessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, native_ghost, storeFloat> {
       using wrapper = fieldorder_wrapper<Float, storeFloat>;
-      complex<storeFloat> *ghost[8];
+      complex<storeFloat> *ghost[8] = {};
       const int volumeCB;
-      unsigned int ghostVolumeCB[8];
-      Float scale;
-      Float scale_inv;
+      unsigned int ghostVolumeCB[8] = {};
+      Float scale = static_cast<Float>(1.0);
+      Float scale_inv = static_cast<Float>(1.0);
       static constexpr bool fixed = fixed_point<Float,storeFloat>();
       Accessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat> accessor;
 
       GhostAccessor(const GaugeField &U, void *gauge_, void **ghost_ = 0) :
         volumeCB(U.VolumeCB()),
-        scale(static_cast<Float>(1.0)),
-        scale_inv(static_cast<Float>(1.0)),
         accessor(U, gauge_, ghost_)
       {
         if constexpr (!native_ghost) assert(ghost_ != nullptr);
-        for (int d = 0; d < 4; d++) {
-          ghost[d] = !native_ghost ? static_cast<complex<storeFloat>*>(ghost_[d]) : nullptr;
-	  ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d);
-	  ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast<complex<storeFloat>*>(ghost_[d+4]) : nullptr;
-	  ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d);
+        if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) {
+          for (int d = 0; d < 4; d++) {
+            ghost[d] = !native_ghost ? static_cast<complex<storeFloat>*>(ghost_[d]) : nullptr;
+            ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d);
+            ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast<complex<storeFloat>*>(ghost_[d+4]) : nullptr;
+            ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d);
+          }
         }
         resetScale(U.Scale());
       }
diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp
index a6eb000b58..0c340504bd 100644
--- a/lib/cpu_gauge_field.cpp
+++ b/lib/cpu_gauge_field.cpp
@@ -10,90 +10,8 @@ namespace quda {
   cpuGaugeField::cpuGaugeField(const GaugeFieldParam &param) :
     GaugeField(param)
   {
-    // exchange the boundaries if a non-trivial field
-    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD)
-      if (create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) {
-        exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
-      }
-
     // compute the fat link max now in case it is needed later (i.e., for half precision)
     if (param.compute_fat_link_max) fat_link_max = this->abs_max();
   }
 
-  // defined in cudaGaugeField
-  void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry);
-  void **create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry);
-  void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry);
-  void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry);
-
-  void cpuGaugeField::copy(const GaugeField &src) {
-    if (this == &src) return;
-
-    checkField(src);
-
-    if (link_type == QUDA_ASQTAD_FAT_LINKS) {
-      fat_link_max = src.LinkMax();
-      if (fat_link_max == 0.0 && precision < QUDA_SINGLE_PRECISION) fat_link_max = src.abs_max();
-    } else {
-      fat_link_max = 1.0;
-    }
-
-    if (typeid(src) == typeid(cudaGaugeField)) {
-
-      if (reorder_location() == QUDA_CPU_FIELD_LOCATION) {
-
-	if (!src.isNative()) errorQuda("Only native order is supported");
-	void *buffer = pool_pinned_malloc(src.Bytes());
-        qudaMemcpy(buffer, src.data(), src.Bytes(), qudaMemcpyDeviceToHost);
-
-        copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, nullptr, buffer);
-        pool_pinned_free(buffer);
-
-      } else { // else on the GPU
-
-	void *buffer = create_gauge_buffer(bytes, order, geometry);
-	size_t ghost_bytes[8];
-	int dstNinternal = reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : 2*nColor*nColor;
-	for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * dstNinternal * precision;
-	void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, order, geometry) : nullptr;
-
-	if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) {
-          copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr);
-          if (geometry == QUDA_COARSE_GEOMETRY)
-            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr,
-                             3); // forwards links if bi-directional
-        } else {
-	  copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0);
-	}
-
-	if (order == QUDA_QDP_GAUGE_ORDER) {
-	  for (int d=0; d<geometry; d++) {
-            qudaMemcpy(gauge_array[d].data(), ((void **)buffer)[d], bytes / geometry, qudaMemcpyDeviceToHost);
-          }
-	} else {
-          qudaMemcpy(gauge.data(), buffer, bytes, qudaMemcpyHostToDevice);
-        }
-
-	if (order > 4 && ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
-	  for (int d=0; d<geometry; d++)
-            qudaMemcpy(Ghost()[d].data(), ghost_buffer[d], ghost_bytes[d], qudaMemcpyDeviceToHost);
-
-        free_gauge_buffer(buffer, order, geometry);
-	if (nFace > 0) free_ghost_buffer(ghost_buffer, order, geometry);
-      }
-
-    } else if (typeid(src) == typeid(cpuGaugeField)) {
-      // copy field and ghost zone directly
-      copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION);
-    } else {
-      errorQuda("Invalid gauge field type");
-    }
-
-    // if we have copied from a source without a pad then we need to exchange
-    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD &&
-	src.GhostExchange() != QUDA_GHOST_EXCHANGE_PAD) {
-      exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
-    }
-  }
-
 } // namespace quda
diff --git a/lib/cuda_gauge_field.cpp b/lib/cuda_gauge_field.cpp
index 3e5e60acdc..39d3a28d02 100644
--- a/lib/cuda_gauge_field.cpp
+++ b/lib/cuda_gauge_field.cpp
@@ -7,168 +7,19 @@
 
 namespace quda {
 
-  cudaGaugeField::cudaGaugeField(const GaugeFieldParam &param) : GaugeField(param)
-  {
-    // exchange the boundaries if a non-trivial field
-    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD)
-      if (create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) {
-        exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
-      }
-  }
-
-  void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
-    if (order == QUDA_QDP_GAUGE_ORDER) {
-      void **buffer = new void*[geometry];
-      for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes/geometry);
-      return ((void*)buffer);
-    } else {
-      return pool_device_malloc(bytes);
-    }
-
-  }
-
-  void **create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
-
-    if (order > 4) {
-      void **buffer = new void*[geometry];
-      for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes[d]);
-      return buffer;
-    } else {
-      return 0;
-    }
-
-  }
-
-  void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
-    if (order == QUDA_QDP_GAUGE_ORDER) {
-      for (int d=0; d<geometry; d++) pool_device_free(((void**)buffer)[d]);
-      delete []((void**)buffer);
-    } else {
-      pool_device_free(buffer);
-    }
-  }
-
-  void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
-    if (order > 4) {
-      for (int d=0; d<geometry; d++) pool_device_free(buffer[d]);
-      delete []buffer;
-    }
-  }
-
-  void cudaGaugeField::copy(const GaugeField &src) {
-    if (this == &src) return;
-
-    checkField(src);
-
-    if (link_type == QUDA_ASQTAD_FAT_LINKS) {
-      fat_link_max = src.LinkMax();
-      if (fat_link_max == 0.0 && precision < QUDA_SINGLE_PRECISION) fat_link_max = src.abs_max();
-    } else {
-      fat_link_max = 1.0;
-    }
+  cudaGaugeField::cudaGaugeField(const GaugeFieldParam &param) : GaugeField(param) {}
 
-    if (typeid(src) == typeid(cudaGaugeField)) {
+  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu) { copy(cpu); }
 
-      if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
-        // copy field and ghost zone into this field
-        copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION);
-
-        if (geometry == QUDA_COARSE_GEOMETRY)
-          copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, nullptr, nullptr, nullptr, 3);
-      } else {
-        copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, nullptr);
-        if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
-      }
-
-    } else if (typeid(src) == typeid(cpuGaugeField)) {
-      if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do reorder on the CPU
-	void *buffer = pool_pinned_malloc(bytes);
-
-	if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
-	  // copy field and ghost zone into buffer
-          copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr);
-
-          if (geometry == QUDA_COARSE_GEOMETRY)
-            copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr, 0, 0, 3);
-        } else {
-          copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr);
-          if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
-	}
-
-        qudaMemcpy(gauge.data(), buffer, bytes, qudaMemcpyDefault);
-        pool_pinned_free(buffer);
-      } else { // else on the GPU
-
-        if (src.Order() == QUDA_MILC_SITE_GAUGE_ORDER ||
-            src.Order() == QUDA_BQCD_GAUGE_ORDER      ||
-            src.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
-	  // special case where we use zero-copy memory to read/write directly from application's array
-          void *src_d = get_mapped_device_pointer(src.data());
-
-          if (src.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
-            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, data(), src_d);
-          } else {
-            errorQuda("Ghost copy not supported here");
-          }
-
-        } else {
-	  void *buffer = create_gauge_buffer(src.Bytes(), src.Order(), src.Geometry());
-	  size_t ghost_bytes[8];
-	  int srcNinternal = src.Reconstruct() != QUDA_RECONSTRUCT_NO ? src.Reconstruct() : 2*nColor*nColor;
-	  for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * srcNinternal * src.Precision();
-	  void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, src.Order(), geometry) : nullptr;
-
-	  if (src.Order() == QUDA_QDP_GAUGE_ORDER) {
-	    for (int d=0; d<geometry; d++) {
-              qudaMemcpy(((void **)buffer)[d], src.data(d), src.Bytes() / geometry, qudaMemcpyDefault);
-            }
-          } else {
-            qudaMemcpy(buffer, src.data(), src.Bytes(), qudaMemcpyDefault);
-          }
-
-          if (src.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
-              && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
-            for (int d = 0; d < geometry; d++)
-              qudaMemcpy(ghost_buffer[d], src.Ghost()[d].data(), ghost_bytes[d], qudaMemcpyDefault);
-
-          if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
-            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer);
-            if (geometry == QUDA_COARSE_GEOMETRY)
-              copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer, 3);
-          } else {
-            copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer);
-            if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
-          }
-          free_gauge_buffer(buffer, src.Order(), src.Geometry());
-          if (nFace > 0) free_ghost_buffer(ghost_buffer, src.Order(), geometry);
-        }
-      } // reorder_location
-    } else {
-      errorQuda("Invalid gauge field type");
-    }
-
-    // if we have copied from a source without a pad then we need to exchange
-    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() != QUDA_GHOST_EXCHANGE_PAD)
-      exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
-
-    staggeredPhaseApplied = src.StaggeredPhaseApplied();
-    staggeredPhaseType = src.StaggeredPhase();
-
-    qudaDeviceSynchronize(); // include sync here for accurate host-device profiling
-  }
-
-  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu) {
-    copy(cpu);
-    qudaDeviceSynchronize();
-  }
-
-  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu, TimeProfile &profile) {
+  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu, TimeProfile &profile)
+  {
     profile.TPSTART(QUDA_PROFILE_H2D);
-    loadCPUField(cpu);
+    copy(cpu);
     profile.TPSTOP(QUDA_PROFILE_H2D);
   }
 
-  void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const
+  void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const { cpu.copy(*this); }
+#if 0
   {
     cpu.checkField(*this);
 
@@ -237,7 +88,7 @@ namespace quda {
 
     qudaDeviceSynchronize();
   }
-
+#endif
   void cudaGaugeField::saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const {
     profile.TPSTART(QUDA_PROFILE_D2H);
     saveCPUField(cpu);
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 6e23168e87..2f59c11760 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -205,6 +205,12 @@ namespace quda {
     }
 
     setTuningString();
+
+    // exchange the boundaries if a non-trivial field
+    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD)
+      if (create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) {
+        exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
+      }
   }
 
   GaugeField::~GaugeField() { }
@@ -840,7 +846,217 @@ namespace quda {
     }
   }
 
-  std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param) {
+  void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) { // temporary pool_device_malloc storage
+    if (order == QUDA_QDP_GAUGE_ORDER) { // QDP order: one allocation of bytes/geometry per geometry index
+      void **buffer = new void*[geometry]; // array of the per-index pointers, itself allocated with new[]
+      for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes/geometry);
+      return ((void*)buffer); // caller has to cast back to void** to reach the individual allocations
+    } else { // every other order: a single allocation of the full size
+      return pool_device_malloc(bytes);
+    }
+    // release the result with free_gauge_buffer, passing the same order and geometry
+  }
+
+  void **create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
+    // allocates one pool_device_malloc buffer of bytes[d] per geometry index d, or returns null when order <= 4
+    if (order > 4) {
+      void **buffer = new void*[geometry]; // pointer array allocated with new[]; released by free_ghost_buffer
+      for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes[d]);
+      return buffer;
+    } else {
+      return 0;
+    }
+    // NOTE(review): 4 is a raw QudaGaugeFieldOrder value; which orders lie above it is not visible here - confirm
+  }
+
+  void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) { // undoes create_gauge_buffer
+    if (order == QUDA_QDP_GAUGE_ORDER) { // QDP order: buffer is really a void** holding one allocation per geometry index
+      for (int d=0; d<geometry; d++) pool_device_free(((void**)buffer)[d]);
+      delete []((void**)buffer); // then drop the pointer array itself
+    } else { // every other order: buffer is a single pool_device_malloc allocation
+      pool_device_free(buffer);
+    }
+  }
+
+  void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) { // undoes create_ghost_buffer
+    if (order > 4) { // same condition under which create_ghost_buffer allocates; otherwise there is nothing to free
+      for (int d=0; d<geometry; d++) pool_device_free(buffer[d]);
+      delete []buffer; // then drop the pointer array itself
+    }
+  }
+
+  void GaugeField::copy(const GaugeField &src)
+  {
+    if (this == &src) return; // copying a field onto itself is a no-op
+
+    checkField(src);
+    // fat links adopt the source's link max (src.abs_max() if that is zero and precision is below single); others use 1.0
+    if (link_type == QUDA_ASQTAD_FAT_LINKS) {
+      fat_link_max = src.LinkMax();
+      if (fat_link_max == 0.0 && precision < QUDA_SINGLE_PRECISION) fat_link_max = src.abs_max();
+    } else {
+      fat_link_max = 1.0;
+    }
+
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION) { // source is at the CUDA location
+
+      if (location == QUDA_CUDA_FIELD_LOCATION) { // this field is at the CUDA location as well
+        if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
+          // copy field and ghost zone into this field
+          copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION);
+
+          if (geometry == QUDA_COARSE_GEOMETRY)
+            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, nullptr, nullptr, nullptr, 3);
+        } else {
+          copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, nullptr);
+          if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
+        }
+      } else { // this field is at the CPU location
+        if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do reorder on the CPU
+          // stage the source's raw data in a pinned host buffer that is handed to the copy routine below
+          if (!src.isNative()) errorQuda("Only native order is supported");
+          void *buffer = pool_pinned_malloc(src.Bytes());
+          qudaMemcpy(buffer, src.data(), src.Bytes(), qudaMemcpyDeviceToHost);
+
+          if (GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
+            copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, nullptr, buffer);
+          } else {
+            copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, nullptr, buffer);
+          }
+          pool_pinned_free(buffer);
+
+        } else { // else reorder on the GPU
+
+          if (order == QUDA_MILC_SITE_GAUGE_ORDER ||
+              order == QUDA_BQCD_GAUGE_ORDER      ||
+              order == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+            // special case where we use zero-copy memory to read/write directly from application's array
+            void *data_d = get_mapped_device_pointer(data());
+            if (GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
+              copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, data_d, nullptr);
+            } else {
+              errorQuda("Ghost copy not supported here");
+            }
+          } else {
+            void *buffer = create_gauge_buffer(bytes, order, geometry);
+            size_t ghost_bytes[8]; // NOTE(review): indexed by d < geometry below - confirm geometry never exceeds 8 here
+            int dstNinternal = reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : 2*nColor*nColor;
+            for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * dstNinternal * precision;
+            void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, order, geometry) : nullptr;
+            // run the copy at the CUDA location, writing into the temporary buffers created above
+            if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) {
+              copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr);
+              if (geometry == QUDA_COARSE_GEOMETRY)
+                copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr,
+                                 3); // forwards links if bi-directional
+            } else {
+              copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0);
+            }
+            // move the contents of the temporary buffer(s) into this field's own storage
+            if (order == QUDA_QDP_GAUGE_ORDER) {
+              for (int d=0; d<geometry; d++) {
+                qudaMemcpy(gauge_array[d].data(), ((void **)buffer)[d], bytes / geometry, qudaMemcpyDeviceToHost);
+              }
+            } else {
+              qudaMemcpy(gauge.data(), buffer, bytes, qudaMemcpyDeviceToHost);
+            }
+
+            if (order > 4 && ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
+              for (int d=0; d<geometry; d++)
+                qudaMemcpy(Ghost()[d].data(), ghost_buffer[d], ghost_bytes[d], qudaMemcpyDeviceToHost);
+
+            free_gauge_buffer(buffer, order, geometry);
+            if (nFace > 0) free_ghost_buffer(ghost_buffer, order, geometry);
+          } // order
+        }
+
+      }
+
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION) { // source is at the CPU location
+
+      if (location == QUDA_CPU_FIELD_LOCATION) {
+        // copy field and ghost zone directly
+        copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION);
+      } else { // this field is not at the CPU location
+        if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do reorder on the CPU
+          void *buffer = pool_pinned_malloc(bytes);
+
+          if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
+            // copy field and ghost zone into buffer
+            copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr);
+
+            if (geometry == QUDA_COARSE_GEOMETRY)
+              copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr, 0, 0, 3);
+          } else {
+            copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr);
+            if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
+          }
+
+          qudaMemcpy(gauge.data(), buffer, bytes, qudaMemcpyDefault);
+          pool_pinned_free(buffer);
+        } else { // else reorder on the GPU
+
+          if (src.Order() == QUDA_MILC_SITE_GAUGE_ORDER ||
+              src.Order() == QUDA_BQCD_GAUGE_ORDER      ||
+              src.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+            // special case where we use zero-copy memory to read/write directly from application's array
+            void *src_d = get_mapped_device_pointer(src.data());
+
+            if (src.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
+              copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, data(), src_d);
+            } else {
+              errorQuda("Ghost copy not supported here");
+            }
+
+          } else {
+            void *buffer = create_gauge_buffer(src.Bytes(), src.Order(), src.Geometry());
+            size_t ghost_bytes[8]; // NOTE(review): indexed by d < geometry below - confirm geometry never exceeds 8 here
+            int srcNinternal = src.Reconstruct() != QUDA_RECONSTRUCT_NO ? src.Reconstruct() : 2*nColor*nColor;
+            for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * srcNinternal * src.Precision();
+            void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, src.Order(), geometry) : nullptr;
+
+            if (src.Order() == QUDA_QDP_GAUGE_ORDER) {
+              for (int d=0; d<geometry; d++) {
+                qudaMemcpy(((void **)buffer)[d], src.data(d), src.Bytes() / geometry, qudaMemcpyDefault);
+              }
+            } else {
+              qudaMemcpy(buffer, src.data(), src.Bytes(), qudaMemcpyDefault);
+            }
+
+            if (src.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
+                && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
+              for (int d = 0; d < geometry; d++)
+                qudaMemcpy(ghost_buffer[d], src.Ghost()[d].data(), ghost_bytes[d], qudaMemcpyDefault);
+
+            if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
+              copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer);
+              if (geometry == QUDA_COARSE_GEOMETRY)
+                copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer, 3);
+            } else {
+              copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer);
+              if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
+            }
+            free_gauge_buffer(buffer, src.Order(), src.Geometry());
+            if (nFace > 0) free_ghost_buffer(ghost_buffer, src.Order(), geometry);
+          }
+        } // reorder_location
+      } // this location
+    } else { // source location is neither CUDA nor CPU
+      errorQuda("Invalid gauge field type");
+    }
+
+    // if we have copied from a source without a pad then we need to exchange
+    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() != QUDA_GHOST_EXCHANGE_PAD)
+      exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
+
+    staggeredPhaseApplied = src.StaggeredPhaseApplied();
+    staggeredPhaseType = src.StaggeredPhase();
+
+    qudaDeviceSynchronize(); // include sync here for accurate host-device profiling
+  }
+
+  std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param)
+  {
     output << static_cast<const LatticeFieldParam &>(param);
     output << "nColor = " << param.nColor << std::endl;
     output << "nFace = " << param.nFace << std::endl;

From 0671db1111295922c29f11be7d6dad436c4ce34e Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 11 May 2023 14:15:37 -0700
Subject: [PATCH 06/60] Removed legacy load/save CPUField routines, replaced
 with GaugeField::copy

---
 include/gauge_field.h               |  30 --------
 lib/cpu_gauge_field.cpp             |   7 +-
 lib/cuda_gauge_field.cpp            |  86 ---------------------
 lib/gauge_field.cpp                 |   3 +
 lib/interface_quda.cpp              | 113 +++++++++++++++++-----------
 lib/staggered_kd_build_xinv.cu      |   2 +-
 tests/hisq_paths_force_test.cpp     |  12 +--
 tests/hisq_unitarize_force_test.cpp |   6 +-
 tests/pack_test.cpp                 |   8 +-
 tests/unitarize_link_test.cpp       |   4 +-
 10 files changed, 90 insertions(+), 181 deletions(-)

diff --git a/include/gauge_field.h b/include/gauge_field.h
index 4dd2352484..7648ba7f9b 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -581,40 +581,10 @@ namespace quda {
   public:
     cudaGaugeField(const GaugeFieldParam &);
 
-    /**
-       @brief Download into this field from a CPU field
-       @param[in] cpu The CPU field source
-    */
-    void loadCPUField(const cpuGaugeField &cpu);
-
-    /**
-       @brief Download into this field from a CPU field.  Overloaded
-       variant that includes profiling
-       @param[in] cpu The CPU field source
-       @param[in] profile Time profile to record the transfer
-    */
-    void loadCPUField(const cpuGaugeField &cpu, TimeProfile &profile);
-
-    /**
-       @brief Upload from this field into a CPU field
-       @param[out] cpu The CPU field source
-    */
-    void saveCPUField(cpuGaugeField &cpu) const;
-
-    /**
-       @brief Upload from this field into a CPU field.  Overloaded
-       variant that includes profiling.
-       @param[out] cpu The CPU field source
-       @param[in] profile Time profile to record the transfer
-    */
-    void saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const;
   };
 
   class cpuGaugeField : public GaugeField {
 
-    friend void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu);
-    friend void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const;
-
   public:
     /**
        @brief Constructor for cpuGaugeField from a GaugeFieldParam
diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp
index 0c340504bd..8927fdb2d3 100644
--- a/lib/cpu_gauge_field.cpp
+++ b/lib/cpu_gauge_field.cpp
@@ -7,11 +7,6 @@
 
 namespace quda {
 
-  cpuGaugeField::cpuGaugeField(const GaugeFieldParam &param) :
-    GaugeField(param)
-  {
-    // compute the fat link max now in case it is needed later (i.e., for half precision)
-    if (param.compute_fat_link_max) fat_link_max = this->abs_max();
-  }
+  cpuGaugeField::cpuGaugeField(const GaugeFieldParam &param) : GaugeField(param) {}
 
 } // namespace quda
diff --git a/lib/cuda_gauge_field.cpp b/lib/cuda_gauge_field.cpp
index 39d3a28d02..e4d56bdfce 100644
--- a/lib/cuda_gauge_field.cpp
+++ b/lib/cuda_gauge_field.cpp
@@ -9,90 +9,4 @@ namespace quda {
 
   cudaGaugeField::cudaGaugeField(const GaugeFieldParam &param) : GaugeField(param) {}
 
-  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu) { copy(cpu); }
-
-  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu, TimeProfile &profile)
-  {
-    profile.TPSTART(QUDA_PROFILE_H2D);
-    copy(cpu);
-    profile.TPSTOP(QUDA_PROFILE_H2D);
-  }
-
-  void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const { cpu.copy(*this); }
-#if 0
-  {
-    cpu.checkField(*this);
-
-    if (reorder_location() == QUDA_CUDA_FIELD_LOCATION) {
-
-      if (cpu.Order() == QUDA_MILC_SITE_GAUGE_ORDER ||
-          cpu.Order() == QUDA_BQCD_GAUGE_ORDER      ||
-          cpu.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
-	// special case where we use zero-copy memory to read/write directly from application's array
-        void *cpu_d = get_mapped_device_pointer(cpu.data());
-        if (cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
-          copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, cpu_d, nullptr);
-        } else {
-          errorQuda("Ghost copy not supported here");
-        }
-      } else {
-	void *buffer = create_gauge_buffer(cpu.Bytes(), cpu.Order(), cpu.Geometry());
-
-	// Allocate space for ghost zone if required
-	size_t ghost_bytes[8];
-	int cpuNinternal = cpu.Reconstruct() != QUDA_RECONSTRUCT_NO ? cpu.Reconstruct() : 2*nColor*nColor;
-	for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * cpuNinternal * cpu.Precision();
-	void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, cpu.Order(), geometry) : nullptr;
-
-	if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
-          copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr);
-          if (geometry == QUDA_COARSE_GEOMETRY)
-            copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr, 3);
-        } else {
-          copyExtendedGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr);
-        }
-
-        if (cpu.Order() == QUDA_QDP_GAUGE_ORDER) {
-          for (int d = 0; d < geometry; d++)
-            qudaMemcpy(cpu.data(d), ((void **)buffer)[d], cpu.Bytes() / geometry, qudaMemcpyDefault);
-        } else {
-          qudaMemcpy(cpu.data(), buffer, cpu.Bytes(), qudaMemcpyDefault);
-        }
-
-        if (cpu.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
-            && cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
-          for (int d = 0; d < geometry; d++)
-            qudaMemcpy(cpu.Ghost()[d].data(), ghost_buffer[d], ghost_bytes[d], qudaMemcpyDefault);
-
-        free_gauge_buffer(buffer, cpu.Order(), cpu.Geometry());
-        if (nFace > 0) free_ghost_buffer(ghost_buffer, cpu.Order(), geometry);
-      }
-    } else if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do copy then host-side reorder
-
-      void *buffer = pool_pinned_malloc(bytes);
-      qudaMemcpy(buffer, gauge.data(), bytes, qudaMemcpyDefault);
-
-      if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
-        copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, nullptr, buffer);
-      } else {
-        copyExtendedGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, nullptr, buffer);
-      }
-      pool_pinned_free(buffer);
-
-    } else {
-      errorQuda("Invalid pack location %d", reorder_location());
-    }
-
-    cpu.staggeredPhaseApplied = staggeredPhaseApplied;
-    cpu.staggeredPhaseType = staggeredPhaseType;
-
-    qudaDeviceSynchronize();
-  }
-#endif
-  void cudaGaugeField::saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const {
-    profile.TPSTART(QUDA_PROFILE_D2H);
-    saveCPUField(cpu);
-    profile.TPSTOP(QUDA_PROFILE_D2H);
-  }
-
 } // namespace quda
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 2f59c11760..40cb1bf9b6 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -211,6 +211,9 @@ namespace quda {
       if (create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) {
         exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
       }
+
+    // compute the fat link max now in case it is needed later (i.e., for half precision)
+    if (param.compute_fat_link_max) fat_link_max = this->abs_max();
   }
 
   GaugeField::~GaugeField() { }
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index dcb9873caf..27930a3b9a 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -799,7 +799,7 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
   }
 
   profileGauge.TPSTART(QUDA_PROFILE_D2H);
-  cudaGauge->saveCPUField(cpuGauge);
+  cpuGauge.copy(*cudaGauge);
   profileGauge.TPSTOP(QUDA_PROFILE_D2H);
 
   if (param->type == QUDA_SMEARED_LINKS) { delete cudaGauge; }
@@ -3852,7 +3852,9 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
   cudaGaugeField *cudaInLink = new cudaGaugeField(gParam);
   profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
 
-  cudaInLink->loadCPUField(cpuInLink, profileFatLink);
+  profileFatLink.TPSTART(QUDA_PROFILE_H2D);
+  cudaInLink->copy(cpuInLink);
+  profileFatLink.TPSTOP(QUDA_PROFILE_H2D);
   cudaGaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink);
 
   profileFatLink.TPSTART(QUDA_PROFILE_FREE);
@@ -3874,7 +3876,9 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
     longKSLink(cudaLongLink, *cudaInLinkEx, path_coeff);
     profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-    cudaLongLink->saveCPUField(cpuLongLink, profileFatLink);
+    profileFatLink.TPSTART(QUDA_PROFILE_D2H);
+    cpuLongLink.copy(*cudaLongLink);
+    profileFatLink.TPSTOP(QUDA_PROFILE_D2H);
 
     profileFatLink.TPSTART(QUDA_PROFILE_FREE);
     delete cudaLongLink;
@@ -3889,7 +3893,11 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
   fatKSLink(cudaFatLink, *cudaInLinkEx, path_coeff);
   profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-  if (fatlink) cudaFatLink->saveCPUField(cpuFatLink, profileFatLink);
+  if (fatlink) {
+    profileFatLink.TPSTART(QUDA_PROFILE_D2H);
+    cpuFatLink.copy(*cudaFatLink);
+    profileFatLink.TPSTOP(QUDA_PROFILE_D2H);
+  }
 
   profileFatLink.TPSTART(QUDA_PROFILE_FREE);
   delete cudaInLinkEx;
@@ -3914,7 +3922,9 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
       errorQuda("Error in unitarization component of the hisq fattening: %d failures", *num_failures_h);
     profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-    cudaUnitarizedLink->saveCPUField(cpuUnitarizedLink, profileFatLink);
+    profileFatLink.TPSTART(QUDA_PROFILE_D2H);
+    cpuUnitarizedLink.copy(*cudaUnitarizedLink);
+    profileFatLink.TPSTOP(QUDA_PROFILE_D2H);
 
     profileFatLink.TPSTART(QUDA_PROFILE_FREE);
     delete cudaUnitarizedLink;
@@ -3954,7 +3964,9 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
     cudaGaugeField *cudaInLink = new cudaGaugeField(gParam);
     profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
 
-    cudaInLink->loadCPUField(cpuInLink, profileGaussianSmear);
+    profileGaussianSmear.TPSTART(QUDA_PROFILE_H2D);
+    cudaInLink->copy(cpuInLink);
+    profileGaussianSmear.TPSTOP(QUDA_PROFILE_H2D);
     //
     cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileGaussianSmear);
     //
@@ -3990,8 +4002,10 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
   gaugeSmeared->exchangeGhost();
 
   profileGaussianSmear.TPSTOP(QUDA_PROFILE_COMPUTE);
-  //
-  gaugeSmeared->saveCPUField(cpuTwoLink, profileGaussianSmear);
+
+  profileGaussianSmear.TPSTART(QUDA_PROFILE_D2H);
+  cpuTwoLink.copy(*gaugeSmeared);
+  profileGaussianSmear.TPSTOP(QUDA_PROFILE_D2H);
 
   profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE);
 
@@ -4031,7 +4045,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
     profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
 
     profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
-    cudaSiteLink->loadCPUField(*cpuSiteLink);
+    cudaSiteLink->copy(*cpuSiteLink);
     profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
 
     profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
@@ -4065,7 +4079,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
     profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
     if (!qudaGaugeParam->overwrite_mom) {
       profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
-      cudaMom->loadCPUField(*cpuMom);
+      cudaMom->copy(*cpuMom);
       profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
     }
   }
@@ -4103,7 +4117,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
 
   if (qudaGaugeParam->return_result_mom) {
     profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
-    cudaMom->saveCPUField(*cpuMom);
+    cpuMom->copy(*cudaMom);
     profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
   }
 
@@ -4166,7 +4180,7 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
     profileGaugePath.TPSTOP(QUDA_PROFILE_INIT);
 
     profileGaugePath.TPSTART(QUDA_PROFILE_H2D);
-    cudaSiteLink->loadCPUField(*cpuSiteLink);
+    cudaSiteLink->copy(*cpuSiteLink);
     profileGaugePath.TPSTOP(QUDA_PROFILE_H2D);
 
     profileGaugePath.TPSTART(QUDA_PROFILE_INIT);
@@ -4185,7 +4199,7 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
   profileGaugePath.TPSTOP(QUDA_PROFILE_INIT);
   if (!qudaGaugeParam->overwrite_gauge) {
     profileGaugePath.TPSTART(QUDA_PROFILE_H2D);
-    cudaOut->loadCPUField(*cpuOut);
+    cudaOut->copy(*cpuOut);
     profileGaugePath.TPSTOP(QUDA_PROFILE_H2D);
   }
 
@@ -4211,7 +4225,7 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
   profileGaugePath.TPSTOP(QUDA_PROFILE_COMPUTE);
 
   profileGaugePath.TPSTART(QUDA_PROFILE_D2H);
-  cudaOut->saveCPUField(*cpuOut);
+  cpuOut->copy(*cudaOut);
   profileGaugePath.TPSTOP(QUDA_PROFILE_D2H);
 
   profileGaugePath.TPSTART(QUDA_PROFILE_FREE);
@@ -4274,12 +4288,12 @@ void momResidentQuda(void *mom, QudaGaugeParam *param)
   if (param->make_resident_mom) {
     // we are downloading the momentum from the host
     profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
-    momResident->loadCPUField(cpuMom);
+    momResident->copy(cpuMom);
     profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
   } else if (param->return_result_mom) {
     // we are uploading the momentum to the host
     profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
-    momResident->saveCPUField(cpuMom);
+    cpuMom.copy(*momResident);
     profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
 
     profileGaugeForce.TPSTART(QUDA_PROFILE_FREE);
@@ -4348,7 +4362,7 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
   auto* cudaGauge = new cudaGaugeField(gParam);
 
   if (gauge) {
-    cudaGauge->loadCPUField(*cpuGauge);
+    cudaGauge->copy(*cpuGauge);
     delete cpuGauge;
   }
 
@@ -4363,7 +4377,7 @@ void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
   gParam.geometry = cudaGauge->Geometry();
 
   cpuGaugeField cpuGauge(gParam);
-  cudaGauge->saveCPUField(cpuGauge);
+  cpuGauge.copy(*cudaGauge);
 }
 
 void destroyGaugeFieldQuda(void *gauge)
@@ -4424,7 +4438,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
     cudaMom = momResident;
   } else {
     // download the initial momentum (FIXME make an option just to return?)
-    cudaMom->loadCPUField(cpuMom);
+    cudaMom->copy(cpuMom);
   }
 
   // resident gauge field is required
@@ -4508,7 +4522,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
 
   if (gauge_param->return_result_mom) {
     // copy the momentum field back to the host
-    cudaMom->saveCPUField(cpuMom);
+    cpuMom.copy(*cudaMom);
   }
 
   if (gauge_param->make_resident_mom) {
@@ -4762,7 +4776,10 @@ void computeHISQForceQuda(void* const milc_momentum,
   cudaGaugeField *cudaWLink = new cudaGaugeField(wParam);
   profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
-  cudaWLink->loadCPUField(cpuWLink, profileHISQForce);
+  profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
+  cudaWLink->copy(cpuWLink);
+  profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
+
   cudaWLink->exchangeExtendedGhost(cudaWLink->R(), profileHISQForce);
 
   cudaInForce->exchangeExtendedGhost(R, profileHISQForce);
@@ -4807,7 +4824,9 @@ void computeHISQForceQuda(void* const milc_momentum,
   cudaGaugeField *cudaVLink = new cudaGaugeField(vParam);
   profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
-  cudaVLink->loadCPUField(cpuVLink, profileHISQForce);
+  profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
+  cudaVLink->copy(cpuVLink);
+  profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
   cudaVLink->exchangeExtendedGhost(cudaVLink->R(), profileHISQForce);
 
   profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
@@ -4840,7 +4859,9 @@ void computeHISQForceQuda(void* const milc_momentum,
   cudaGaugeField *cudaULink = new cudaGaugeField(uParam);
   profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
-  cudaULink->loadCPUField(cpuULink, profileHISQForce);
+  profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
+  cudaULink->copy(cpuULink);
+  profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
   cudaULink->exchangeExtendedGhost(cudaULink->R(), profileHISQForce);
 
   // Compute Fat7-staple term
@@ -4870,7 +4891,11 @@ void computeHISQForceQuda(void* const milc_momentum,
 
   if (gParam->return_result_mom) {
     // Close the paths, make anti-hermitian, and store in compressed format
-    if (gParam->return_result_mom) cudaMom->saveCPUField(*cpuMom, profileHISQForce);
+    if (gParam->return_result_mom) {
+      profileHISQForce.TPSTART(QUDA_PROFILE_D2H);
+      cpuMom->copy(*cudaMom);
+      profileHISQForce.TPSTOP(QUDA_PROFILE_D2H);
+    }
   }
 
   profileHISQForce.TPSTART(QUDA_PROFILE_FREE);
@@ -5049,7 +5074,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
 
   // copy the outer product field back to the host
   profileCloverForce.TPSTART(QUDA_PROFILE_D2H);
-  cudaMom.saveCPUField(cpuMom);
+  cpuMom.copy(cudaMom);
   profileCloverForce.TPSTOP(QUDA_PROFILE_D2H);
 
   profileCloverForce.TPSTART(QUDA_PROFILE_FREE);
@@ -5117,7 +5142,7 @@ void updateGaugeFieldQuda(void* gauge,
   profileGaugeUpdate.TPSTART(QUDA_PROFILE_H2D);
 
   if (!param->use_resident_gauge) {   // load fields onto the device
-    cudaInGauge->loadCPUField(*cpuGauge);
+    cudaInGauge->copy(*cpuGauge);
   } else { // or use resident fields already present
     if (!gaugePrecise) errorQuda("No resident gauge field allocated");
     cudaInGauge = gaugePrecise;
@@ -5125,7 +5150,7 @@ void updateGaugeFieldQuda(void* gauge,
   }
 
   if (!param->use_resident_mom) {
-    cudaMom->loadCPUField(*cpuMom);
+    cudaMom->copy(*cpuMom);
   } else {
     if (!momResident) errorQuda("No resident mom field allocated");
     cudaMom = momResident;
@@ -5143,7 +5168,7 @@ void updateGaugeFieldQuda(void* gauge,
   if (param->return_result_gauge) {
     // copy the gauge field back to the host
     profileGaugeUpdate.TPSTART(QUDA_PROFILE_D2H);
-    cudaOutGauge->saveCPUField(*cpuGauge);
+    cpuGauge->copy(*cudaOutGauge);
     profileGaugeUpdate.TPSTOP(QUDA_PROFILE_D2H);
   }
 
@@ -5198,7 +5223,7 @@ void updateGaugeFieldQuda(void* gauge,
      gaugePrecise = nullptr;
    } else {
      profileProject.TPSTART(QUDA_PROFILE_H2D);
-     cudaGauge->loadCPUField(*cpuGauge);
+     cudaGauge->copy(*cpuGauge);
      profileProject.TPSTOP(QUDA_PROFILE_H2D);
    }
 
@@ -5215,9 +5240,11 @@ void updateGaugeFieldQuda(void* gauge,
    if(*num_failures_h>0)
      errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
 
-   profileProject.TPSTART(QUDA_PROFILE_D2H);
-   if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge);
-   profileProject.TPSTOP(QUDA_PROFILE_D2H);
+   if (param->return_result_gauge) {
+     profileProject.TPSTART(QUDA_PROFILE_D2H);
+     cpuGauge->copy(*cudaGauge);
+     profileProject.TPSTOP(QUDA_PROFILE_D2H);
+   }
 
    if (param->make_resident_gauge) {
      if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
@@ -5258,7 +5285,7 @@ void updateGaugeFieldQuda(void* gauge,
      cudaGauge = gaugePrecise;
    } else {
      profilePhase.TPSTART(QUDA_PROFILE_H2D);
-     cudaGauge->loadCPUField(*cpuGauge);
+     cudaGauge->copy(*cpuGauge);
      profilePhase.TPSTOP(QUDA_PROFILE_H2D);
    }
 
@@ -5271,9 +5298,11 @@ void updateGaugeFieldQuda(void* gauge,
 
    profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-   profilePhase.TPSTART(QUDA_PROFILE_D2H);
-   if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge);
-   profilePhase.TPSTOP(QUDA_PROFILE_D2H);
+   if (param->return_result_gauge) {
+     profilePhase.TPSTART(QUDA_PROFILE_D2H);
+     cpuGauge->copy(*cudaGauge);
+     profilePhase.TPSTOP(QUDA_PROFILE_D2H);
+   }
 
    if (param->make_resident_gauge) {
      if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
@@ -5319,7 +5348,7 @@ double momActionQuda(void* momentum, QudaGaugeParam* param)
 
   profileMomAction.TPSTART(QUDA_PROFILE_H2D);
   if (!param->use_resident_mom) {
-    cudaMom->loadCPUField(*cpuMom);
+    cudaMom->copy(*cpuMom);
   } else {
     if (!momResident) errorQuda("No resident mom field allocated");
     cudaMom = momResident;
@@ -5803,7 +5832,7 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u
   GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_INIT);
   GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_H2D);
 
-  cudaInGauge->loadCPUField(*cpuGauge);
+  cudaInGauge->copy(*cpuGauge);
 
   GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_H2D);
 
@@ -5829,7 +5858,7 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u
 
   // copy the gauge field back to the host
   GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_D2H);
-  cudaInGauge->saveCPUField(*cpuGauge);
+  cpuGauge->copy(*cudaInGauge);
   GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_D2H);
 
   GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_TOTAL);
@@ -5881,9 +5910,7 @@ int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir,  const
   GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_INIT);
 
   GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_H2D);
-
-  cudaInGauge->loadCPUField(*cpuGauge);
-
+  cudaInGauge->copy(*cpuGauge);
   GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_H2D);
 
   // perform the update
@@ -5895,7 +5922,7 @@ int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir,  const
 
   // copy the gauge field back to the host
   GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_D2H);
-  cudaInGauge->saveCPUField(*cpuGauge);
+  cpuGauge->copy(*cudaInGauge);
   GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_D2H);
 
   GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_TOTAL);
diff --git a/lib/staggered_kd_build_xinv.cu b/lib/staggered_kd_build_xinv.cu
index 34ddb23137..2ed47976f4 100644
--- a/lib/staggered_kd_build_xinv.cu
+++ b/lib/staggered_kd_build_xinv.cu
@@ -193,7 +193,7 @@ namespace quda {
         tmp_U = std::make_unique<cpuGaugeField>(gf_param);
 
         //Copy the cuda gauge field to the cpu
-        gauge.saveCPUField(reinterpret_cast<cpuGaugeField&>(*tmp_U));
+        tmp_U.get()->copy(gauge);
 
       } else if (location == QUDA_CUDA_FIELD_LOCATION) {
 
diff --git a/tests/hisq_paths_force_test.cpp b/tests/hisq_paths_force_test.cpp
index 116780a790..58b8299223 100644
--- a/tests/hisq_paths_force_test.cpp
+++ b/tests/hisq_paths_force_test.cpp
@@ -376,15 +376,15 @@ static void hisq_force_startup()
    * Copy to and exchange gauge and outer product fields on the device *
    ********************************************************************/
   cpuGauge_ex->exchangeExtendedGhost(R, true);
-  cudaGauge_ex->loadCPUField(*cpuGauge);
+  cudaGauge_ex->copy(*cpuGauge);
   cudaGauge_ex->exchangeExtendedGhost(cudaGauge_ex->R());
 
   cpuOprod_ex->exchangeExtendedGhost(R, true);
-  cudaOprod_ex->loadCPUField(*cpuOprod);
+  cudaOprod_ex->copy(*cpuOprod);
   cudaOprod_ex->exchangeExtendedGhost(cudaOprod_ex->R());
 
   cpuLongLinkOprod_ex->exchangeExtendedGhost(R, true);
-  cudaLongLinkOprod_ex->loadCPUField(*cpuLongLinkOprod);
+  cudaLongLinkOprod_ex->copy(*cpuLongLinkOprod);
   cudaLongLinkOprod_ex->exchangeExtendedGhost(cudaLongLinkOprod_ex->R());
 
   /**********************
@@ -460,7 +460,7 @@ static int hisq_force_test(bool lepage)
 
     copyExtendedGauge(*cpuForce, *cpuForce_ex, QUDA_CPU_FIELD_LOCATION);
     copyExtendedGauge(*cudaForce, *cudaForce_ex, QUDA_CUDA_FIELD_LOCATION);
-    cudaForce->saveCPUField(*hostVerifyForce);
+    hostVerifyForce->copy(*cudaForce);
 
     int res = 1;
     for (int dir = 0; dir < 4; dir++) {
@@ -497,7 +497,7 @@ static int hisq_force_test(bool lepage)
 
       copyExtendedGauge(*cpuForce, *cpuForce_ex, QUDA_CPU_FIELD_LOCATION);
       copyExtendedGauge(*cudaForce, *cudaForce_ex, QUDA_CUDA_FIELD_LOCATION);
-      cudaForce->saveCPUField(*hostVerifyForce);
+      hostVerifyForce->copy(*cudaForce);
 
       int res = 1;
       for (int dir = 0; dir < 4; dir++) {
@@ -526,7 +526,7 @@ static int hisq_force_test(bool lepage)
     host_timer.stop();
     host_time_sec += host_timer.last();
 
-    cudaMom->saveCPUField(*cpuMom);
+    cpuMom->copy(*cudaMom);
   }
 
   int accuracy_level = 3;
diff --git a/tests/hisq_unitarize_force_test.cpp b/tests/hisq_unitarize_force_test.cpp
index 7a9d19255c..d27b09bfd8 100644
--- a/tests/hisq_unitarize_force_test.cpp
+++ b/tests/hisq_unitarize_force_test.cpp
@@ -89,8 +89,8 @@ static void hisq_force_init()
 
   gParam.order = QUDA_QDP_GAUGE_ORDER;
 
-  cudaFatLink->loadCPUField(*cpuFatLink);
-  cudaOprod->loadCPUField(*cpuOprod);
+  cudaFatLink->copy(*cpuFatLink);
+  cudaOprod->copy(*cpuOprod);
 }
 
 static void hisq_force_end()
@@ -135,7 +135,7 @@ TEST(hisq_force_unitarize, verify)
     quda::fermion_force::unitarizeForceCPU(*cpuResult, *cpuOprod, *cpuFatLink);
   }
 
-  cudaResult->saveCPUField(*cpuReference);
+  cpuReference->copy(*cudaResult);
 
   printfQuda("Comparing CPU and GPU results\n");
   int res[4];
diff --git a/tests/pack_test.cpp b/tests/pack_test.cpp
index 3c5974ddc8..694c993895 100644
--- a/tests/pack_test.cpp
+++ b/tests/pack_test.cpp
@@ -116,12 +116,12 @@ void packTest()
     cudaGaugeField cudaCpsGauge(cpsParam);
 
     host_timer.start();
-    cudaCpsGauge.loadCPUField(cpsCpuGauge);
+    cudaCpsGauge.copy(cpsCpuGauge);
     host_timer.stop();
     printfQuda("CPS Gauge send time = %e seconds\n", host_timer.last());
 
     host_timer.start();
-    cudaCpsGauge.saveCPUField(cpsCpuGauge);
+    cpsCpuGauge.copy(cudaCpsGauge);
     host_timer.stop();
     printfQuda("CPS Gauge restore time = %e seconds\n", host_timer.last());
   }
@@ -140,12 +140,12 @@ void packTest()
     cudaGaugeField cudaQdpGauge(qdpParam);
 
     host_timer.start();
-    cudaQdpGauge.loadCPUField(qdpCpuGauge);
+    cudaQdpGauge.copy(qdpCpuGauge);
     host_timer.stop();
     printfQuda("QDP Gauge send time = %e seconds\n", host_timer.last());
 
     host_timer.start();
-    cudaQdpGauge.saveCPUField(qdpCpuGauge);
+    qdpCpuGauge.copy(cudaQdpGauge);
     host_timer.stop();
     printfQuda("QDP Gauge restore time = %e seconds\n", host_timer.last());
   }
diff --git a/tests/unitarize_link_test.cpp b/tests/unitarize_link_test.cpp
index 1c4849ba4a..2d9dc14210 100644
--- a/tests/unitarize_link_test.cpp
+++ b/tests/unitarize_link_test.cpp
@@ -40,7 +40,7 @@ const double unittol = (prec == QUDA_DOUBLE_PRECISION) ? 1e-10 : 1e-6;
 TEST(unitarization, verify)
 {
   unitarizeLinksCPU(*cpuULink, *cpuFatLink);
-  cudaULink->saveCPUField(*cudaResult);
+  cudaResult->copy(*cudaULink);
 
   int res = compare_floats(cudaResult->data(), cpuULink->data(), 4 * cudaResult->Volume() * gauge_site_size, unittol,
                            cpu_prec);
@@ -151,7 +151,7 @@ static int unitarize_link_test(int &test_rc)
 
     computeKSLinkQuda(fatlink, NULL, NULL, inlink, act_path_coeff, &qudaGaugeParam);
 
-    cudaFatLink->loadCPUField(*cpuFatLink);
+    cudaFatLink->copy(*cpuFatLink);
   }
 
   quda::setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error,

From f5e8eaced32d193fd29f58e4283e98afa7d33fd6 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 12 May 2023 18:07:17 -0700
Subject: [PATCH 07/60] Removal of cpuGaugeField and cudaGaugeField, we have
 now only GaugeField

---
 include/dirac_quda.h                          |  66 ++--
 include/gauge_field.h                         |  27 +-
 include/gauge_field_order.h                   |  64 ++--
 include/lattice_field.h                       |   3 -
 include/multigrid.h                           |  14 +-
 include/quda.h                                |   4 +-
 include/quda_milc_interface.h                 |   2 +-
 include/staggered_kd_build_xinv.h             |   4 +-
 lib/CMakeLists.txt                            |   2 +-
 lib/coarse_op.in.cu                           |   6 +-
 lib/coarse_op_preconditioned.in.cu            |   8 +-
 lib/coarsecoarse_op_mma.in.cu                 |   4 +-
 lib/cpu_gauge_field.cpp                       |  12 -
 lib/cuda_gauge_field.cpp                      |  12 -
 lib/dirac_coarse.cpp                          |  34 +-
 lib/dirac_improved_staggered_kd.cpp           |   4 +-
 lib/dirac_staggered_kd.cpp                    |   2 +-
 lib/gauge_field.cpp                           |  48 +--
 lib/gauge_observable.cpp                      |   2 +-
 lib/gauge_polyakov_loop.cu                    |   4 +-
 lib/interface_quda.cpp                        | 310 +++++++++---------
 lib/lattice_field.cpp                         |   2 +-
 lib/milc_interface.cpp                        |   2 +-
 lib/multigrid.cpp                             |  14 +-
 lib/staggered_coarse_op.in.cpp                |  12 +-
 lib/staggered_coarse_op.in.cu                 |  22 +-
 lib/staggered_kd_build_xinv.cu                |  24 +-
 tests/covdev_test.cpp                         |   4 +-
 tests/gauge_alg_test.cpp                      |   8 +-
 tests/gauge_path_test.cpp                     |  14 +-
 tests/heatbath_test.cpp                       |   4 +-
 tests/hisq_paths_force_test.cpp               |  72 ++--
 tests/hisq_unitarize_force_test.cpp           |  28 +-
 .../domain_wall_dslash_reference.cpp          |   6 +-
 tests/host_reference/dslash_test_helpers.cpp  |   6 +-
 .../host_reference/gauge_force_reference.cpp  |   3 +
 tests/host_reference/hisq_force_reference.cpp |  10 +-
 tests/host_reference/hisq_force_reference.h   |  10 +-
 .../wilson_dslash_reference.cpp               |   2 +-
 tests/multigrid_benchmark_test.cpp            |  10 +-
 tests/multigrid_evolve_test.cpp               |   9 +-
 tests/pack_test.cpp                           |  10 +-
 tests/staggered_dslash_test_utils.h           |   8 +-
 tests/unitarize_link_test.cpp                 |  14 +-
 44 files changed, 422 insertions(+), 504 deletions(-)
 delete mode 100644 lib/cpu_gauge_field.cpp
 delete mode 100644 lib/cuda_gauge_field.cpp

diff --git a/include/dirac_quda.h b/include/dirac_quda.h
index dd47a6f8ba..e10437651f 100644
--- a/include/dirac_quda.h
+++ b/include/dirac_quda.h
@@ -51,9 +51,9 @@ namespace quda {
 
     QudaMatPCType matpcType;
     QudaDagType dagger;
-    cudaGaugeField *gauge;
-    cudaGaugeField *fatGauge;  // used by staggered only
-    cudaGaugeField *longGauge; // used by staggered only
+    GaugeField *gauge;
+    GaugeField *fatGauge;  // used by staggered only
+    GaugeField *longGauge; // used by staggered only
     int laplace3D;
     CloverField *clover;
     GaugeField *xInvKD; // used for the Kahler-Dirac operator only
@@ -164,7 +164,7 @@ namespace quda {
     friend class DiracG5M;
 
   protected:
-    cudaGaugeField *gauge;
+    GaugeField *gauge;
     double kappa;
     double mass;
     int laplace3D;
@@ -446,7 +446,7 @@ namespace quda {
 
         @return Error for non-staggered operators
     */
-    virtual cudaGaugeField *getStaggeredShortLinkField() const
+    virtual GaugeField *getStaggeredShortLinkField() const
     {
       errorQuda("Invalid dirac type %d", getDiracType());
       return nullptr;
@@ -457,7 +457,7 @@ namespace quda {
 
         @return Error for non-improved staggered operators
     */
-    virtual cudaGaugeField *getStaggeredLongLinkField() const
+    virtual GaugeField *getStaggeredLongLinkField() const
     {
       errorQuda("Invalid dirac type %d", getDiracType());
       return nullptr;
@@ -472,7 +472,7 @@ namespace quda {
      *  @param long_gauge_in Updated long links
      *  @param clover_in Updated clover field
      */
-    virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *)
+    virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *)
     {
       gauge = gauge_in;
     }
@@ -619,7 +619,7 @@ namespace quda {
      *  @param long_gauge_in Updated long links
      *  @param clover_in Updated clover field
      */
-    virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *clover_in)
+    virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *clover_in)
     {
       DiracWilson::updateFields(gauge_in, nullptr, nullptr, nullptr);
       clover = clover_in;
@@ -975,7 +975,7 @@ namespace quda {
   class DiracMobiusPC : public DiracMobius {
 
   protected:
-    mutable cudaGaugeField *extended_gauge;
+    mutable GaugeField *extended_gauge;
 
   private:
   public:
@@ -1223,7 +1223,7 @@ namespace quda {
      *  @param long_gauge_in Updated long links
      *  @param clover_in Updated clover field
      */
-    virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *clover_in)
+    virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *clover_in)
     {
       DiracWilson::updateFields(gauge_in, nullptr, nullptr, nullptr);
       clover = clover_in;
@@ -1361,7 +1361,7 @@ namespace quda {
 
        @return Gauge field
    */
-    virtual cudaGaugeField *getStaggeredShortLinkField() const { return gauge; }
+    virtual GaugeField *getStaggeredShortLinkField() const { return gauge; }
 
     /**
      * @brief Create the coarse staggered operator.
@@ -1496,7 +1496,7 @@ namespace quda {
      *  @param long_gauge_in Updated long links
      *  @param clover_in Updated clover field
      */
-    virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in,
+    virtual void updateFields(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in,
                               CloverField *clover_in);
 
     /**
@@ -1537,8 +1537,8 @@ namespace quda {
   class DiracImprovedStaggered : public Dirac {
 
   protected:
-    cudaGaugeField *fatGauge;
-    cudaGaugeField *longGauge;
+    GaugeField *fatGauge;
+    GaugeField *longGauge;
 
   public:
     DiracImprovedStaggered(const DiracParam &param);
@@ -1565,14 +1565,14 @@ namespace quda {
 
         @return fat link field
     */
-    virtual cudaGaugeField *getStaggeredShortLinkField() const { return fatGauge; }
+    virtual GaugeField *getStaggeredShortLinkField() const { return fatGauge; }
 
     /**
         @brief return the long link field for staggered operators for MG setup
 
         @return long link field
     */
-    virtual cudaGaugeField *getStaggeredLongLinkField() const { return longGauge; }
+    virtual GaugeField *getStaggeredLongLinkField() const { return longGauge; }
 
     /**
      *  @brief Update the internal gauge, fat gauge, long gauge, clover field pointer as appropriate.
@@ -1583,7 +1583,7 @@ namespace quda {
      *  @param long_gauge_in Updated long links
      *  @param clover_in Updated clover field
      */
-    virtual void updateFields(cudaGaugeField *, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in, CloverField *)
+    virtual void updateFields(GaugeField *, GaugeField *fat_gauge_in, GaugeField *long_gauge_in, CloverField *)
     {
       Dirac::updateFields(fat_gauge_in, nullptr, nullptr, nullptr);
       fatGauge = fat_gauge_in;
@@ -1732,7 +1732,7 @@ namespace quda {
      *  @param long_gauge_in Updated long links
      *  @param clover_in Updated clover field
      */
-    virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in,
+    virtual void updateFields(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in,
                               CloverField *clover_in);
 
     /**
@@ -1785,15 +1785,15 @@ namespace quda {
     const bool allow_truncation; /** Whether or not we let coarsening drop improvements, for ex dropping long links for small aggregate sizes */
     const bool use_mma;            /** Whether to use tensor cores or not */
 
-    mutable cpuGaugeField *Y_h; /** CPU copy of the coarse link field */
-    mutable cpuGaugeField *X_h; /** CPU copy of the coarse clover term */
-    mutable cpuGaugeField *Xinv_h; /** CPU copy of the inverse coarse clover term */
-    mutable cpuGaugeField *Yhat_h; /** CPU copy of the preconditioned coarse link field */
+    mutable GaugeField *Y_h; /** CPU copy of the coarse link field */
+    mutable GaugeField *X_h; /** CPU copy of the coarse clover term */
+    mutable GaugeField *Xinv_h; /** CPU copy of the inverse coarse clover term */
+    mutable GaugeField *Yhat_h; /** CPU copy of the preconditioned coarse link field */
 
-    mutable cudaGaugeField *Y_d; /** GPU copy of the coarse link field */
-    mutable cudaGaugeField *X_d; /** GPU copy of the coarse clover term */
-    mutable cudaGaugeField *Xinv_d; /** GPU copy of inverse coarse clover term */
-    mutable cudaGaugeField *Yhat_d; /** GPU copy of the preconditioned coarse link field */
+    mutable GaugeField *Y_d; /** GPU copy of the coarse link field */
+    mutable GaugeField *X_d; /** GPU copy of the coarse clover term */
+    mutable GaugeField *Xinv_d; /** GPU copy of inverse coarse clover term */
+    mutable GaugeField *Yhat_d; /** GPU copy of the preconditioned coarse link field */
 
     /**
        @brief Initialize the coarse gauge fields.  Location is
@@ -1852,9 +1852,9 @@ namespace quda {
        @param[in] Xinv_d GPU coarse inverse clover field
        @param[in] Yhat_d GPU coarse preconditioned link field
      */
-    DiracCoarse(const DiracParam &param, cpuGaugeField *Y_h, cpuGaugeField *X_h, cpuGaugeField *Xinv_h,
-                cpuGaugeField *Yhat_h, cudaGaugeField *Y_d = nullptr, cudaGaugeField *X_d = nullptr,
-                cudaGaugeField *Xinv_d = nullptr, cudaGaugeField *Yhat_d = nullptr);
+    DiracCoarse(const DiracParam &param, GaugeField *Y_h, GaugeField *X_h, GaugeField *Xinv_h,
+                GaugeField *Yhat_h, GaugeField *Y_d = nullptr, GaugeField *X_d = nullptr,
+                GaugeField *Xinv_d = nullptr, GaugeField *Yhat_d = nullptr);
 
     /**
        @param[in] dirac Another operator instance to clone from (shallow copy)
@@ -1944,7 +1944,7 @@ namespace quda {
 
     virtual QudaDiracType getDiracType() const { return QUDA_COARSE_DIRAC; }
 
-    virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *)
+    virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *)
     {
       Dirac::updateFields(gauge_in, nullptr, nullptr, nullptr);
       warningQuda("Coarse gauge links cannot be trivially updated for DiracCoarse(PC). Perform an MG update instead.");
@@ -2008,9 +2008,9 @@ namespace quda {
        @param[in] Xinv_d GPU coarse inverse clover field
        @param[in] Yhat_d GPU coarse preconditioned link field
      */
-    DiracCoarsePC(const DiracParam &param, cpuGaugeField *Y_h, cpuGaugeField *X_h, cpuGaugeField *Xinv_h,
-                  cpuGaugeField *Yhat_h, cudaGaugeField *Y_d = nullptr, cudaGaugeField *X_d = nullptr,
-                  cudaGaugeField *Xinv_d = nullptr, cudaGaugeField *Yhat_d = nullptr);
+    DiracCoarsePC(const DiracParam &param, GaugeField *Y_h, GaugeField *X_h, GaugeField *Xinv_h,
+                  GaugeField *Yhat_h, GaugeField *Y_d = nullptr, GaugeField *X_d = nullptr,
+                  GaugeField *Xinv_d = nullptr, GaugeField *Yhat_d = nullptr);
 
     /**
        @param[in] dirac Another operator instance to clone from (shallow copy)
diff --git a/include/gauge_field.h b/include/gauge_field.h
index 7648ba7f9b..23fb8939e3 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -575,27 +575,6 @@ namespace quda {
     void copy_from_buffer(void *buffer);
   };
 
-  class cudaGaugeField : public GaugeField
-  {
-
-  public:
-    cudaGaugeField(const GaugeFieldParam &);
-
-  };
-
-  class cpuGaugeField : public GaugeField {
-
-  public:
-    /**
-       @brief Constructor for cpuGaugeField from a GaugeFieldParam
-       @param[in,out] param Parameter struct - note that in the case
-       that we are wrapping host-side extended fields, this param is
-       modified for subsequent creation of fields that are not
-       extended.
-    */
-    cpuGaugeField(const GaugeFieldParam &param);
-  };
-
   /**
      @brief This is a debugging function, where we cast a gauge field
      into a spinor field so we can compute its L1 norm.
@@ -666,8 +645,8 @@ namespace quda {
      @param recon The reconsturction type
      @return the pointer to the extended gauge field
   */
-  cudaGaugeField *createExtendedGauge(cudaGaugeField &in, const lat_dim_t &R, TimeProfile &profile,
-                                      bool redundant_comms = false, QudaReconstructType recon = QUDA_RECONSTRUCT_INVALID);
+  GaugeField *createExtendedGauge(GaugeField &in, const lat_dim_t &R, TimeProfile &profile,
+                                  bool redundant_comms = false, QudaReconstructType recon = QUDA_RECONSTRUCT_INVALID);
 
   /**
      This function is used for creating an exteneded (cpu) gauge field from the input,
@@ -676,7 +655,7 @@ namespace quda {
      @param R By how many do we want to extend the gauge field in each direction
      @return the pointer to the extended gauge field
   */
-  cpuGaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R);
+  GaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R);
 
   /**
      This function is used for  extracting the gauge ghost zone from a
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 451c8312c6..82ae78b29d 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -430,17 +430,17 @@ namespace quda {
 
       GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr)
       {
-        if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) {
-          for (int d=0; d<4; d++) {
-            ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
-              static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d].data()));
-            ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
-
-            ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
-              ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
-              static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4].data()));
-            ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
-          }
+        for (int d=0; d<4; d++) {
+          ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
+            U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ?
+            static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d].data())) : nullptr;
+          ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
+
+          ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
+            ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
+            U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ?
+            static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4].data())) : nullptr;
+          ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
         }
 
 	resetScale(U.Scale());
@@ -552,17 +552,16 @@ namespace quda {
 
       GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr)
       {
-        if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) {
-          for (int d=0; d<4; d++) {
-            ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
-              static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d].data()));
-            ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
+        for (int d=0; d<4; d++) {
+          ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
+            U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d].data())) : nullptr;
+          ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
 
-            ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
-              ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
-              static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4].data()));
-            ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
-          }
+          ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
+            ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
+            U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ?
+            static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4].data())) : nullptr;
+          ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
         }
 
 	resetScale(U.Scale());
@@ -689,13 +688,11 @@ namespace quda {
         accessor(U, gauge_, ghost_)
       {
         if constexpr (!native_ghost) assert(ghost_ != nullptr);
-        if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) {
-          for (int d = 0; d < 4; d++) {
-            ghost[d] = !native_ghost ? static_cast<complex<storeFloat>*>(ghost_[d]) : nullptr;
-            ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d);
-            ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast<complex<storeFloat>*>(ghost_[d+4]) : nullptr;
-            ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d);
-          }
+        for (int d = 0; d < 4; d++) {
+          ghost[d] = !native_ghost ? static_cast<complex<storeFloat>*>(ghost_[d]) : nullptr;
+          ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d);
+          ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast<complex<storeFloat>*>(ghost_[d+4]) : nullptr;
+          ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d);
         }
         resetScale(U.Scale());
       }
@@ -1752,7 +1749,7 @@ namespace quda {
 
       /**
          @brief The LegacyOrder defines the ghost zone storage and ordering for
-         all cpuGaugeFields, which use the same ghost zone storage.
+         all non-native fields, which use the same ghost zone storage.
       */
       template <typename Float, int length_> struct LegacyOrder {
         static constexpr int length = length_;
@@ -1776,11 +1773,10 @@ namespace quda {
           if (geometry == QUDA_COARSE_GEOMETRY)
             errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone");
 
-          if (u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) {
-            for (int i = 0; i < 4; i++) {
-              ghost[i] = (ghost_) ? ghost_[i] : (Float *)(u.Ghost()[i].data());
-              faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth
-            }
+          for (int i = 0; i < 4; i++) {
+            ghost[i] = (ghost_) ? ghost_[i] :
+              u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? (Float *)(u.Ghost()[i].data()) : nullptr;
+            faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth
           }
         }
 
diff --git a/include/lattice_field.h b/include/lattice_field.h
index 887fc248e7..38653350cc 100644
--- a/include/lattice_field.h
+++ b/include/lattice_field.h
@@ -34,9 +34,6 @@ namespace quda {
   class cudaEigVecSet;
 
   class GaugeField;
-  class cpuGaugeField;
-  class cudaGaugeField;
-
   class CloverField;
 
   enum class QudaOffsetCopyMode { COLLECT, DISPERSE };
diff --git a/include/multigrid.h b/include/multigrid.h
index 32273032e8..5204f32b25 100644
--- a/include/multigrid.h
+++ b/include/multigrid.h
@@ -382,9 +382,9 @@ namespace quda {
        @brief This method only resets the KD operators with the updated fine links and rebuilds
               the KD inverse
      */
-    void resetStaggeredKD(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in,
-                          cudaGaugeField *gauge_sloppy_in, cudaGaugeField *fat_gauge_sloppy_in,
-                          cudaGaugeField *long_gauge_sloppy_in, double mass);
+    void resetStaggeredKD(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in,
+                          GaugeField *gauge_sloppy_in, GaugeField *fat_gauge_sloppy_in,
+                          GaugeField *long_gauge_sloppy_in, double mass);
 
     /**
        @brief Dump the null-space vectors to disk.  Will recurse dumping all levels.
@@ -595,13 +595,13 @@ namespace quda {
      operator we are constructing the coarse grid operator from.
      For staggered, should always be QUDA_MATPC_INVALID.
    */
-  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
-                         const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
+  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge,
+                         const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
                          QudaDiracType dirac, QudaMatPCType matpc);
 
   template <int fineColor, int coarseColor>
-  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
-                         const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
+  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge,
+                         const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
                          QudaDiracType dirac, QudaMatPCType matpc);
 
   /**
diff --git a/include/quda.h b/include/quda.h
index 31ed24bd01..b697ef7400 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -62,7 +62,7 @@ extern "C" {
 
     QudaGaugeFixed gauge_fix; /**< Whether the input gauge field is in the axial gauge or not */
 
-    int ga_pad;       /**< The pad size that the cudaGaugeField will use (default=0) */
+    int ga_pad;       /**< The pad size that native GaugeFields will use (default=0) */
 
     int site_ga_pad;  /**< Used by link fattening and the gauge and fermion forces */
 
@@ -1488,7 +1488,7 @@ extern "C" {
   void  saveGaugeFieldQuda(void* outGauge, void* inGauge, QudaGaugeParam* param);
 
   /**
-   * Reinterpret gauge as a pointer to cudaGaugeField and call destructor.
+   * Reinterpret gauge as a pointer to a GaugeField and call destructor.
    *
    * @param gauge Gauge field to be freed
    */
diff --git a/include/quda_milc_interface.h b/include/quda_milc_interface.h
index 23275eedb2..88904d481d 100644
--- a/include/quda_milc_interface.h
+++ b/include/quda_milc_interface.h
@@ -1014,7 +1014,7 @@ extern "C" {
 			  void* inGauge);
 
   /**
-   * Reinterpret gauge as a pointer to cudaGaugeField and call destructor.
+   * Reinterpret gauge as a pointer to a GaugeField and call destructor.
    *
    * @param gauge Gauge field to be freed
    */
diff --git a/include/staggered_kd_build_xinv.h b/include/staggered_kd_build_xinv.h
index fdf57eccf8..2bd1b4f600 100644
--- a/include/staggered_kd_build_xinv.h
+++ b/include/staggered_kd_build_xinv.h
@@ -14,7 +14,7 @@ namespace quda
      @param mass [in] Mass of the original staggered operator w/out factor of 2 convention
      @param dagger_approximation[in] Whether or not to use the dagger approximation, using the dagger of X instead of Xinv
   */
-  void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const cudaGaugeField &gauge, const double mass,
+  void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const GaugeField &gauge, const double mass,
                                         const bool dagger_approximation);
 
   /**
@@ -34,7 +34,7 @@ namespace quda
      @param dagger_approximation[in] Whether or not to use the dagger approximation, using the dagger of X instead of Xinv
      @return constructed Xinv
   */
-  std::shared_ptr<GaugeField> AllocateAndBuildStaggeredKahlerDiracInverse(const cudaGaugeField &gauge, const double mass,
+  std::shared_ptr<GaugeField> AllocateAndBuildStaggeredKahlerDiracInverse(const GaugeField &gauge, const double mass,
                                                                           const bool dagger_approximation);
 
 } // namespace quda
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index dd354ca735..37a83e001c 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -36,7 +36,7 @@ set (QUDA_OBJS
   field_cache.cpp
   gauge_covdev.cpp dirac.cpp
   clover_field.cpp lattice_field.cpp gauge_field.cpp
-  cpu_gauge_field.cpp cuda_gauge_field.cpp extract_gauge_ghost.cu
+  extract_gauge_ghost.cu
   gauge_norm.cu gauge_update_quda.cu
   max_clover.cu dirac_clover.cpp dirac_wilson.cpp dirac_staggered.cpp
   dirac_staggered_kd.cpp dirac_clover_hasenbusch_twist.cpp
diff --git a/lib/coarse_op.in.cu b/lib/coarse_op.in.cu
index 320c14bf12..0684e0e97a 100644
--- a/lib/coarse_op.in.cu
+++ b/lib/coarse_op.in.cu
@@ -173,17 +173,17 @@ namespace quda {
       gf_param.nFace = 1;
       gf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
 
-      U = new cpuGaugeField(gf_param);
+      U = new GaugeField(gf_param);
 
       //Copy the cuda gauge field to the cpu
-      static_cast<const cudaGaugeField&>(gauge).saveCPUField(*static_cast<cpuGaugeField*>(U));
+      U->copy(gauge);
     } else if (location == QUDA_CUDA_FIELD_LOCATION && gauge.Reconstruct() != QUDA_RECONSTRUCT_NO) {
       //Create a copy of the gauge field with no reconstruction, required for fine-grained access
       GaugeFieldParam gf_param(gauge);
       gf_param.reconstruct = QUDA_RECONSTRUCT_NO;
       gf_param.order = QUDA_FLOAT2_GAUGE_ORDER;
       gf_param.setPrecision(gf_param.Precision());
-      U = new cudaGaugeField(gf_param);
+      U = new GaugeField(gf_param);
 
       U->copy(gauge);
     }
diff --git a/lib/coarse_op_preconditioned.in.cu b/lib/coarse_op_preconditioned.in.cu
index b80e018a8e..ae41a3cde7 100644
--- a/lib/coarse_op_preconditioned.in.cu
+++ b/lib/coarse_op_preconditioned.in.cu
@@ -160,7 +160,7 @@ namespace quda
           GaugeFieldParam param(X);
           param.order = gOrder_milc;
           param.setPrecision(X.Precision() < QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : X.Precision());
-          output = cudaGaugeField::Create(param);
+          output = new GaugeField(param);
           if (copy_content) output->copy(X);
         }
         return output;
@@ -180,9 +180,7 @@ namespace quda
       if (!use_mma) { delete Xinv_aos; }
 
     } else if (X.Location() == QUDA_CPU_FIELD_LOCATION && X.Order() == QUDA_QDP_GAUGE_ORDER) {
-      const cpuGaugeField *X_h = static_cast<const cpuGaugeField*>(&X);
-      cpuGaugeField *Xinv_h = static_cast<cpuGaugeField*>(&Xinv);
-      blas::flops += invert(Xinv_h->data<void *>(0), X_h->data<void *>(0), n, X_h->Volume(), X.Precision(), X.Location());
+      blas::flops += invert(Xinv.data<void *>(0), X.data<void *>(0), n, X.Volume(), X.Precision(), X.Location());
     } else {
       errorQuda("Unsupported location=%d and order=%d", X.Location(), X.Order());
     }
@@ -206,7 +204,7 @@ namespace quda
             param.order = order;
             // if we did the exchange on AoS order, then this zero initialize wouldn't be needed
             if (!copy_content) param.create = QUDA_ZERO_FIELD_CREATE;
-            output = cudaGaugeField::Create(param);
+            output = new GaugeField(param);
             if (copy_content) output->copy(X);
           }
           return output;
diff --git a/lib/coarsecoarse_op_mma.in.cu b/lib/coarsecoarse_op_mma.in.cu
index ee18191dbb..8ccd052a1c 100644
--- a/lib/coarsecoarse_op_mma.in.cu
+++ b/lib/coarsecoarse_op_mma.in.cu
@@ -40,10 +40,10 @@ namespace quda {
       } else {
         GaugeFieldParam param(X);
         param.order = order;
-        output = cudaGaugeField::Create(param);
+        output = new GaugeField(param);
         if (copy_content) output->copy(X);
       }
-      return static_cast<cudaGaugeField *>(output);
+      return static_cast<GaugeField *>(output);
     };
 
     auto Y_order = create_gauge_copy(Y, gOrder, false);
diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp
deleted file mode 100644
index 8927fdb2d3..0000000000
--- a/lib/cpu_gauge_field.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#include <quda_internal.h>
-#include <timer.h>
-#include <gauge_field.h>
-#include <assert.h>
-#include <string.h>
-#include <typeinfo>
-
-namespace quda {
-
-  cpuGaugeField::cpuGaugeField(const GaugeFieldParam &param) : GaugeField(param) {}
-
-} // namespace quda
diff --git a/lib/cuda_gauge_field.cpp b/lib/cuda_gauge_field.cpp
deleted file mode 100644
index e4d56bdfce..0000000000
--- a/lib/cuda_gauge_field.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#include <cstring>
-#include <typeinfo>
-#include <gauge_field.h>
-#include <timer.h>
-#include <blas_quda.h>
-#include <device.h>
-
-namespace quda {
-
-  cudaGaugeField::cudaGaugeField(const GaugeFieldParam &param) : GaugeField(param) {}
-
-} // namespace quda
diff --git a/lib/dirac_coarse.cpp b/lib/dirac_coarse.cpp
index 697f86bf9e..053f71b8f4 100644
--- a/lib/dirac_coarse.cpp
+++ b/lib/dirac_coarse.cpp
@@ -33,10 +33,10 @@ namespace quda {
     initializeCoarse();
   }
 
-  DiracCoarse::DiracCoarse(const DiracParam &param, cpuGaugeField *Y_h, cpuGaugeField *X_h, cpuGaugeField *Xinv_h,
-                           cpuGaugeField *Yhat_h, // cpu link fields
-                           cudaGaugeField *Y_d, cudaGaugeField *X_d, cudaGaugeField *Xinv_d,
-                           cudaGaugeField *Yhat_d) // gpu link field
+  DiracCoarse::DiracCoarse(const DiracParam &param, GaugeField *Y_h, GaugeField *X_h, GaugeField *Xinv_h,
+                           GaugeField *Yhat_h, // cpu (host) link fields: same type as the gpu ones, told apart by the _h suffix
+                           GaugeField *Y_d, GaugeField *X_d, GaugeField *Xinv_d,
+                           GaugeField *Yhat_d) // gpu (device) link fields: told apart by the _d suffix
     :
     Dirac(param),
     mass(param.mass),
@@ -138,16 +138,16 @@ namespace quda {
     int pad = std::max( { (x[0]*x[1]*x[2])/2, (x[1]*x[2]*x[3])/2, (x[0]*x[2]*x[3])/2, (x[0]*x[1]*x[3])/2 } );
     gParam.pad = gpu ? gParam.nFace * pad * 2 : 0; // factor of 2 since we have to store bi-directional ghost zone
 
-    if (gpu) Y_d = new cudaGaugeField(gParam);
-    else     Y_h = new cpuGaugeField(gParam);
+    if (gpu) Y_d = new GaugeField(gParam);
+    else     Y_h = new GaugeField(gParam);
 
     gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
     gParam.nFace = 0;
     gParam.geometry = QUDA_SCALAR_GEOMETRY;
     gParam.pad = 0;
 
-    if (gpu) X_d = new cudaGaugeField(gParam);
-    else     X_h = new cpuGaugeField(gParam);
+    if (gpu) X_d = new GaugeField(gParam);
+    else     X_h = new GaugeField(gParam);
   }
 
   void DiracCoarse::createYhat(bool gpu) const
@@ -180,8 +180,8 @@ namespace quda {
     int pad = std::max( { (x[0]*x[1]*x[2])/2, (x[1]*x[2]*x[3])/2, (x[0]*x[2]*x[3])/2, (x[0]*x[1]*x[3])/2 } );
     gParam.pad = gpu ? gParam.nFace * pad * 2 : 0; // factor of 2 since we have to store bi-directional ghost zone
 
-    if (gpu) Yhat_d = new cudaGaugeField(gParam);
-    else     Yhat_h = new cpuGaugeField(gParam);
+    if (gpu) Yhat_d = new GaugeField(gParam);
+    else     Yhat_h = new GaugeField(gParam);
 
     gParam.setPrecision(gpu ? X_d->Precision() : X_h->Precision());
     gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
@@ -189,8 +189,8 @@ namespace quda {
     gParam.geometry = QUDA_SCALAR_GEOMETRY;
     gParam.pad = 0;
 
-    if (gpu) Xinv_d = new cudaGaugeField(gParam);
-    else     Xinv_h = new cpuGaugeField(gParam);
+    if (gpu) Xinv_d = new GaugeField(gParam);
+    else     Xinv_h = new GaugeField(gParam);
   }
 
   void DiracCoarse::initializeCoarse()
@@ -224,8 +224,8 @@ namespace quda {
         Y_param.order = gOrder;
         X_param.order = gOrder;
 
-        GaugeField *Y_order = cudaGaugeField::Create(Y_param);
-        GaugeField *X_order = cudaGaugeField::Create(X_param);
+        GaugeField *Y_order = GaugeField::Create(Y_param);
+        GaugeField *X_order = GaugeField::Create(X_param);
 
         dirac->createCoarseOp(*Y_order, *X_order, *transfer, kappa, mass, Mu(), MuFactor(), AllowTruncation());
 
@@ -438,9 +438,9 @@ namespace quda {
     /* do nothing */
   }
 
-  DiracCoarsePC::DiracCoarsePC(const DiracParam &param, cpuGaugeField *Y_h, cpuGaugeField *X_h, cpuGaugeField *Xinv_h,
-                               cpuGaugeField *Yhat_h, cudaGaugeField *Y_d, cudaGaugeField *X_d, cudaGaugeField *Xinv_d,
-                               cudaGaugeField *Yhat_d) :
+  DiracCoarsePC::DiracCoarsePC(const DiracParam &param, GaugeField *Y_h, GaugeField *X_h, GaugeField *Xinv_h,
+                               GaugeField *Yhat_h, GaugeField *Y_d, GaugeField *X_d, GaugeField *Xinv_d,
+                               GaugeField *Yhat_d) :
     DiracCoarse(param, Y_h, X_h, Xinv_h, Yhat_h, Y_d, X_d, Xinv_d, Yhat_d)
   {
   }
diff --git a/lib/dirac_improved_staggered_kd.cpp b/lib/dirac_improved_staggered_kd.cpp
index fdba112b7f..39e6080cd6 100644
--- a/lib/dirac_improved_staggered_kd.cpp
+++ b/lib/dirac_improved_staggered_kd.cpp
@@ -154,8 +154,8 @@ namespace quda
     // Should we support "preparing" and "reconstructing"?
   }
 
-  void DiracImprovedStaggeredKD::updateFields(cudaGaugeField *, cudaGaugeField *fat_gauge_in,
-                                              cudaGaugeField *long_gauge_in, CloverField *)
+  void DiracImprovedStaggeredKD::updateFields(GaugeField *, GaugeField *fat_gauge_in,
+                                              GaugeField *long_gauge_in, CloverField *)
   {
     Dirac::updateFields(fat_gauge_in, nullptr, nullptr, nullptr);
     fatGauge = fat_gauge_in;
diff --git a/lib/dirac_staggered_kd.cpp b/lib/dirac_staggered_kd.cpp
index 9271c8afc3..db339402da 100644
--- a/lib/dirac_staggered_kd.cpp
+++ b/lib/dirac_staggered_kd.cpp
@@ -150,7 +150,7 @@ namespace quda
     // Should we support "preparing" and "reconstructing"?
   }
 
-  void DiracStaggeredKD::updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *)
+  void DiracStaggeredKD::updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *)
   {
     Dirac::updateFields(gauge_in, nullptr, nullptr, nullptr);
   }
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 40cb1bf9b6..61ea7ab505 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -256,7 +256,7 @@ namespace quda {
 
   void GaugeField::createGhostZone(const lat_dim_t &R, bool no_comms_fill, bool bidir) const
   {
-    if (typeid(*this) == typeid(cpuGaugeField)) return;
+    if (location == QUDA_CPU_FIELD_LOCATION) return;
 
     // if this is not a bidirectional exchange then we are doing a
     // scalar exchange, e.g., only the link matrix in the direcion we
@@ -288,26 +288,14 @@ namespace quda {
 
     if (phase != QUDA_STAGGERED_PHASE_INVALID) staggeredPhaseType = phase;
     applyGaugePhase(*this);
-    if (ghostExchange==QUDA_GHOST_EXCHANGE_PAD) {
-      if (typeid(*this)==typeid(cudaGaugeField)) {
-	static_cast<cudaGaugeField&>(*this).exchangeGhost();
-      } else {
-	static_cast<cpuGaugeField&>(*this).exchangeGhost();
-      }
-    }
+    if (ghostExchange==QUDA_GHOST_EXCHANGE_PAD) exchangeGhost();
     staggeredPhaseApplied = true;
   }
 
   void GaugeField::removeStaggeredPhase() {
     if (!staggeredPhaseApplied) errorQuda("No staggered phases to remove");
     applyGaugePhase(*this);
-    if (ghostExchange==QUDA_GHOST_EXCHANGE_PAD) {
-      if (typeid(*this)==typeid(cudaGaugeField)) {
-	static_cast<cudaGaugeField&>(*this).exchangeGhost();
-      } else {
-	static_cast<cpuGaugeField&>(*this).exchangeGhost();
-      }
-    }
+    if (ghostExchange==QUDA_GHOST_EXCHANGE_PAD) exchangeGhost();
     staggeredPhaseApplied = false;
   }
 
@@ -1144,27 +1132,15 @@ namespace quda {
     return Checksum(*this, mini);
   }
 
-  GaugeField* GaugeField::Create(const GaugeFieldParam &param) {
-
-    GaugeField *field = nullptr;
-    if (param.location == QUDA_CPU_FIELD_LOCATION) {
-      field = new cpuGaugeField(param);
-    } else if (param.location== QUDA_CUDA_FIELD_LOCATION) {
-      field = new cudaGaugeField(param);
-    } else {
-      errorQuda("Invalid field location %d", param.location);
-    }
-
-    return field;
-  }
+  GaugeField* GaugeField::Create(const GaugeFieldParam &param) { return new GaugeField(param); }
 
   // helper for creating extended gauge fields
-  cudaGaugeField *createExtendedGauge(cudaGaugeField &in, const lat_dim_t &R, TimeProfile &profile,
-                                      bool redundant_comms, QudaReconstructType recon)
+  GaugeField *createExtendedGauge(GaugeField &in, const lat_dim_t &R, TimeProfile &profile,
+                                  bool redundant_comms, QudaReconstructType recon)
   {
     profile.TPSTART(QUDA_PROFILE_INIT);
     GaugeFieldParam gParamEx(in);
-    gParamEx.location = QUDA_CUDA_FIELD_LOCATION;
+    // location is no longer forced to QUDA_CUDA_FIELD_LOCATION: gParamEx keeps whatever GaugeFieldParam(in) set
     gParamEx.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
     gParamEx.pad = 0;
     gParamEx.nFace = 1;
@@ -1177,10 +1153,10 @@ namespace quda {
     if (recon != QUDA_RECONSTRUCT_INVALID) gParamEx.reconstruct = recon;
     gParamEx.setPrecision(gParamEx.Precision(), true);
 
-    auto *out = new cudaGaugeField(gParamEx);
+    auto *out = new GaugeField(gParamEx);
 
     // copy input field into the extended device gauge field
-    copyExtendedGauge(*out, in, QUDA_CUDA_FIELD_LOCATION);
+    copyExtendedGauge(*out, in, out->Location()); // copy at the output field's location, no longer assumed to be CUDA
 
     profile.TPSTOP(QUDA_PROFILE_INIT);
 
@@ -1191,10 +1167,10 @@ namespace quda {
   }
 
   // helper for creating extended (cpu) gauge fields
-  cpuGaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R)
+  GaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R)
   {
     GaugeFieldParam gauge_field_param(gauge_param, gauge);
-    cpuGaugeField cpu(gauge_field_param);
+    GaugeField cpu(gauge_field_param);
 
     gauge_field_param.location = QUDA_CPU_FIELD_LOCATION;
     gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
@@ -1203,7 +1179,7 @@ namespace quda {
       gauge_field_param.x[d] += 2 * R[d];
       gauge_field_param.r[d] = R[d];
     }
-    cpuGaugeField *padded_cpu = new cpuGaugeField(gauge_field_param);
+    GaugeField *padded_cpu = new GaugeField(gauge_field_param);
 
     copyExtendedGauge(*padded_cpu, cpu, QUDA_CPU_FIELD_LOCATION);
     padded_cpu->exchangeExtendedGhost(R, true); // Do comm to fill halo = true
diff --git a/lib/gauge_observable.cpp b/lib/gauge_observable.cpp
index 42d07e19cc..b825a2ad81 100644
--- a/lib/gauge_observable.cpp
+++ b/lib/gauge_observable.cpp
@@ -66,7 +66,7 @@ namespace quda
     tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
     tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
     tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-    cudaGaugeField gaugeFmunu(tensorParam);
+    GaugeField gaugeFmunu(tensorParam);
     profile.TPSTOP(QUDA_PROFILE_INIT);
 
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
diff --git a/lib/gauge_polyakov_loop.cu b/lib/gauge_polyakov_loop.cu
index a61027dc81..99ae5ea149 100644
--- a/lib/gauge_polyakov_loop.cu
+++ b/lib/gauge_polyakov_loop.cu
@@ -164,14 +164,14 @@ namespace quda {
       // as a function of the number of ranks in the `t` dimension
       gParam.setPrecision(QUDA_DOUBLE_PRECISION);
 
-      std::unique_ptr<GaugeField> product_field = std::make_unique<cudaGaugeField>(gParam);
+      std::unique_ptr<GaugeField> product_field = std::make_unique<GaugeField>(gParam);
       GaugeField& product_field_ref = reinterpret_cast<GaugeField&>(*product_field.get());
 
       // Create the field we reduce into
       x[3] = comm_dim(3);
       gParam.x = x;
       gParam.create = QUDA_NULL_FIELD_CREATE;
-      condensed_field = std::make_unique<cudaGaugeField>(gParam);
+      condensed_field = std::make_unique<GaugeField>(gParam);
       GaugeField& condensed_field_ref = reinterpret_cast<GaugeField&>(*condensed_field.get());
       profile.TPSTOP(QUDA_PROFILE_INIT);
 
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 27930a3b9a..5ed54e37f2 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -72,28 +72,28 @@ static bool redundant_comms = false;
 #include <blas_lapack.h>
 
 
-cudaGaugeField *gaugePrecise = nullptr;
-cudaGaugeField *gaugeSloppy = nullptr;
-cudaGaugeField *gaugePrecondition = nullptr;
-cudaGaugeField *gaugeRefinement = nullptr;
-cudaGaugeField *gaugeEigensolver = nullptr;
-cudaGaugeField *gaugeExtended = nullptr;
-
-cudaGaugeField *gaugeFatPrecise = nullptr;
-cudaGaugeField *gaugeFatSloppy = nullptr;
-cudaGaugeField *gaugeFatPrecondition = nullptr;
-cudaGaugeField *gaugeFatRefinement = nullptr;
-cudaGaugeField *gaugeFatEigensolver = nullptr;
-cudaGaugeField *gaugeFatExtended = nullptr;
-
-cudaGaugeField *gaugeLongPrecise = nullptr;
-cudaGaugeField *gaugeLongSloppy = nullptr;
-cudaGaugeField *gaugeLongPrecondition = nullptr;
-cudaGaugeField *gaugeLongRefinement = nullptr;
-cudaGaugeField *gaugeLongEigensolver = nullptr;
-cudaGaugeField *gaugeLongExtended = nullptr;
-
-cudaGaugeField *gaugeSmeared = nullptr;
+GaugeField *gaugePrecise = nullptr;
+GaugeField *gaugeSloppy = nullptr;
+GaugeField *gaugePrecondition = nullptr;
+GaugeField *gaugeRefinement = nullptr;
+GaugeField *gaugeEigensolver = nullptr;
+GaugeField *gaugeExtended = nullptr;
+
+GaugeField *gaugeFatPrecise = nullptr;
+GaugeField *gaugeFatSloppy = nullptr;
+GaugeField *gaugeFatPrecondition = nullptr;
+GaugeField *gaugeFatRefinement = nullptr;
+GaugeField *gaugeFatEigensolver = nullptr;
+GaugeField *gaugeFatExtended = nullptr;
+
+GaugeField *gaugeLongPrecise = nullptr;
+GaugeField *gaugeLongSloppy = nullptr;
+GaugeField *gaugeLongPrecondition = nullptr;
+GaugeField *gaugeLongRefinement = nullptr;
+GaugeField *gaugeLongEigensolver = nullptr;
+GaugeField *gaugeLongExtended = nullptr;
+
+GaugeField *gaugeSmeared = nullptr;
 
 CloverField *cloverPrecise = nullptr;
 CloverField *cloverSloppy = nullptr;
@@ -101,8 +101,8 @@ CloverField *cloverPrecondition = nullptr;
 CloverField *cloverRefinement = nullptr;
 CloverField *cloverEigensolver = nullptr;
 
-cudaGaugeField *momResident = nullptr;
-cudaGaugeField *extendedGaugeResident = nullptr;
+GaugeField *momResident = nullptr;
+GaugeField *extendedGaugeResident = nullptr;
 
 std::vector<ColorSpinorField> solutionResident;
 
@@ -536,8 +536,8 @@ static bool invalidate_clover = true;
  * @param refinement[in/out] Reference the to pointer of a given "refinement" field.
  * @param eigensolver[in/out] Reference then to pointer of a given "eigensolver" field.
  */
-void freeUniqueSloppyGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&sloppy, cudaGaugeField *&precondition,
-                                  cudaGaugeField *&refinement, cudaGaugeField *&eigensolver);
+void freeUniqueSloppyGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition,
+                                  GaugeField *&refinement, GaugeField *&eigensolver);
 
 /**
  * Abstraction utility that cleans up the full set of sloppy fields, as well as
@@ -552,8 +552,8 @@ void freeUniqueSloppyGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&slo
  * @param extended[in/out] Reference to the pointer of a given "extended" field.
  * @param preserve_precise[in] Whether (true) or not (false) to preserve the precise field.
  */
-void freeUniqueGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&sloppy, cudaGaugeField *&precondition,
-                            cudaGaugeField *&refinement, cudaGaugeField *&eigensolver, cudaGaugeField *&extended,
+void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition,
+                            GaugeField *&refinement, GaugeField *&eigensolver, GaugeField *&extended,
                             bool preserve_precise);
 
 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
@@ -571,8 +571,8 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 
   if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-  GaugeField *in = (param->location == QUDA_CPU_FIELD_LOCATION) ?
-    static_cast<GaugeField*>(new cpuGaugeField(gauge_param)) :
-    static_cast<GaugeField*>(new cudaGaugeField(gauge_param));
+  // the cpu/cuda subclasses are gone, so both arms of the old switch on
+  // param->location would construct the same type: build the GaugeField directly
+  GaugeField *in = new GaugeField(gauge_param);
 
   if (in->Order() == QUDA_BQCD_GAUGE_ORDER) {
     static size_t checksum = SIZE_MAX;
@@ -610,7 +610,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
   }
 
   // if not preserving then copy the gauge field passed in
-  cudaGaugeField *precise = nullptr;
+  GaugeField *precise = nullptr;
 
   // switch the parameters for creating the mirror precise cuda gauge field
   gauge_param.create = QUDA_NULL_FIELD_CREATE;
@@ -620,7 +620,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
   gauge_param.pad = param->ga_pad;
   gauge_param.location = QUDA_CUDA_FIELD_LOCATION;
 
-  precise = new cudaGaugeField(gauge_param);
+  precise = new GaugeField(gauge_param);
 
   if (param->use_resident_gauge) {
     if(gaugePrecise == nullptr) errorQuda("No resident gauge field");
@@ -655,44 +655,44 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
   // switch the parameters for creating the mirror sloppy cuda gauge field
   gauge_param.reconstruct = param->reconstruct_sloppy;
   gauge_param.setPrecision(param->cuda_prec_sloppy, true);
-  cudaGaugeField *sloppy = nullptr;
+  GaugeField *sloppy = nullptr;
   if (param->cuda_prec == param->cuda_prec_sloppy && param->reconstruct == param->reconstruct_sloppy) {
     sloppy = precise;
   } else {
-    sloppy = new cudaGaugeField(gauge_param);
+    sloppy = new GaugeField(gauge_param);
     sloppy->copy(*precise);
   }
 
   // switch the parameters for creating the mirror preconditioner cuda gauge field
   gauge_param.reconstruct = param->reconstruct_precondition;
   gauge_param.setPrecision(param->cuda_prec_precondition, true);
-  cudaGaugeField *precondition = nullptr;
+  GaugeField *precondition = nullptr;
   if (param->cuda_prec == param->cuda_prec_precondition && param->reconstruct == param->reconstruct_precondition) {
     precondition = precise;
   } else if (param->cuda_prec_sloppy == param->cuda_prec_precondition
              && param->reconstruct_sloppy == param->reconstruct_precondition) {
     precondition = sloppy;
   } else {
-    precondition = new cudaGaugeField(gauge_param);
+    precondition = new GaugeField(gauge_param);
     precondition->copy(*precise);
   }
 
   // switch the parameters for creating the refinement cuda gauge field
   gauge_param.reconstruct = param->reconstruct_refinement_sloppy;
   gauge_param.setPrecision(param->cuda_prec_refinement_sloppy, true);
-  cudaGaugeField *refinement = nullptr;
+  GaugeField *refinement = nullptr;
   if (param->cuda_prec_sloppy == param->cuda_prec_refinement_sloppy
       && param->reconstruct_sloppy == param->reconstruct_refinement_sloppy) {
     refinement = sloppy;
   } else {
-    refinement = new cudaGaugeField(gauge_param);
+    refinement = new GaugeField(gauge_param);
     refinement->copy(*sloppy);
   }
 
   // switch the parameters for creating the eigensolver cuda gauge field
   gauge_param.reconstruct = param->reconstruct_eigensolver;
   gauge_param.setPrecision(param->cuda_prec_eigensolver, true);
-  cudaGaugeField *eigensolver = nullptr;
+  GaugeField *eigensolver = nullptr;
   if (param->cuda_prec == param->cuda_prec_eigensolver && param->reconstruct == param->reconstruct_eigensolver) {
     eigensolver = precise;
   } else if (param->cuda_prec_precondition == param->cuda_prec_eigensolver
@@ -702,14 +702,14 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
              && param->reconstruct_sloppy == param->reconstruct_eigensolver) {
     eigensolver = sloppy;
   } else {
-    eigensolver = new cudaGaugeField(gauge_param);
+    eigensolver = new GaugeField(gauge_param);
     eigensolver->copy(*precise);
   }
 
   profileGauge.TPSTOP(QUDA_PROFILE_COMPUTE);
 
   // create an extended preconditioning field
-  cudaGaugeField* extended = nullptr;
+  GaugeField* extended = nullptr;
   if (param->overlap){
     lat_dim_t R; // domain-overlap widths in different directions
     for (int i=0; i<4; ++i) R[i] = param->overlap*commDimPartitioned(i);
@@ -780,8 +780,8 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 
   // Set the specific cpu parameters and create the cpu gauge field
   GaugeFieldParam gauge_param(*param, h_gauge);
-  cpuGaugeField cpuGauge(gauge_param);
-  cudaGaugeField *cudaGauge = nullptr;
+  GaugeField cpuGauge(gauge_param);
+  GaugeField *cudaGauge = nullptr;
   switch (param->type) {
   case QUDA_WILSON_LINKS: cudaGauge = gaugePrecise; break;
   case QUDA_ASQTAD_FAT_LINKS: cudaGauge = gaugeFatPrecise; break;
@@ -792,7 +792,7 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
     gauge_param.setPrecision(param->cuda_prec, true);
     gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
     gauge_param.pad = param->ga_pad;
-    cudaGauge = new cudaGaugeField(gauge_param);
+    cudaGauge = new GaugeField(gauge_param);
     copyExtendedGauge(*cudaGauge, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION);
     break;
   default: errorQuda("Invalid gauge type");
@@ -1047,8 +1047,8 @@ void freeGaugeQuda(void)
 }
 
 // These utility functions are declared w/doxygen above
-void freeUniqueSloppyGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&sloppy, cudaGaugeField *&precondition,
-                                  cudaGaugeField *&refinement, cudaGaugeField *&eigensolver)
+void freeUniqueSloppyGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition,
+                                  GaugeField *&refinement, GaugeField *&eigensolver)
 {
   // In theory, we're checking for aliasing and freeing fields in the opposite order
   // from which they were allocated... but in any case, we're doing an all-to-all
@@ -1073,8 +1073,8 @@ void freeUniqueSloppyGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&slo
   sloppy = nullptr;
 }
 
-void freeUniqueGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&sloppy, cudaGaugeField *&precondition,
-                            cudaGaugeField *&refinement, cudaGaugeField *&eigensolver, cudaGaugeField *&extended,
+void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition,
+                            GaugeField *&refinement, GaugeField *&eigensolver, GaugeField *&extended,
                             bool preserve_precise)
 {
   freeUniqueSloppyGaugeUtility(precise, sloppy, precondition, refinement, eigensolver);
@@ -1135,7 +1135,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
     if (gauge_param.Precision() == gaugePrecise->Precision() && gauge_param.reconstruct == gaugePrecise->Reconstruct()) {
       gaugeSloppy = gaugePrecise;
     } else {
-      gaugeSloppy = new cudaGaugeField(gauge_param);
+      gaugeSloppy = new GaugeField(gauge_param);
       gaugeSloppy->copy(*gaugePrecise);
     }
 
@@ -1151,7 +1151,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
                && gauge_param.reconstruct == gaugeSloppy->Reconstruct()) {
       gaugePrecondition = gaugeSloppy;
     } else {
-      gaugePrecondition = new cudaGaugeField(gauge_param);
+      gaugePrecondition = new GaugeField(gauge_param);
       gaugePrecondition->copy(*gaugePrecise);
     }
 
@@ -1164,7 +1164,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
     if (gauge_param.Precision() == gaugeSloppy->Precision() && gauge_param.reconstruct == gaugeSloppy->Reconstruct()) {
       gaugeRefinement = gaugeSloppy;
     } else {
-      gaugeRefinement = new cudaGaugeField(gauge_param);
+      gaugeRefinement = new GaugeField(gauge_param);
       gaugeRefinement->copy(*gaugeSloppy);
     }
 
@@ -1183,7 +1183,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
                && gauge_param.reconstruct == gaugePrecondition->Reconstruct()) {
       gaugeEigensolver = gaugePrecondition;
     } else {
-      gaugeEigensolver = new cudaGaugeField(gauge_param);
+      gaugeEigensolver = new GaugeField(gauge_param);
       gaugeEigensolver->copy(*gaugePrecise);
     }
   }
@@ -1201,7 +1201,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
         && gauge_param.reconstruct == gaugeFatPrecise->Reconstruct()) {
       gaugeFatSloppy = gaugeFatPrecise;
     } else {
-      gaugeFatSloppy = new cudaGaugeField(gauge_param);
+      gaugeFatSloppy = new GaugeField(gauge_param);
       gaugeFatSloppy->copy(*gaugeFatPrecise);
     }
 
@@ -1217,7 +1217,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
                && gauge_param.reconstruct == gaugeFatSloppy->Reconstruct()) {
       gaugeFatPrecondition = gaugeFatSloppy;
     } else {
-      gaugeFatPrecondition = new cudaGaugeField(gauge_param);
+      gaugeFatPrecondition = new GaugeField(gauge_param);
       gaugeFatPrecondition->copy(*gaugeFatPrecise);
     }
 
@@ -1230,7 +1230,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
         && gauge_param.reconstruct == gaugeFatSloppy->Reconstruct()) {
       gaugeFatRefinement = gaugeFatSloppy;
     } else {
-      gaugeFatRefinement = new cudaGaugeField(gauge_param);
+      gaugeFatRefinement = new GaugeField(gauge_param);
       gaugeFatRefinement->copy(*gaugeFatSloppy);
     }
 
@@ -1249,7 +1249,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
                && gauge_param.reconstruct == gaugeFatPrecondition->Reconstruct()) {
       gaugeFatEigensolver = gaugeFatPrecondition;
     } else {
-      gaugeFatEigensolver = new cudaGaugeField(gauge_param);
+      gaugeFatEigensolver = new GaugeField(gauge_param);
       gaugeFatEigensolver->copy(*gaugeFatPrecise);
     }
   }
@@ -1268,7 +1268,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
         && gauge_param.reconstruct == gaugeLongPrecise->Reconstruct()) {
       gaugeLongSloppy = gaugeLongPrecise;
     } else {
-      gaugeLongSloppy = new cudaGaugeField(gauge_param);
+      gaugeLongSloppy = new GaugeField(gauge_param);
       gaugeLongSloppy->copy(*gaugeLongPrecise);
     }
 
@@ -1285,7 +1285,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
                && gauge_param.reconstruct == gaugeLongSloppy->Reconstruct()) {
       gaugeLongPrecondition = gaugeLongSloppy;
     } else {
-      gaugeLongPrecondition = new cudaGaugeField(gauge_param);
+      gaugeLongPrecondition = new GaugeField(gauge_param);
       gaugeLongPrecondition->copy(*gaugeLongPrecise);
     }
 
@@ -1299,7 +1299,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
         && gauge_param.reconstruct == gaugeLongSloppy->Reconstruct()) {
       gaugeLongRefinement = gaugeLongSloppy;
     } else {
-      gaugeLongRefinement = new cudaGaugeField(gauge_param);
+      gaugeLongRefinement = new GaugeField(gauge_param);
       gaugeLongRefinement->copy(*gaugeLongSloppy);
     }
 
@@ -1319,7 +1319,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
                && gauge_param.reconstruct == gaugeLongPrecondition->Reconstruct()) {
       gaugeLongEigensolver = gaugeLongPrecondition;
     } else {
-      gaugeLongEigensolver = new cudaGaugeField(gauge_param);
+      gaugeLongEigensolver = new GaugeField(gauge_param);
       gaugeLongEigensolver->copy(*gaugeLongPrecise);
     }
   }
@@ -2068,9 +2068,9 @@ void checkClover(QudaInvertParam *param) {
   if (cloverEigensolver == nullptr) errorQuda("Eigensolver clover field doesn't exist");
 }
 
-quda::cudaGaugeField *checkGauge(QudaInvertParam *param)
+quda::GaugeField *checkGauge(QudaInvertParam *param)
 {
-  quda::cudaGaugeField *cudaGauge = nullptr;
+  quda::GaugeField *cudaGauge = nullptr;
   if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
     if (gaugePrecise == nullptr) errorQuda("Precise gauge field doesn't exist");
 
@@ -2241,7 +2241,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
   checkEigParam(eig_param);
 
   // Check that the gauge field is valid
-  cudaGaugeField *cudaGauge = checkGauge(inv_param);
+  GaugeField *cudaGauge = checkGauge(inv_param);
 
   // Set all timing statistics to zero
   inv_param->secs = 0;
@@ -2386,7 +2386,7 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr
   blas_lapack::set_native(param->native_blas_lapack);
 
   checkMultigridParam(&mg_param);
-  cudaGaugeField *cudaGauge = checkGauge(param);
+  GaugeField *cudaGauge = checkGauge(param);
 
   // check MG params (needs to go somewhere else)
   if (mg_param.n_level > QUDA_MAX_MG_LEVEL)
@@ -2624,7 +2624,7 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
 
   profile.TPSTART(QUDA_PROFILE_INIT);
 
-  cudaGaugeField *cudaGauge = checkGauge(param);
+  GaugeField *cudaGauge = checkGauge(param);
   eig_param.secs   = 0;
   eig_param.gflops = 0;
 
@@ -2710,7 +2710,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   checkInvertParam(param, hp_x, hp_b);
 
   // check the gauge fields have been created
-  cudaGaugeField *cudaGauge = checkGauge(param);
+  GaugeField *cudaGauge = checkGauge(param);
 
   // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
   // solve_type and solution_type, rather than in separate members of QudaInvertParam.  We're stuck with it
@@ -3292,15 +3292,15 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
 
     if (!is_staggered) {
       gf_param->create = QUDA_NULL_FIELD_CREATE;
-      collected_gauge = new quda::cpuGaugeField(*gf_param);
+      collected_gauge = new quda::GaugeField(*gf_param);
       std::vector<quda::GaugeField *> v_g(1);
       v_g[0] = in;
       quda::split_field(*collected_gauge, v_g, split_key);
     } else {
       milc_fatlink_param->create = QUDA_NULL_FIELD_CREATE;
       milc_longlink_param->create = QUDA_NULL_FIELD_CREATE;
-      collected_milc_fatlink_field = new quda::cpuGaugeField(*milc_fatlink_param);
-      collected_milc_longlink_field = new quda::cpuGaugeField(*milc_longlink_param);
+      collected_milc_fatlink_field = new quda::GaugeField(*milc_fatlink_param);
+      collected_milc_longlink_field = new quda::GaugeField(*milc_longlink_param);
       std::vector<quda::GaugeField *> v_g(1);
       v_g[0] = milc_fatlink_field;
       quda::split_field(*collected_milc_fatlink_field, v_g, split_key);
@@ -3835,27 +3835,27 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
 
   GaugeFieldParam gParam(*param, fatlink, QUDA_GENERAL_LINKS);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  cpuGaugeField cpuFatLink(gParam);   // create the host fatlink
+  GaugeField cpuFatLink(gParam);   // create the host fatlink
   gParam.gauge = longlink;
-  cpuGaugeField cpuLongLink(gParam);  // create the host longlink
+  GaugeField cpuLongLink(gParam);  // create the host longlink
   gParam.gauge = ulink;
-  cpuGaugeField cpuUnitarizedLink(gParam);
+  GaugeField cpuUnitarizedLink(gParam);
   gParam.link_type = param->type;
   gParam.gauge = inlink;
-  cpuGaugeField cpuInLink(gParam);    // create the host sitelink
+  GaugeField cpuInLink(gParam);    // create the host sitelink
 
   // create the device fields
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.reconstruct = param->reconstruct;
   gParam.setPrecision(param->cuda_prec, true);
   gParam.create = QUDA_NULL_FIELD_CREATE;
-  cudaGaugeField *cudaInLink = new cudaGaugeField(gParam);
+  GaugeField *cudaInLink = new GaugeField(gParam);
   profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
 
   profileFatLink.TPSTART(QUDA_PROFILE_H2D);
   cudaInLink->copy(cpuInLink);
   profileFatLink.TPSTOP(QUDA_PROFILE_H2D);
-  cudaGaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink);
+  GaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink);
 
   profileFatLink.TPSTART(QUDA_PROFILE_FREE);
   delete cudaInLink;
@@ -3869,7 +3869,7 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
 
   if (longlink) {
     profileFatLink.TPSTART(QUDA_PROFILE_INIT);
-    cudaGaugeField *cudaLongLink = new cudaGaugeField(gParam);
+    GaugeField *cudaLongLink = new GaugeField(gParam);
     profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
 
     profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
@@ -3886,7 +3886,7 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
   }
 
   profileFatLink.TPSTART(QUDA_PROFILE_INIT);
-  cudaGaugeField *cudaFatLink = new cudaGaugeField(gParam);
+  GaugeField *cudaFatLink = new GaugeField(gParam);
   profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
 
   profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
@@ -3913,7 +3913,7 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
     quda::setUnitarizeLinksConstants(unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error,
                                      svd_abs_error);
 
-    cudaGaugeField *cudaUnitarizedLink = new cudaGaugeField(gParam);
+    GaugeField *cudaUnitarizedLink = new GaugeField(gParam);
 
     profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
     *num_failures_h = 0;
@@ -3947,21 +3947,21 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
 
   GaugeFieldParam gParam(*param, inlink, QUDA_GENERAL_LINKS);
   gParam.gauge     = twolink;
-  cpuGaugeField cpuTwoLink(gParam);  // create the host twolink
+  GaugeField cpuTwoLink(gParam);  // create the host twolink
   profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
 
-  cudaGaugeField *cudaInLinkEx = nullptr;
+  GaugeField *cudaInLinkEx = nullptr;
 
   if(inlink) {
     gParam.link_type = param->type;
     gParam.gauge     = inlink;
-    cpuGaugeField cpuInLink(gParam);    // create the host sitelink
+    GaugeField cpuInLink(gParam);    // create the host sitelink
 
     // create the device fields
     gParam.reconstruct = param->reconstruct;
     gParam.setPrecision(param->cuda_prec, true);
     gParam.create = QUDA_NULL_FIELD_CREATE;
-    cudaGaugeField *cudaInLink = new cudaGaugeField(gParam);
+    GaugeField *cudaInLink = new GaugeField(gParam);
     profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
 
     profileGaussianSmear.TPSTART(QUDA_PROFILE_H2D);
@@ -3991,7 +3991,7 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
   profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT);
 
   freeUniqueGaugeQuda(QUDA_SMEARED_LINKS);
-  gaugeSmeared = new cudaGaugeField(gsParam);
+  gaugeSmeared = new GaugeField(gsParam);
 
   
   profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
@@ -4028,9 +4028,9 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
   gParam.location = QUDA_CPU_FIELD_LOCATION;
   gParam.site_offset = qudaGaugeParam->gauge_offset;
   gParam.site_size = qudaGaugeParam->site_size;
-  cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : nullptr;
+  GaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new GaugeField(gParam) : nullptr;
 
-  cudaGaugeField* cudaSiteLink = nullptr;
+  GaugeField* cudaSiteLink = nullptr;
 
   if (qudaGaugeParam->use_resident_gauge) {
     if (!gaugePrecise) errorQuda("No resident gauge field to use");
@@ -4041,7 +4041,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
     gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
     gParam.location = QUDA_CUDA_FIELD_LOCATION;
 
-    cudaSiteLink = new cudaGaugeField(gParam);
+    cudaSiteLink = new GaugeField(gParam);
     profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
 
     profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
@@ -4060,9 +4060,9 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
 
   gParamMom.site_offset = qudaGaugeParam->mom_offset;
   gParamMom.site_size = qudaGaugeParam->site_size;
-  cpuGaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new cpuGaugeField(gParamMom) : nullptr;
+  GaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new GaugeField(gParamMom) : nullptr;
 
-  cudaGaugeField* cudaMom = nullptr;
+  GaugeField* cudaMom = nullptr;
   if (qudaGaugeParam->use_resident_mom) {
     if (!momResident) errorQuda("No resident momentum field to use");
     cudaMom = momResident;
@@ -4075,7 +4075,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
     gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
     gParamMom.setPrecision(qudaGaugeParam->cuda_prec, true);
     gParamMom.create = QUDA_ZERO_FIELD_CREATE;
-    cudaMom = new cudaGaugeField(gParamMom);
+    cudaMom = new GaugeField(gParamMom);
     profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
     if (!qudaGaugeParam->overwrite_mom) {
       profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
@@ -4084,7 +4084,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
     }
   }
 
-  cudaGaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugeForce);
+  GaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugeForce);
   // apply / remove phase as appropriate
   if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase();
 
@@ -4163,9 +4163,9 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
   gParam.location = QUDA_CPU_FIELD_LOCATION;
   gParam.site_offset = qudaGaugeParam->gauge_offset;
   gParam.site_size = qudaGaugeParam->site_size;
-  cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : nullptr;
+  GaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new GaugeField(gParam) : nullptr;
 
-  cudaGaugeField *cudaSiteLink = nullptr;
+  GaugeField *cudaSiteLink = nullptr;
 
   if (qudaGaugeParam->use_resident_gauge) {
     if (!gaugePrecise) errorQuda("No resident gauge field to use");
@@ -4176,7 +4176,7 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
     gParam.reconstruct = qudaGaugeParam->reconstruct;
     gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
 
-    cudaSiteLink = new cudaGaugeField(gParam);
+    cudaSiteLink = new GaugeField(gParam);
     profileGaugePath.TPSTOP(QUDA_PROFILE_INIT);
 
     profileGaugePath.TPSTART(QUDA_PROFILE_H2D);
@@ -4190,12 +4190,12 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
   gParamOut.location = QUDA_CPU_FIELD_LOCATION;
   gParamOut.site_offset = qudaGaugeParam->gauge_offset;
   gParamOut.site_size = qudaGaugeParam->site_size;
-  cpuGaugeField *cpuOut = new cpuGaugeField(gParamOut);
+  GaugeField *cpuOut = new GaugeField(gParamOut);
   gParamOut.location = QUDA_CUDA_FIELD_LOCATION;
   gParamOut.create = qudaGaugeParam->overwrite_gauge ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
   gParamOut.reconstruct = QUDA_RECONSTRUCT_NO;
   gParamOut.setPrecision(qudaGaugeParam->cuda_prec, true);
-  cudaGaugeField *cudaOut = new cudaGaugeField(gParamOut);
+  GaugeField *cudaOut = new GaugeField(gParamOut);
   profileGaugePath.TPSTOP(QUDA_PROFILE_INIT);
   if (!qudaGaugeParam->overwrite_gauge) {
     profileGaugePath.TPSTART(QUDA_PROFILE_H2D);
@@ -4203,7 +4203,7 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
     profileGaugePath.TPSTOP(QUDA_PROFILE_H2D);
   }
 
-  cudaGaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugePath);
+  GaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugePath);
   // apply / remove phase as appropriate
   if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase();
 
@@ -4265,7 +4265,7 @@ void momResidentQuda(void *mom, QudaGaugeParam *param)
   gParamMom.site_offset = param->mom_offset;
   gParamMom.site_size = param->site_size;
 
-  cpuGaugeField cpuMom(gParamMom);
+  GaugeField cpuMom(gParamMom);
 
   if (param->make_resident_mom && !param->return_result_mom) {
     if (momResident) delete momResident;
@@ -4275,7 +4275,7 @@ void momResidentQuda(void *mom, QudaGaugeParam *param)
     gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
     gParamMom.setPrecision(param->cuda_prec, true);
     gParamMom.create = QUDA_ZERO_FIELD_CREATE;
-    momResident = new cudaGaugeField(gParamMom);
+    momResident = new GaugeField(gParamMom);
   } else if (param->return_result_mom && !param->make_resident_mom) {
     if (!momResident) errorQuda("No resident momentum to return");
   } else {
@@ -4314,7 +4314,7 @@ void createCloverQuda(QudaInvertParam* invertParam)
   // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general)
   lat_dim_t R;
   for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d));
-  cudaGaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon);
+  GaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon);
 
   profileClover.TPSTART(QUDA_PROFILE_INIT);
 
@@ -4333,7 +4333,7 @@ void createCloverQuda(QudaInvertParam* invertParam)
   tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
   tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-  cudaGaugeField Fmunu(tensorParam);
+  GaugeField Fmunu(tensorParam);
   profileClover.TPSTOP(QUDA_PROFILE_INIT);
   profileClover.TPSTART(QUDA_PROFILE_COMPUTE);
   computeFmunu(Fmunu, *ex);
@@ -4354,12 +4354,12 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
   if (geometry != QUDA_SCALAR_GEOMETRY && geometry != QUDA_VECTOR_GEOMETRY)
     errorQuda("Only scalar and vector geometries are supported\n");
 
-  cpuGaugeField *cpuGauge = nullptr;
-  if (gauge) cpuGauge = new cpuGaugeField(gParam);
+  GaugeField *cpuGauge = nullptr;
+  if (gauge) cpuGauge = new GaugeField(gParam);
 
   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParam.create = QUDA_ZERO_FIELD_CREATE;
-  auto* cudaGauge = new cudaGaugeField(gParam);
+  auto* cudaGauge = new GaugeField(gParam);
 
   if (gauge) {
     cudaGauge->copy(*cpuGauge);
@@ -4371,18 +4371,18 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
 
 void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
 {
-  auto* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);
+  auto* cudaGauge = reinterpret_cast<GaugeField*>(inGauge); // handle presumably from createGaugeFieldQuda; TODO confirm
 
   GaugeFieldParam gParam(*param, gauge, QUDA_GENERAL_LINKS);
   gParam.geometry = cudaGauge->Geometry();
 
-  cpuGaugeField cpuGauge(gParam);
+  GaugeField cpuGauge(gParam); // described by gParam (built from `gauge`), geometry taken from the source field
   cpuGauge.copy(*cudaGauge);
 }
 
 void destroyGaugeFieldQuda(void *gauge)
 {
-  auto* g = reinterpret_cast<cudaGaugeField*>(gauge);
+  auto* g = reinterpret_cast<GaugeField*>(gauge); // caller's handle is assumed to point at a GaugeField; TODO confirm
   delete g;
 }
 
@@ -4398,7 +4398,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   gParam.location = QUDA_CPU_FIELD_LOCATION;
   gParam.reconstruct = gauge_param->reconstruct;
   gParam.t_boundary = QUDA_PERIODIC_T;
-  cpuGaugeField cpuMom(gParam);
+  GaugeField cpuMom(gParam);
 
   // create the device momentum field
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
@@ -4406,13 +4406,13 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   gParam.create = QUDA_ZERO_FIELD_CREATE; // FIXME
   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
-  cudaGaugeField *cudaMom = !gauge_param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr;
+  GaugeField *cudaMom = !gauge_param->use_resident_mom ? new GaugeField(gParam) : nullptr;
 
   // create temporary field for quark-field outer product
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.create = QUDA_ZERO_FIELD_CREATE;
-  cudaGaugeField cudaForce(gParam);
+  GaugeField cudaForce(gParam);
   GaugeField *cudaForce_[2] = {&cudaForce};
 
   ColorSpinorParam qParam;
@@ -4606,9 +4606,9 @@ void computeHISQForceQuda(void* const milc_momentum,
   oParam.setPrecision(gParam->cpu_prec, true);
   oParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
 
-  cudaGaugeField *stapleOprod = new cudaGaugeField(oParam);
-  cudaGaugeField *oneLinkOprod = new cudaGaugeField(oParam);
-  cudaGaugeField *naikOprod = new cudaGaugeField(oParam);
+  GaugeField *stapleOprod = new GaugeField(oParam);
+  GaugeField *oneLinkOprod = new GaugeField(oParam);
+  GaugeField *naikOprod = new GaugeField(oParam);
 
   double act_path_coeff[6] = {0, 1, level2_coeff[2], level2_coeff[3], level2_coeff[4], level2_coeff[5]};
   // You have to look at the MILC routine to understand the following
@@ -4712,11 +4712,11 @@ void computeHISQForceQuda(void* const milc_momentum,
     oParam.r[dir] = R[dir];
   }
 
-  cudaGaugeField *cudaInForce = new cudaGaugeField(oParam);
+  GaugeField *cudaInForce = new GaugeField(oParam);
   copyExtendedGauge(*cudaInForce, *stapleOprod, QUDA_CUDA_FIELD_LOCATION);
   delete stapleOprod;
 
-  cudaGaugeField *cudaOutForce = new cudaGaugeField(oParam);
+  GaugeField *cudaOutForce = new GaugeField(oParam);
   copyExtendedGauge(*cudaOutForce, *oneLinkOprod, QUDA_CUDA_FIELD_LOCATION);
   delete oneLinkOprod;
 
@@ -4729,7 +4729,7 @@ void computeHISQForceQuda(void* const milc_momentum,
   param.reconstruct = QUDA_RECONSTRUCT_10;
   param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   param.gauge = milc_momentum;
-  cpuGaugeField *cpuMom = (!gParam->use_resident_mom) ? new cpuGaugeField(param) : nullptr;
+  GaugeField *cpuMom = (!gParam->use_resident_mom) ? new GaugeField(param) : nullptr;
 
   param.location = QUDA_CUDA_FIELD_LOCATION;
   param.create = QUDA_ZERO_FIELD_CREATE;
@@ -4750,15 +4750,15 @@ void computeHISQForceQuda(void* const milc_momentum,
   wParam.link_type = QUDA_GENERAL_LINKS;
   wParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   wParam.gauge = (void *)w_link;
-  cpuGaugeField cpuWLink(wParam);
+  GaugeField cpuWLink(wParam);
 
   GaugeFieldParam vParam(wParam);
   vParam.gauge = (void *)v_link;
-  cpuGaugeField cpuVLink(vParam);
+  GaugeField cpuVLink(vParam);
 
   GaugeFieldParam uParam(vParam);
   uParam.gauge = (void *)u_link;
-  cpuGaugeField cpuULink(uParam);
+  GaugeField cpuULink(uParam);
 
   // Load the W field, which contains U(3) matrices, to the device
   gParam_field.ga_pad = 3 * pad_size;
@@ -4773,7 +4773,7 @@ void computeHISQForceQuda(void* const milc_momentum,
   wParam.create = QUDA_NULL_FIELD_CREATE;
   wParam.setPrecision(gParam->cpu_prec, true);
 
-  cudaGaugeField *cudaWLink = new cudaGaugeField(wParam);
+  GaugeField *cudaWLink = new GaugeField(wParam);
   profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
   profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
@@ -4821,7 +4821,7 @@ void computeHISQForceQuda(void* const milc_momentum,
   vParam.setPrecision(gParam->cpu_prec, true);
   vParam.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
   vParam.pad = 3 * pad_size;
-  cudaGaugeField *cudaVLink = new cudaGaugeField(vParam);
+  GaugeField *cudaVLink = new GaugeField(vParam);
   profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
   profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
@@ -4856,7 +4856,7 @@ void computeHISQForceQuda(void* const milc_momentum,
   uParam.setPrecision(gParam->cpu_prec, true);
   uParam.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
   uParam.pad = 3 * pad_size;
-  cudaGaugeField *cudaULink = new cudaGaugeField(uParam);
+  GaugeField *cudaULink = new GaugeField(uParam);
   profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
   profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
@@ -4874,7 +4874,7 @@ void computeHISQForceQuda(void* const milc_momentum,
   delete cudaInForce;
   profileHISQForce.TPSTOP(QUDA_PROFILE_FREE);
   profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
-  cudaGaugeField* cudaMom = new cudaGaugeField(momParam);
+  GaugeField* cudaMom = new GaugeField(momParam);
   profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
   profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
@@ -4930,20 +4930,20 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   fParam.location = QUDA_CPU_FIELD_LOCATION;
   fParam.reconstruct = QUDA_RECONSTRUCT_10;
   fParam.order = gauge_param->gauge_order;
-  cpuGaugeField cpuMom(fParam);
+  GaugeField cpuMom(fParam);
 
   // create the device momentum field
   fParam.location = QUDA_CUDA_FIELD_LOCATION;
   fParam.create = QUDA_ZERO_FIELD_CREATE;
   fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
-  cudaGaugeField cudaMom(fParam);
+  GaugeField cudaMom(fParam);
 
   // create the device force field
   fParam.link_type = QUDA_GENERAL_LINKS;
   fParam.create = QUDA_ZERO_FIELD_CREATE;
   fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   fParam.reconstruct = QUDA_RECONSTRUCT_NO;
-  cudaGaugeField cudaForce(fParam);
+  GaugeField cudaForce(fParam);
 
   ColorSpinorParam qParam;
   qParam.location = QUDA_CUDA_FIELD_LOCATION;
@@ -4988,11 +4988,11 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
 		solutionResident.size(), nvector);
   }
 
-  cudaGaugeField &gaugeEx = *extendedGaugeResident;
+  GaugeField &gaugeEx = *extendedGaugeResident;
 
   // create oprod and trace fields
   fParam.geometry = QUDA_TENSOR_GEOMETRY;
-  cudaGaugeField oprod(fParam);
+  GaugeField oprod(fParam);
 
   profileCloverForce.TPSTOP(QUDA_PROFILE_INIT);
   profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);
@@ -5040,11 +5040,11 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   computeCloverForce(cudaForce, *gaugePrecise, quarkX, quarkP, force_coeff);
 
   // In double precision the clover derivative is faster with no reconstruct
-  cudaGaugeField *u = &gaugeEx;
+  GaugeField *u = &gaugeEx;
   if (gaugeEx.Reconstruct() == QUDA_RECONSTRUCT_12 && gaugeEx.Precision() == QUDA_DOUBLE_PRECISION) {
     GaugeFieldParam param(gaugeEx);
     param.reconstruct = QUDA_RECONSTRUCT_NO;
-    u = new cudaGaugeField(param);
+    u = new GaugeField(param);
     u -> copy(gaugeEx);
   }
 
@@ -5060,7 +5060,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
 
   computeCloverSigmaOprod(oprod, quarkX, quarkP, ferm_epsilon);
 
-  cudaGaugeField *oprodEx = createExtendedGauge(oprod, R, profileCloverForce);
+  GaugeField *oprodEx = createExtendedGauge(oprod, R, profileCloverForce);
 
   profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);
 
@@ -5112,7 +5112,7 @@ void updateGaugeFieldQuda(void* gauge,
   gParam.site_offset = param->gauge_offset;
   gParam.site_size = param->site_size;
   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
-  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;
+  GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr;
 
   GaugeFieldParam gParamMom(*param, momentum);
   gParamMom.reconstruct = (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ?
@@ -5120,7 +5120,7 @@ void updateGaugeFieldQuda(void* gauge,
   gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
   gParamMom.site_offset = param->mom_offset;
   gParamMom.site_size = param->site_size;
-  cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParamMom) : nullptr;
+  GaugeField *cpuMom = !param->use_resident_mom ? new GaugeField(gParamMom) : nullptr;
 
   // create the device fields
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
@@ -5130,12 +5130,12 @@ void updateGaugeFieldQuda(void* gauge,
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.pad = 0;
-  cudaGaugeField *cudaMom = !param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr;
+  GaugeField *cudaMom = !param->use_resident_mom ? new GaugeField(gParam) : nullptr;
 
   gParam.link_type = QUDA_SU3_LINKS;
   gParam.reconstruct = param->reconstruct;
-  cudaGaugeField *cudaInGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
-  auto *cudaOutGauge = new cudaGaugeField(gParam);
+  GaugeField *cudaInGauge = !param->use_resident_gauge ? new GaugeField(gParam) : nullptr;
+  auto *cudaOutGauge = new GaugeField(gParam);
 
   profileGaugeUpdate.TPSTOP(QUDA_PROFILE_INIT);
 
@@ -5207,14 +5207,14 @@ void updateGaugeFieldQuda(void* gauge,
    gParam.site_offset = param->gauge_offset;
    gParam.site_size = param->site_size;
    bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
-   cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;
+   GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr;
 
    // create the device fields
    gParam.location = QUDA_CUDA_FIELD_LOCATION;
    gParam.create = QUDA_NULL_FIELD_CREATE;
    gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
    gParam.reconstruct = param->reconstruct;
-   cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
+   GaugeField *cudaGauge = !param->use_resident_gauge ? new GaugeField(gParam) : nullptr;
    profileProject.TPSTOP(QUDA_PROFILE_INIT);
 
    if (param->use_resident_gauge) {
@@ -5270,14 +5270,14 @@ void updateGaugeFieldQuda(void* gauge,
    GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS);
    bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
    gParam.location = QUDA_CPU_FIELD_LOCATION;
-   cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;
+   GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr;
 
    // create the device fields
    gParam.location = QUDA_CUDA_FIELD_LOCATION;
    gParam.create = QUDA_NULL_FIELD_CREATE;
    gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
    gParam.reconstruct = param->reconstruct;
-   cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
+   GaugeField *cudaGauge = !param->use_resident_gauge ? new GaugeField(gParam) : nullptr;
    profilePhase.TPSTOP(QUDA_PROFILE_INIT);
 
    if (param->use_resident_gauge) {
@@ -5334,7 +5334,7 @@ double momActionQuda(void* momentum, QudaGaugeParam* param)
   gParam.site_offset = param->mom_offset;
   gParam.site_size = param->site_size;
 
-  cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParam) : nullptr;
+  GaugeField *cpuMom = !param->use_resident_mom ? new GaugeField(gParam) : nullptr;
 
   // create the device fields
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
@@ -5342,7 +5342,7 @@ double momActionQuda(void* momentum, QudaGaugeParam* param)
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
   gParam.setPrecision(param->cuda_prec, true);
 
-  cudaGaugeField *cudaMom = !param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr;
+  GaugeField *cudaMom = !param->use_resident_mom ? new GaugeField(gParam) : nullptr;
 
   profileMomAction.TPSTOP(QUDA_PROFILE_INIT);
 
@@ -5384,7 +5384,7 @@ void gaussGaugeQuda(unsigned long long seed, double sigma)
 
   if (!gaugePrecise) errorQuda("Cannot generate Gauss GaugeField as there is no resident gauge field");
 
-  cudaGaugeField *data = gaugePrecise;
+  GaugeField *data = gaugePrecise;
 
   profileGauss.TPSTART(QUDA_PROFILE_COMPUTE);
   quda::gaugeGauss(*data, seed, sigma);
@@ -5404,7 +5404,7 @@ void gaussMomQuda(unsigned long long seed, double sigma)
 
   if (!momResident) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field");
 
-  cudaGaugeField *data = momResident;
+  GaugeField *data = momResident;
 
   profileGauss.TPSTART(QUDA_PROFILE_COMPUTE);
   quda::gaugeGauss(*data, seed, sigma);
@@ -5422,7 +5422,7 @@ void plaqQuda(double plaq[3])
 
   if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field");
 
-  cudaGaugeField *data = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq);
+  GaugeField *data = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq);
   extendedGaugeResident = data;
 
   profilePlaq.TPSTART(QUDA_PROFILE_COMPUTE);
@@ -5493,13 +5493,13 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param,
   pushVerbosity(inv_param->verbosity);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
 
-  cudaGaugeField *precise = nullptr;
+  GaugeField *precise = nullptr;
 
   if (gaugeSmeared != nullptr) {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Wuppertal smearing done with gaugeSmeared\n");
     GaugeFieldParam gParam(*gaugePrecise);
     gParam.create = QUDA_NULL_FIELD_CREATE;
-    precise = new cudaGaugeField(gParam);
+    precise = new GaugeField(gParam);
     copyExtendedGauge(*precise, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION);
     precise->exchangeGhost();
   } else {
@@ -5586,9 +5586,9 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
     gParam.nFace = 3; // FIXME: need a QudaLinkType with nFace=2.
     gParam.pad = gParam.pad*gParam.nFace;
     //
-    gaugeSmeared = new cudaGaugeField(gParam);
+    gaugeSmeared = new GaugeField(gParam);
     
-    cudaGaugeField *two_link_ext = createExtendedGauge(*gaugePrecise, R, profileGauge);//aux field
+    GaugeField *two_link_ext = createExtendedGauge(*gaugePrecise, R, profileGauge);//aux field
     
     computeTwoLink(*gaugeSmeared, *two_link_ext);
     
@@ -5714,7 +5714,7 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
 
   GaugeFieldParam gParam(*gaugeSmeared);
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-  auto *cudaGaugeTemp = new cudaGaugeField(gParam);
+  auto *cudaGaugeTemp = new GaugeField(gParam);
 
   int measurement_n = 0; // The nth measurement to take
   gaugeObservablesQuda(&obs_param[measurement_n]);
@@ -5820,14 +5820,14 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u
   gParam.location = QUDA_CPU_FIELD_LOCATION;
   gParam.site_offset = param->gauge_offset;
   gParam.site_size = param->site_size;
-  auto *cpuGauge = new cpuGaugeField(gParam);
+  auto *cpuGauge = new GaugeField(gParam);
 
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.link_type = param->type;
   gParam.reconstruct = param->reconstruct;
   gParam.setPrecision(gParam.Precision(), true);
-  auto *cudaInGauge = new cudaGaugeField(gParam);
+  auto *cudaInGauge = new GaugeField(gParam);
 
   GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_INIT);
   GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_H2D);
@@ -5836,7 +5836,7 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u
 
   GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_H2D);
 
-  cudaGaugeField *cudaInGaugeEx = nullptr;
+  GaugeField *cudaInGaugeEx = nullptr;
 
   if (comm_size() == 1) {
     // perform the update
@@ -5898,14 +5898,14 @@ int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir,  const
   gParam.location = QUDA_CPU_FIELD_LOCATION;
   gParam.site_offset = param->gauge_offset;
   gParam.site_size = param->site_size;
-  auto *cpuGauge = new cpuGaugeField(gParam);
+  auto *cpuGauge = new GaugeField(gParam);
 
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.link_type = param->type;
   gParam.reconstruct = param->reconstruct;
   gParam.setPrecision(gParam.Precision(), true);
-  auto *cudaInGauge = new cudaGaugeField(gParam);
+  auto *cudaInGauge = new GaugeField(gParam);
 
   GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_INIT);
 
@@ -6000,7 +6000,7 @@ void gaugeObservablesQuda(QudaGaugeObservableParam *param)
 
   if (!gaugePrecise) errorQuda("Cannot compute Polyakov loop as there is no resident gauge field");
 
-  cudaGaugeField *gauge = nullptr;
+  GaugeField *gauge = nullptr;
   if (!gaugeSmeared) {
     if (!extendedGaugeResident) extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGaugeObs);
     gauge = extendedGaugeResident;
diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp
index 8becce7c7b..8b4b123776 100644
--- a/lib/lattice_field.cpp
+++ b/lib/lattice_field.cpp
@@ -613,7 +613,7 @@ namespace quda {
       const ColorSpinorField &csField = static_cast<const ColorSpinorField&>(*this);
       if (csField.FieldOrder() == 2 || csField.FieldOrder() == 4)
 	return static_cast<int>(csField.FieldOrder());
-    } else if (typeid(*this) == typeid(const cudaGaugeField)) {
+    } else if (typeid(*this) == typeid(const GaugeField)) {
       const GaugeField &gField = static_cast<const GaugeField&>(*this);
       if (gField.Order() == 2 || gField.Order() == 4)
 	return static_cast<int>(gField.Order());
diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp
index 781bdb4461..8f33083574 100644
--- a/lib/milc_interface.cpp
+++ b/lib/milc_interface.cpp
@@ -2573,7 +2573,7 @@ void* qudaCreateGaugeField(void* gauge, int geometry, int precision)
 void qudaSaveGaugeField(void* gauge, void* inGauge)
 {
   qudamilc_called<true>(__func__);
-  cudaGaugeField* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);
+  auto cudaGauge = reinterpret_cast<GaugeField*>(inGauge);
   QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim, cudaGauge->Precision(), QUDA_GENERAL_LINKS);
   saveGaugeFieldQuda(gauge, inGauge, &qudaGaugeParam);
   qudamilc_called<false>(__func__);
diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp
index b358c83c55..929849fdec 100644
--- a/lib/multigrid.cpp
+++ b/lib/multigrid.cpp
@@ -245,9 +245,9 @@ namespace quda
     popLevel();
   }
 
-  void MG::resetStaggeredKD(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in,
-                            cudaGaugeField *gauge_sloppy_in, cudaGaugeField *fat_gauge_sloppy_in,
-                            cudaGaugeField *long_gauge_sloppy_in, double mass)
+  void MG::resetStaggeredKD(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in,
+                            GaugeField *gauge_sloppy_in, GaugeField *fat_gauge_sloppy_in,
+                            GaugeField *long_gauge_sloppy_in, double mass)
   {
     if (param.level != 0) errorQuda("The staggered KD operator can only be updated from level 0");
 
@@ -509,8 +509,8 @@ namespace quda
     bool is_coarse_naive_staggered = is_naive_staggered
       || (is_improved_staggered && param.mg_global.transfer_type[param.level] == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG);
 
-    cudaGaugeField *fine_gauge = diracSmoother->getStaggeredShortLinkField();
-    cudaGaugeField *sloppy_gauge = mixed_precision_setup ? diracSmootherSloppy->getStaggeredShortLinkField() : fine_gauge;
+    auto fine_gauge = diracSmoother->getStaggeredShortLinkField();
+    auto sloppy_gauge = mixed_precision_setup ? diracSmootherSloppy->getStaggeredShortLinkField() : fine_gauge;
 
     xInvKD = AllocateAndBuildStaggeredKahlerDiracInverse(
       *fine_gauge, diracSmoother->Mass(), param.mg_global.staggered_kd_dagger_approximation == QUDA_BOOLEAN_TRUE);
@@ -523,7 +523,7 @@ namespace quda
       // true is to force FLOAT2
       xinv_param.setPrecision(param.mg_global.invert_param->cuda_prec_precondition, true);
 
-      xInvKD_sloppy = std::shared_ptr<GaugeField>(reinterpret_cast<GaugeField *>(new cudaGaugeField(xinv_param)));
+      xInvKD_sloppy = std::shared_ptr<GaugeField>(reinterpret_cast<GaugeField *>(new GaugeField(xinv_param)));
       xInvKD_sloppy->copy(*xInvKD);
 
       ColorSpinorParam sloppy_tmp_param(*tmp_coarse);
@@ -544,7 +544,7 @@ namespace quda
     diracParamKD.mu_factor = 1.0;          // doesn't matter
     diracParamKD.dagger = QUDA_DAG_NO;
     diracParamKD.matpcType = QUDA_MATPC_EVEN_EVEN; // We can use this to track left vs right block jacobi in the future
-    diracParamKD.gauge = const_cast<cudaGaugeField *>(fine_gauge);
+    diracParamKD.gauge = fine_gauge;
     diracParamKD.xInvKD = xInvKD.get(); // FIXME: pulling a raw unmanaged pointer out of a unique_ptr...
     diracParamKD.dirac
       = const_cast<Dirac *>(diracSmoother); // used to determine if the outer solve is preconditioned or not
diff --git a/lib/staggered_coarse_op.in.cpp b/lib/staggered_coarse_op.in.cpp
index 9560f79b58..bab3e1ffba 100644
--- a/lib/staggered_coarse_op.in.cpp
+++ b/lib/staggered_coarse_op.in.cpp
@@ -7,8 +7,8 @@ namespace quda
   };
 
   template <int fineColor, int coarseColor, int... N>
-  void StaggeredCoarseOp2(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
-                          const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
+  void StaggeredCoarseOp2(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge,
+                          const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
                           QudaDiracType dirac, QudaMatPCType matpc, IntList<coarseColor, N...>)
   {
     if (Y.Ncolor() / 2 == coarseColor) {
@@ -24,8 +24,8 @@ namespace quda
   }
 
   template <int fineColor, int... N>
-  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
-                         const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
+  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge,
+                         const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
                          QudaDiracType dirac, QudaMatPCType matpc, IntList<fineColor, N...>)
   {
     if (gauge.Ncolor() == fineColor) {
@@ -43,8 +43,8 @@ namespace quda
     }
   }
 
-  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
-                         const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
+  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge,
+                         const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
                          QudaDiracType dirac, QudaMatPCType matpc)
   {
     if constexpr (is_enabled_spin(1) && is_enabled_multigrid()) {
diff --git a/lib/staggered_coarse_op.in.cu b/lib/staggered_coarse_op.in.cu
index 103b242655..3a03467d9a 100644
--- a/lib/staggered_coarse_op.in.cu
+++ b/lib/staggered_coarse_op.in.cu
@@ -306,8 +306,8 @@ namespace quda {
   constexpr int coarseColor = @QUDA_MULTIGRID_NVEC@;
 
   template <>
-  void StaggeredCoarseOp<fineColor, coarseColor>(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
-                                                 const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass,
+  void StaggeredCoarseOp<fineColor, coarseColor>(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge,
+                                                 const GaugeField &longGauge, const GaugeField &XinvKD, double mass,
                                                  bool allow_truncation, QudaDiracType dirac, QudaMatPCType matpc)
   {
     QudaPrecision precision = checkPrecision(T.Vectors(X.Location()), X, Y);
@@ -351,11 +351,11 @@ namespace quda {
       gf_param.nFace = 1;
       gf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
 
-      tmp_U = std::make_unique<cpuGaugeField>(gf_param);
+      tmp_U = std::make_unique<GaugeField>(gf_param);
       need_tmp_U = true;
 
       //Copy the cuda gauge field to the cpu
-      gauge.saveCPUField(reinterpret_cast<cpuGaugeField&>(*tmp_U));
+      tmp_U.get()->copy(gauge);
 
             // Create either a real or a dummy L field
       GaugeFieldParam lgf_param(longGauge.X(), precision, QUDA_RECONSTRUCT_NO, pad, longGauge.Geometry());
@@ -373,12 +373,12 @@ namespace quda {
       lgf_param.nFace = 3;
       lgf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
 
-      tmp_L = std::make_unique<cpuGaugeField>(lgf_param);
+      tmp_L = std::make_unique<GaugeField>(lgf_param);
       need_tmp_L = true;
 
       //Copy the cuda gauge field to the cpu
       if (dirac == QUDA_ASQTAD_DIRAC || dirac == QUDA_ASQTADPC_DIRAC || dirac == QUDA_ASQTADKD_DIRAC)
-        longGauge.saveCPUField(reinterpret_cast<cpuGaugeField&>(*tmp_L));
+        tmp_L.get()->copy(longGauge);
 
       // Create either a real or a dummy Xinv field
       GaugeFieldParam xgf_param(XinvKD.X(), precision, QUDA_RECONSTRUCT_NO, pad, XinvKD.Geometry());
@@ -400,7 +400,7 @@ namespace quda {
       xgf_param.nFace = 0;
       xgf_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
 
-      tmp_Xinv = std::make_unique<cpuGaugeField>(xgf_param);
+      tmp_Xinv = std::make_unique<GaugeField>(xgf_param);
       need_tmp_Xinv = true;
 
       //Copy the cuda gauge field to the cpu
@@ -419,7 +419,7 @@ namespace quda {
         lgf_param.order = QUDA_FLOAT2_GAUGE_ORDER;
         lgf_param.setPrecision(lgf_param.Precision());
         lgf_param.create = QUDA_NULL_FIELD_CREATE;
-        tmp_L = std::make_unique<cudaGaugeField>(lgf_param);
+        tmp_L = std::make_unique<GaugeField>(lgf_param);
         need_tmp_L = true;
       } else if ((dirac == QUDA_ASQTAD_DIRAC || dirac == QUDA_ASQTADPC_DIRAC || dirac == QUDA_ASQTADKD_DIRAC) && longGauge.Reconstruct() != QUDA_RECONSTRUCT_NO) {
         // create a copy of the gauge field with no reconstruction
@@ -427,7 +427,7 @@ namespace quda {
         lgf_param.reconstruct = QUDA_RECONSTRUCT_NO;
         lgf_param.order = QUDA_FLOAT2_GAUGE_ORDER;
         lgf_param.setPrecision(lgf_param.Precision());
-        tmp_L = std::make_unique<cudaGaugeField>(lgf_param);
+        tmp_L = std::make_unique<GaugeField>(lgf_param);
 
         tmp_L->copy(longGauge);
         tmp_L->exchangeGhost();
@@ -443,7 +443,7 @@ namespace quda {
         xgf_param.order = QUDA_FLOAT2_GAUGE_ORDER;
         xgf_param.setPrecision(xgf_param.Precision());
         xgf_param.create = QUDA_NULL_FIELD_CREATE;
-        tmp_Xinv = std::make_unique<cudaGaugeField>(xgf_param);
+        tmp_Xinv = std::make_unique<GaugeField>(xgf_param);
         need_tmp_Xinv = true;
       }
       // no need to worry about XinvKD's reconstruct
@@ -454,7 +454,7 @@ namespace quda {
         gf_param.reconstruct = QUDA_RECONSTRUCT_NO;
         gf_param.order = QUDA_FLOAT2_GAUGE_ORDER;
         gf_param.setPrecision(gf_param.Precision());
-        tmp_U = std::make_unique<cudaGaugeField>(gf_param);
+        tmp_U = std::make_unique<GaugeField>(gf_param);
         need_tmp_U = true;
 
         tmp_U->copy(gauge);
diff --git a/lib/staggered_kd_build_xinv.cu b/lib/staggered_kd_build_xinv.cu
index 2ed47976f4..b1195d9f4e 100644
--- a/lib/staggered_kd_build_xinv.cu
+++ b/lib/staggered_kd_build_xinv.cu
@@ -113,7 +113,7 @@ namespace quda {
      @param mass[in] Mass of staggered fermion
      @param dagger_approximation[in] Whether or not to use the dagger approximation, using the dagger of X instead of Xinv
    */
-  void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const cudaGaugeField &gauge, const double mass, const bool dagger_approximation)
+  void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const GaugeField &gauge, const double mass, const bool dagger_approximation)
   {
     using namespace blas_lapack;
     auto invert = use_native() ? native::BatchInvertMatrix : generic::BatchInvertMatrix;
@@ -154,13 +154,7 @@ namespace quda {
       gParam.geometry = QUDA_SCALAR_GEOMETRY;
       gParam.pad = 0;
 
-      if (location == QUDA_CUDA_FIELD_LOCATION)
-        xInvMilcOrder = std::make_unique<cudaGaugeField>(gParam);
-      else if (location == QUDA_CPU_FIELD_LOCATION)
-        xInvMilcOrder = std::make_unique<cpuGaugeField>(gParam);
-      else
-        errorQuda("Invalid field location %d", location);
-
+      xInvMilcOrder = std::make_unique<GaugeField>(gParam);
     }
 
     // Step 2: build a host or device gauge field as appropriate, but
@@ -190,7 +184,7 @@ namespace quda {
         gf_param.nFace = 1;
         gf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
 
-        tmp_U = std::make_unique<cpuGaugeField>(gf_param);
+        tmp_U = std::make_unique<GaugeField>(gf_param);
 
         //Copy the cuda gauge field to the cpu
         tmp_U.get()->copy(gauge);
@@ -202,7 +196,7 @@ namespace quda {
         gf_param.reconstruct = QUDA_RECONSTRUCT_NO;
         gf_param.order = QUDA_FLOAT2_GAUGE_ORDER; // guaranteed for no recon
         gf_param.setPrecision( QUDA_SINGLE_PRECISION );
-        tmp_U = std::make_unique<cudaGaugeField>(gf_param);
+        tmp_U = std::make_unique<GaugeField>(gf_param);
 
         tmp_U->copy(gauge);
       }
@@ -216,10 +210,8 @@ namespace quda {
     if (location == QUDA_CUDA_FIELD_LOCATION) {
       x_param.order = QUDA_FLOAT2_GAUGE_ORDER;
       x_param.setPrecision(x_param.Precision());
-      tmp_X = std::make_unique<cudaGaugeField>(x_param);
-    } else {
-      tmp_X = std::make_unique<cpuGaugeField>(x_param);
     }
+    tmp_X = std::make_unique<GaugeField>(x_param);
     GaugeField& X = *tmp_X;
 
     // Step 4: Calculate X from U
@@ -241,7 +233,7 @@ namespace quda {
         GaugeFieldParam param(*xInvMilcOrder);
         param.order = QUDA_MILC_GAUGE_ORDER; // MILC order == QDP order for Xinv
         param.setPrecision(QUDA_SINGLE_PRECISION);
-        cudaGaugeField X_(param);
+        GaugeField X_(param);
         
         X_.copy(X);
 
@@ -268,7 +260,7 @@ namespace quda {
 
 
   // Allocates and calculates the inverse KD block, returning Xinv
-  std::shared_ptr<GaugeField> AllocateAndBuildStaggeredKahlerDiracInverse(const cudaGaugeField &gauge, const double mass, const bool dagger_approximation)
+  std::shared_ptr<GaugeField> AllocateAndBuildStaggeredKahlerDiracInverse(const GaugeField &gauge, const double mass, const bool dagger_approximation)
   {
     GaugeFieldParam gParam(gauge);
     gParam.reconstruct = QUDA_RECONSTRUCT_NO;
@@ -282,7 +274,7 @@ namespace quda {
     // latter true is to force FLOAT2
     gParam.setPrecision(gauge.Precision(), true);
 
-    std::shared_ptr<GaugeField> Xinv(reinterpret_cast<GaugeField*>(new cudaGaugeField(gParam)));
+    std::shared_ptr<GaugeField> Xinv(reinterpret_cast<GaugeField*>(new GaugeField(gParam)));
 
     BuildStaggeredKahlerDiracInverse(*Xinv, gauge, mass, dagger_approximation);
 
diff --git a/tests/covdev_test.cpp b/tests/covdev_test.cpp
index ebe5e784b8..0a5d5d38c7 100644
--- a/tests/covdev_test.cpp
+++ b/tests/covdev_test.cpp
@@ -25,7 +25,7 @@ using namespace quda;
 QudaGaugeParam gauge_param;
 QudaInvertParam inv_param;
 
-cpuGaugeField *cpuLink = nullptr;
+GaugeField *cpuLink = nullptr;
 
 std::unique_ptr<ColorSpinorField> spinor, spinorOut, spinorRef;
 std::unique_ptr<ColorSpinorField> cudaSpinor, cudaSpinorOut;
@@ -94,7 +94,7 @@ void init(int argc, char **argv)
   // cpuLink is only used for ghost allocation
   GaugeFieldParam cpuParam(gauge_param, links);
   cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-  cpuLink = new cpuGaugeField(cpuParam);
+  cpuLink = new GaugeField(cpuParam);
 
   printfQuda("Links sending...");
   loadGaugeQuda(links, &gauge_param);
diff --git a/tests/gauge_alg_test.cpp b/tests/gauge_alg_test.cpp
index adf22a0f30..00ba1f3689 100644
--- a/tests/gauge_alg_test.cpp
+++ b/tests/gauge_alg_test.cpp
@@ -108,7 +108,7 @@ class GaugeAlgTest : public ::testing::Test
           gParam.x[d] += 2 * gParam.r[d];
         }
 
-        U = new cudaGaugeField(gParam);
+        U = new GaugeField(gParam);
 
         RNG randstates(*U, 1234);
 
@@ -160,12 +160,12 @@ class GaugeAlgTest : public ::testing::Test
           for (int d = 0; d < 4; d++)
             if (comm_dim_partitioned(d)) R[d] = 2;
           static TimeProfile GaugeFix("GaugeFix");
-          cudaGaugeField *tmp = new cudaGaugeField(gauge_field_param);
+          GaugeField *tmp = new GaugeField(gauge_field_param);
           tmp->copy(*host);
           U = createExtendedGauge(*tmp, R, GaugeFix);
           delete tmp;
         } else {
-          U = new cudaGaugeField(gauge_field_param);
+          U = new GaugeField(gauge_field_param);
           U->copy(*host);
         }
 
@@ -266,7 +266,7 @@ class GaugeAlgTest : public ::testing::Test
     gParam.reconstruct = param.reconstruct;
     gParam.setPrecision(gParam.Precision(), true);
 
-    cudaGaugeField *gauge = new cudaGaugeField(gParam);
+    GaugeField *gauge = new GaugeField(gParam);
 
     // copy into regular field
     copyExtendedGauge(*gauge, *U, QUDA_CUDA_FIELD_LOCATION);
diff --git a/tests/gauge_path_test.cpp b/tests/gauge_path_test.cpp
index 56c2146b4b..7d37c9faad 100644
--- a/tests/gauge_path_test.cpp
+++ b/tests/gauge_path_test.cpp
@@ -128,13 +128,13 @@ void gauge_force_test(bool compute_force = true)
   param.create = QUDA_NULL_FIELD_CREATE;
   param.order = QUDA_QDP_GAUGE_ORDER;
   param.location = QUDA_CPU_FIELD_LOCATION;
-  quda::cpuGaugeField U_qdp(param);
+  quda::GaugeField U_qdp(param);
 
   // fills the gauge field with random numbers
   createSiteLinkCPU(U_qdp, gauge_param.cpu_prec, 0);
 
   param.order = QUDA_MILC_GAUGE_ORDER;
-  quda::cpuGaugeField U_milc(param);
+  quda::GaugeField U_milc(param);
   if (gauge_order == QUDA_MILC_GAUGE_ORDER) U_milc.copy(U_qdp);
   if (compute_force) {
     param.reconstruct = QUDA_RECONSTRUCT_10;
@@ -143,11 +143,11 @@ void gauge_force_test(bool compute_force = true)
     param.reconstruct = QUDA_RECONSTRUCT_NO;
   }
   param.create = QUDA_ZERO_FIELD_CREATE;
-  quda::cpuGaugeField Mom_milc(param);
-  quda::cpuGaugeField Mom_ref_milc(param);
+  quda::GaugeField Mom_milc(param);
+  quda::GaugeField Mom_ref_milc(param);
 
   param.order = QUDA_QDP_GAUGE_ORDER;
-  quda::cpuGaugeField Mom_qdp(param);
+  quda::GaugeField Mom_qdp(param);
 
   // initialize some data in cpuMom
   if (compute_force) {
@@ -260,13 +260,13 @@ void gauge_loop_test()
   param.create = QUDA_NULL_FIELD_CREATE;
   param.order = QUDA_QDP_GAUGE_ORDER;
   param.location = QUDA_CPU_FIELD_LOCATION;
-  quda::cpuGaugeField U_qdp(param);
+  quda::GaugeField U_qdp(param);
 
   // fills the gauge field with random numbers
   createSiteLinkCPU(U_qdp, gauge_param.cpu_prec, 0);
 
   param.order = QUDA_MILC_GAUGE_ORDER;
-  quda::cpuGaugeField U_milc(param);
+  quda::GaugeField U_milc(param);
   if (gauge_order == QUDA_MILC_GAUGE_ORDER) U_milc.copy(U_qdp);
 
   void *sitelink = nullptr;
diff --git a/tests/heatbath_test.cpp b/tests/heatbath_test.cpp
index d840fda4c7..4ad648958b 100644
--- a/tests/heatbath_test.cpp
+++ b/tests/heatbath_test.cpp
@@ -110,7 +110,7 @@ int main(int argc, char **argv)
     gParam.link_type = gauge_param.type;
     gParam.reconstruct = gauge_param.reconstruct;
     gParam.setPrecision(gParam.Precision(), true);
-    cudaGaugeField *gauge = new cudaGaugeField(gParam);
+    GaugeField *gauge = new GaugeField(gParam);
 
     int pad = 0;
     lat_dim_t y;
@@ -126,7 +126,7 @@ int main(int argc, char **argv)
     gParamEx.t_boundary = gParam.t_boundary;
     gParamEx.nFace = 1;
     for (int dir = 0; dir < 4; ++dir) gParamEx.r[dir] = R[dir];
-    cudaGaugeField *gaugeEx = new cudaGaugeField(gParamEx);
+    GaugeField *gaugeEx = new GaugeField(gParamEx);
     // CURAND random generator initialization
     RNG *randstates = new RNG(*gauge, 1234);
 
diff --git a/tests/hisq_paths_force_test.cpp b/tests/hisq_paths_force_test.cpp
index 58b8299223..07f7b4e17b 100644
--- a/tests/hisq_paths_force_test.cpp
+++ b/tests/hisq_paths_force_test.cpp
@@ -15,34 +15,34 @@
 
 using namespace quda;
 
-cpuGaugeField *cpuGauge = NULL;
-cudaGaugeField *cudaForce = NULL;
-cpuGaugeField *cpuForce = NULL;
-cpuGaugeField *hostVerifyForce = NULL;
+GaugeField *cpuGauge = NULL;
+GaugeField *cudaForce = NULL;
+GaugeField *cpuForce = NULL;
+GaugeField *hostVerifyForce = NULL;
 
-cudaGaugeField *cudaMom = NULL;
-cpuGaugeField *cpuMom = NULL;
-cpuGaugeField *refMom = NULL;
+GaugeField *cudaMom = NULL;
+GaugeField *cpuMom = NULL;
+GaugeField *refMom = NULL;
 
 QudaGaugeFieldOrder gauge_order = QUDA_QDP_GAUGE_ORDER;
 
-cpuGaugeField *cpuOprod = NULL;
-cudaGaugeField *cudaOprod = NULL;
-cpuGaugeField *cpuLongLinkOprod = NULL;
-cudaGaugeField *cudaLongLinkOprod = NULL;
+GaugeField *cpuOprod = NULL;
+GaugeField *cudaOprod = NULL;
+GaugeField *cpuLongLinkOprod = NULL;
+GaugeField *cudaLongLinkOprod = NULL;
 
 int ODD_BIT = 1;
 
 QudaPrecision force_prec = QUDA_DOUBLE_PRECISION;
 
-cudaGaugeField *cudaGauge_ex = NULL;
-cpuGaugeField *cpuGauge_ex = NULL;
-cudaGaugeField *cudaForce_ex = NULL;
-cpuGaugeField *cpuForce_ex = NULL;
-cpuGaugeField *cpuOprod_ex = NULL;
-cudaGaugeField *cudaOprod_ex = NULL;
-cpuGaugeField *cpuLongLinkOprod_ex = NULL;
-cudaGaugeField *cudaLongLinkOprod_ex = NULL;
+GaugeField *cudaGauge_ex = NULL;
+GaugeField *cpuGauge_ex = NULL;
+GaugeField *cudaForce_ex = NULL;
+GaugeField *cpuForce_ex = NULL;
+GaugeField *cpuOprod_ex = NULL;
+GaugeField *cudaOprod_ex = NULL;
+GaugeField *cpuLongLinkOprod_ex = NULL;
+GaugeField *cudaLongLinkOprod_ex = NULL;
 
 static void setPrecision(QudaPrecision precision)
 {
@@ -227,7 +227,7 @@ static void hisq_force_startup()
     gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0;
     gParam_ex.x[d] = X[d] + 2 * gParam_ex.r[d];
   } // set halo region for GPU
-  cudaGauge_ex = new cudaGaugeField(gParam_ex);
+  cudaGauge_ex = new GaugeField(gParam_ex);
 
   // Create the host gauge field
   memcpy(&qudaGaugeParam_ex, &qudaGaugeParam, sizeof(QudaGaugeParam));
@@ -238,7 +238,7 @@ static void hisq_force_startup()
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.order = gauge_order;
-  cpuGauge = new cpuGaugeField(gParam);
+  cpuGauge = new GaugeField(gParam);
 
   gParam_ex = GaugeFieldParam(qudaGaugeParam_ex);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
@@ -250,7 +250,7 @@ static void hisq_force_startup()
     gParam_ex.r[d] = R[d];
     gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d];
   } // set halo region for CPU
-  cpuGauge_ex = new cpuGaugeField(gParam_ex);
+  cpuGauge_ex = new GaugeField(gParam_ex);
 
   auto generated_link_type = (link_recon == QUDA_RECONSTRUCT_NO ?
                                 SITELINK_PHASE_NO :
@@ -279,8 +279,8 @@ static void hisq_force_startup()
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.order = gauge_order;
-  cpuForce = new cpuGaugeField(gParam);
-  hostVerifyForce = new cpuGaugeField(gParam);
+  cpuForce = new GaugeField(gParam);
+  hostVerifyForce = new GaugeField(gParam);
 
   gParam_ex.location = QUDA_CPU_FIELD_LOCATION;
   gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
@@ -292,7 +292,7 @@ static void hisq_force_startup()
     gParam_ex.r[d] = R[d];
     gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d];
   }
-  cpuForce_ex = new cpuGaugeField(gParam_ex);
+  cpuForce_ex = new GaugeField(gParam_ex);
 
   // create the momentum matrix
   gParam.location = QUDA_CPU_FIELD_LOCATION;
@@ -302,8 +302,8 @@ static void hisq_force_startup()
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.order = QUDA_MILC_GAUGE_ORDER;
   gParam.create = QUDA_NULL_FIELD_CREATE;
-  cpuMom = new cpuGaugeField(gParam);
-  refMom = new cpuGaugeField(gParam);
+  cpuMom = new GaugeField(gParam);
+  refMom = new GaugeField(gParam);
 
   /**********************************
    * Create the outer product fields *
@@ -316,8 +316,8 @@ static void hisq_force_startup()
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.order = gauge_order;
-  cpuOprod = new cpuGaugeField(gParam);
-  cpuLongLinkOprod = new cpuGaugeField(gParam);
+  cpuOprod = new GaugeField(gParam);
+  cpuLongLinkOprod = new GaugeField(gParam);
 
   // Create extended outer product fields
   gParam_ex.location = QUDA_CPU_FIELD_LOCATION;
@@ -328,8 +328,8 @@ static void hisq_force_startup()
     gParam_ex.r[d] = R[d];
     gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d];
   } // set halo region for CPU
-  cpuOprod_ex = new cpuGaugeField(gParam_ex);
-  cpuLongLinkOprod_ex = new cpuGaugeField(gParam_ex);
+  cpuOprod_ex = new GaugeField(gParam_ex);
+  cpuLongLinkOprod_ex = new GaugeField(gParam_ex);
 
   // initialize the CPU outer product fields and exchange once
   createStagForOprodCPU(stag_for_oprod, force_prec, qudaGaugeParam.X, *rng);
@@ -352,9 +352,9 @@ static void hisq_force_startup()
     gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0;
     gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d];
   } // set halo region
-  cudaForce_ex = new cudaGaugeField(gParam_ex);
-  cudaOprod_ex = new cudaGaugeField(gParam_ex);
-  cudaLongLinkOprod_ex = new cudaGaugeField(gParam_ex);
+  cudaForce_ex = new GaugeField(gParam_ex);
+  cudaOprod_ex = new GaugeField(gParam_ex);
+  cudaLongLinkOprod_ex = new GaugeField(gParam_ex);
 
   // create a device force for verify
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
@@ -362,7 +362,7 @@ static void hisq_force_startup()
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.setPrecision(prec, true);
-  cudaForce = new cudaGaugeField(gParam);
+  cudaForce = new GaugeField(gParam);
 
   // create the device momentum field
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
@@ -370,7 +370,7 @@ static void hisq_force_startup()
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
   gParam.setPrecision(prec, true);
-  cudaMom = new cudaGaugeField(gParam);
+  cudaMom = new GaugeField(gParam);
 
   /********************************************************************
    * Copy to and exchange gauge and outer product fields on the device *
diff --git a/tests/hisq_unitarize_force_test.cpp b/tests/hisq_unitarize_force_test.cpp
index d27b09bfd8..41b977b8e1 100644
--- a/tests/hisq_unitarize_force_test.cpp
+++ b/tests/hisq_unitarize_force_test.cpp
@@ -12,16 +12,16 @@
 #include <sys/time.h>
 #include <gtest/gtest.h>
 
-quda::cudaGaugeField *cudaFatLink = NULL;
-quda::cpuGaugeField *cpuFatLink = NULL;
+quda::GaugeField *cudaFatLink = NULL;
+quda::GaugeField *cpuFatLink = NULL;
 
-quda::cudaGaugeField *cudaOprod = NULL;
-quda::cpuGaugeField *cpuOprod = NULL;
+quda::GaugeField *cudaOprod = NULL;
+quda::GaugeField *cpuOprod = NULL;
 
-quda::cudaGaugeField *cudaResult = NULL;
-quda::cpuGaugeField *cpuResult = NULL;
+quda::GaugeField *cudaResult = NULL;
+quda::GaugeField *cpuResult = NULL;
 
-quda::cpuGaugeField *cpuReference = NULL;
+quda::GaugeField *cpuReference = NULL;
 
 static QudaGaugeParam gaugeParam;
 
@@ -66,10 +66,10 @@ static void hisq_force_init()
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.anisotropy = 1;
 
-  cpuFatLink = new quda::cpuGaugeField(gParam);
-  cpuOprod = new quda::cpuGaugeField(gParam);
-  cpuResult = new quda::cpuGaugeField(gParam);
-  cpuReference = new quda::cpuGaugeField(gParam);
+  cpuFatLink = new quda::GaugeField(gParam);
+  cpuOprod = new quda::GaugeField(gParam);
+  cpuResult = new quda::GaugeField(gParam);
+  cpuReference = new quda::GaugeField(gParam);
 
   // create "gauge fields"
   int seed = 0;
@@ -83,9 +83,9 @@ static void hisq_force_init()
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.setPrecision(gaugeParam.cuda_prec, true);
 
-  cudaFatLink = new quda::cudaGaugeField(gParam);
-  cudaOprod = new quda::cudaGaugeField(gParam);
-  cudaResult = new quda::cudaGaugeField(gParam);
+  cudaFatLink = new quda::GaugeField(gParam);
+  cudaOprod = new quda::GaugeField(gParam);
+  cudaResult = new quda::GaugeField(gParam);
 
   gParam.order = QUDA_QDP_GAUGE_ORDER;
 
diff --git a/tests/host_reference/domain_wall_dslash_reference.cpp b/tests/host_reference/domain_wall_dslash_reference.cpp
index 5fba06fe30..29edd18a44 100644
--- a/tests/host_reference/domain_wall_dslash_reference.cpp
+++ b/tests/host_reference/domain_wall_dslash_reference.cpp
@@ -763,7 +763,7 @@ void dw_dslash(void *out, void *const *gauge, void *in, int oddBit, int daggerBi
 {
   GaugeFieldParam gauge_field_param(gauge_param, (void **)gauge);
   gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-  cpuGaugeField cpu(gauge_field_param);
+  GaugeField cpu(gauge_field_param);
   void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()};
 
   // Get spinor ghost fields
@@ -830,7 +830,7 @@ void dslash_4_4d(void *out, void *const *gauge, void *in, int oddBit, int dagger
 {
   GaugeFieldParam gauge_field_param(gauge_param, (void **)gauge);
   gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-  cpuGaugeField cpu(gauge_field_param);
+  GaugeField cpu(gauge_field_param);
   void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()};
 
   // Get spinor ghost fields
@@ -1318,7 +1318,7 @@ void mdw_mdagm_local(void *out, void *const *gauge, void *in, double _Complex *k
   lat_dim_t R;
   for (int d = 0; d < 4; d++) { R[d] = comm_dim_partitioned(d) ? 2 : 0; }
 
-  cpuGaugeField *padded_gauge = createExtendedGauge((void **)gauge, gauge_param, R);
+  GaugeField *padded_gauge = createExtendedGauge((void **)gauge, gauge_param, R);
 
   int padded_V = 1;
   int W[4];
diff --git a/tests/host_reference/dslash_test_helpers.cpp b/tests/host_reference/dslash_test_helpers.cpp
index b46b69ff75..be2a7cac18 100644
--- a/tests/host_reference/dslash_test_helpers.cpp
+++ b/tests/host_reference/dslash_test_helpers.cpp
@@ -7,9 +7,9 @@
 using namespace quda;
 
 // need a better solution here but as long as they gauge field live in interface probably ok
-extern cudaGaugeField *gaugePrecise;
-extern cudaGaugeField *gaugeFatPrecise;
-extern cudaGaugeField *gaugeLongPrecise;
+extern GaugeField *gaugePrecise;
+extern GaugeField *gaugeFatPrecise;
+extern GaugeField *gaugeLongPrecise;
 
 void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, dslash_test_type test_type)
 {
diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp
index a895730b33..83c5251e27 100644
--- a/tests/host_reference/gauge_force_reference.cpp
+++ b/tests/host_reference/gauge_force_reference.cpp
@@ -9,6 +9,7 @@
 #include "host_utils.h"
 #include "misc.h"
 #include "gauge_force_reference.h"
+#include "timer.h"
 
 extern int Z[4];
 extern int V;
@@ -491,6 +492,8 @@ void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int **
   param.t_boundary = QUDA_PERIODIC_T;
 
   auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R);
+  //quda::TimeProfile dummy("blah");
+  //auto qdp_ex = quda::createExtendedGauge(u, R, dummy);
   lattice_t lat(*qdp_ex);
 
   void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)};
diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp
index 9ed6c0913d..58c6762e70 100644
--- a/tests/host_reference/hisq_force_reference.cpp
+++ b/tests/host_reference/hisq_force_reference.cpp
@@ -1197,8 +1197,8 @@ void doHisqStaplesForceCPU(const int dim[4], PathCoefficients<double> staple_coe
 #undef Qmu
 #undef Qnumu
 
-void hisqStaplesForceCPU(const double *path_coeff, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link,
-                         quda::cpuGaugeField *newOprod)
+void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda::GaugeField &link,
+                         quda::GaugeField *newOprod)
 {
   int X_[4];
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
@@ -1301,8 +1301,8 @@ void computeLongLinkField(const int dim[4], const Real *const oprod, const Real
   }
 }
 
-void hisqLongLinkForceCPU(double coeff, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link,
-                          quda::cpuGaugeField *newOprod)
+void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeField &link,
+                          quda::GaugeField *newOprod)
 {
   int X_[4];
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
@@ -1360,7 +1360,7 @@ void completeForceField(const int dim[4], const Real *const oprod, const Real *c
   for (int site = 0; site < half_volume; ++site) { completeForceSite<Real, 1>(site, dim, oprod, link, sig, ls, mom); }
 }
 
-void hisqCompleteForceCPU(quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *mom)
+void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *mom)
 {
   int X_[4];
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
diff --git a/tests/host_reference/hisq_force_reference.h b/tests/host_reference/hisq_force_reference.h
index 6e5e2923e4..fb8b773f84 100644
--- a/tests/host_reference/hisq_force_reference.h
+++ b/tests/host_reference/hisq_force_reference.h
@@ -21,8 +21,8 @@ void computeLinkOrderedOuterProduct(void *src, void *dest, QudaPrecision precisi
    @param[in] link Gauge field links
    @param[out] newOprod Force accumulated with fat link contributions
 */
-void hisqStaplesForceCPU(const double *path_coeff, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link,
-                         quda::cpuGaugeField *newOprod);
+void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda::GaugeField &link,
+                         quda::GaugeField *newOprod);
 
 /**
    @brief Compute the force contribution from the long link, CPU version
@@ -31,8 +31,8 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::cpuGaugeField &oprod, q
    @param[in] link Gauge field links
    @param[out] newOprod Force accumulated with fat link contributions
 */
-void hisqLongLinkForceCPU(double coeff, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link,
-                          quda::cpuGaugeField *newOprod);
+void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeField &link,
+                          quda::GaugeField *newOprod);
 
 /**
    @brief Accumulate the force contributions into the momentum field, CPU version
@@ -40,6 +40,6 @@ void hisqLongLinkForceCPU(double coeff, quda::cpuGaugeField &oprod, quda::cpuGau
    @param[in] link Gauge field links
    @param[out] mom Accumulated momentum
 */
-void hisqCompleteForceCPU(quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *mom);
+void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *mom);
 
 #endif
diff --git a/tests/host_reference/wilson_dslash_reference.cpp b/tests/host_reference/wilson_dslash_reference.cpp
index fbe5aa241d..471f79c38d 100644
--- a/tests/host_reference/wilson_dslash_reference.cpp
+++ b/tests/host_reference/wilson_dslash_reference.cpp
@@ -192,7 +192,7 @@ void wil_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, Qu
   GaugeFieldParam gauge_field_param(gauge_param, gauge);
   gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
   gauge_field_param.location = QUDA_CPU_FIELD_LOCATION;
-  cpuGaugeField cpu(gauge_field_param);
+  GaugeField cpu(gauge_field_param);
   void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()};
 
   // Get spinor ghost fields
diff --git a/tests/multigrid_benchmark_test.cpp b/tests/multigrid_benchmark_test.cpp
index f954abe366..517d48de51 100644
--- a/tests/multigrid_benchmark_test.cpp
+++ b/tests/multigrid_benchmark_test.cpp
@@ -23,7 +23,7 @@ using namespace quda;
 
 std::vector<ColorSpinorField> xD, yD;
 
-cudaGaugeField *Y_d, *X_d, *Xinv_d, *Yhat_d;
+GaugeField *Y_d, *X_d, *Xinv_d, *Yhat_d;
 
 int Ncolor;
 
@@ -97,14 +97,14 @@ void initFields(QudaPrecision prec)
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
 
-  Y_d = new cudaGaugeField(gParam);
-  Yhat_d = new cudaGaugeField(gParam);
+  Y_d = new GaugeField(gParam);
+  Yhat_d = new GaugeField(gParam);
 
   gParam.geometry = QUDA_SCALAR_GEOMETRY;
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.nFace = 0;
-  X_d = new cudaGaugeField(gParam);
-  Xinv_d = new cudaGaugeField(gParam);
+  X_d = new GaugeField(gParam);
+  Xinv_d = new GaugeField(gParam);
 
   // insert random noise into the gauge fields
   {
diff --git a/tests/multigrid_evolve_test.cpp b/tests/multigrid_evolve_test.cpp
index 2436ddabf7..9545942a51 100644
--- a/tests/multigrid_evolve_test.cpp
+++ b/tests/multigrid_evolve_test.cpp
@@ -225,11 +225,11 @@ int main(int argc, char **argv)
     gParam.link_type = gauge_param.type;
     gParam.reconstruct = gauge_param.reconstruct;
     gParam.setPrecision(gParam.Precision(), true);
-    cudaGaugeField gauge(gParam);
+    GaugeField gauge(gParam);
 
     int pad = 0;
-    lat_dim_t y;
-    lat_dim_t R;
+    lat_dim_t y = {};
+    lat_dim_t R = {};
     for (int dir = 0; dir < 4; ++dir)
       if (comm_dim_partitioned(dir)) R[dir] = 2;
     for (int dir = 0; dir < 4; ++dir) y[dir] = gauge_param.X[dir] + 2 * R[dir];
@@ -241,7 +241,8 @@ int main(int argc, char **argv)
     gParamEx.t_boundary = gParam.t_boundary;
     gParamEx.nFace = 1;
     gParamEx.r = R;
-    cudaGaugeField gaugeEx(gParamEx);
+
+    GaugeField gaugeEx(gParamEx);
 
     QudaGaugeObservableParam obs_param = newQudaGaugeObservableParam();
     obs_param.compute_plaquette = QUDA_BOOLEAN_TRUE;
diff --git a/tests/pack_test.cpp b/tests/pack_test.cpp
index 694c993895..fe68c2645c 100644
--- a/tests/pack_test.cpp
+++ b/tests/pack_test.cpp
@@ -108,12 +108,12 @@ void packTest()
     param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER;
 
     GaugeFieldParam cpsParam(param, cpsCpuGauge_p);
-    cpuGaugeField cpsCpuGauge(cpsParam);
+    GaugeField cpsCpuGauge(cpsParam);
     cpsParam.create = QUDA_NULL_FIELD_CREATE;
     cpsParam.reconstruct = param.reconstruct;
     cpsParam.setPrecision(param.cuda_prec, true);
     cpsParam.pad = param.ga_pad;
-    cudaGaugeField cudaCpsGauge(cpsParam);
+    GaugeField cudaCpsGauge(cpsParam);
 
     host_timer.start();
     cudaCpsGauge.copy(cpsCpuGauge);
@@ -121,7 +121,7 @@ void packTest()
     printfQuda("CPS Gauge send time = %e seconds\n", host_timer.last());
 
     host_timer.start();
-    cpuCpuGauge.copy(cudaCpsGauge);
+    cpsCpuGauge.copy(cudaCpsGauge);
     host_timer.stop();
     printfQuda("CPS Gauge restore time = %e seconds\n", host_timer.last());
   }
@@ -132,12 +132,12 @@ void packTest()
     param.gauge_order = QUDA_QDP_GAUGE_ORDER;
 
     GaugeFieldParam qdpParam(param, qdpCpuGauge_p);
-    cpuGaugeField qdpCpuGauge(qdpParam);
+    GaugeField qdpCpuGauge(qdpParam);
     qdpParam.create = QUDA_NULL_FIELD_CREATE;
     qdpParam.reconstruct = param.reconstruct;
     qdpParam.setPrecision(param.cuda_prec, true);
     qdpParam.pad = param.ga_pad;
-    cudaGaugeField cudaQdpGauge(qdpParam);
+    GaugeField cudaQdpGauge(qdpParam);
 
     host_timer.start();
     cudaQdpGauge.copy(qdpCpuGauge);
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 3ff8fecb5a..5ee2616ad8 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -52,8 +52,8 @@ struct StaggeredDslashTestWrapper {
   void *milc_fatlink_gpu;
   void *milc_longlink_gpu;
 
-  cpuGaugeField *cpuFat = nullptr;
-  cpuGaugeField *cpuLong = nullptr;
+  GaugeField *cpuFat = nullptr;
+  GaugeField *cpuLong = nullptr;
 
   ColorSpinorField spinor;
   ColorSpinorField spinorOut;
@@ -204,14 +204,14 @@ struct StaggeredDslashTestWrapper {
     gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
     GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink_cpu);
     cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-    cpuFat = new cpuGaugeField(cpuFatParam);
+    cpuFat = new GaugeField(cpuFatParam);
     for (int i = 0; i < 4; i++) ghost_fatlink_cpu[i] = cpuFat->Ghost()[i].data();
 
     if (dslash_type == QUDA_ASQTAD_DSLASH) {
       gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
       GaugeFieldParam cpuLongParam(gauge_param, milc_longlink_cpu);
       cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-      cpuLong = new cpuGaugeField(cpuLongParam);
+      cpuLong = new GaugeField(cpuLongParam);
       for (int i = 0; i < 4; i++) ghost_longlink_cpu[i] = cpuLong ? cpuLong->Ghost()[i].data() : nullptr;
     }
 #endif
diff --git a/tests/unitarize_link_test.cpp b/tests/unitarize_link_test.cpp
index 2d9dc14210..4cd8553fdd 100644
--- a/tests/unitarize_link_test.cpp
+++ b/tests/unitarize_link_test.cpp
@@ -32,8 +32,8 @@ static double max_allowed_error = 1e-11;
 
 static QudaGaugeFieldOrder gauge_order = QUDA_MILC_GAUGE_ORDER;
 
-quda::cpuGaugeField *cpuFatLink, *cpuULink, *cudaResult;
-quda::cudaGaugeField *cudaFatLink, *cudaULink;
+quda::GaugeField *cpuFatLink, *cpuULink, *cudaResult;
+quda::GaugeField *cudaFatLink, *cudaULink;
 
 const double unittol = (prec == QUDA_DOUBLE_PRECISION) ? 1e-10 : 1e-6;
 
@@ -124,21 +124,21 @@ static int unitarize_link_test(int &test_rc)
   gParam.create = QUDA_REFERENCE_FIELD_CREATE;
   gParam.gauge = fatlink;
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  cpuFatLink = new quda::cpuGaugeField(gParam);
+  cpuFatLink = new quda::GaugeField(gParam);
 
   gParam.create = QUDA_ZERO_FIELD_CREATE;
-  cpuULink = new quda::cpuGaugeField(gParam);
+  cpuULink = new quda::GaugeField(gParam);
 
   gParam.create = QUDA_ZERO_FIELD_CREATE;
-  cudaResult = new quda::cpuGaugeField(gParam);
+  cudaResult = new quda::GaugeField(gParam);
 
   gParam.pad = 0;
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.setPrecision(prec, true);
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-  cudaFatLink = new quda::cudaGaugeField(gParam);
-  cudaULink = new quda::cudaGaugeField(gParam);
+  cudaFatLink = new quda::GaugeField(gParam);
+  cudaULink = new quda::GaugeField(gParam);
 
   { // create fat links
     double act_path_coeff[6];

From 19aa064690b96a339959d3c6e525dc1f25022957 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 16 May 2023 15:50:54 -0700
Subject: [PATCH 08/60] Add null, move and copy constructors, as well as copy
 and move assignment operators for GaugeField

---
 include/color_spinor_field.h |  18 +--
 include/gauge_field.h        | 294 ++++++++++++++++++-----------------
 include/malloc_quda.h        |   2 +-
 lib/gauge_field.cpp          | 275 +++++++++++++++++++-------------
 4 files changed, 325 insertions(+), 264 deletions(-)

diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index 76fa31b943..1bfd1be413 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -121,18 +121,13 @@ namespace quda
     }
   };
 
-  class ColorSpinorParam : public LatticeFieldParam
-  {
-
-  public:
+  struct ColorSpinorParam : public LatticeFieldParam {
     int nColor = 0; // Number of colors of the field
     int nSpin = 0;  // =1 for staggered, =2 for coarse Dslash, =4 for 4d spinor
     int nVec = 1;   // number of packed vectors (for multigrid transfer operator)
 
     QudaTwistFlavorType twistFlavor = QUDA_TWIST_INVALID; // used by twisted mass
-
     QudaSiteOrder siteOrder = QUDA_INVALID_SITE_ORDER; // defined for full fields
-
     QudaFieldOrder fieldOrder = QUDA_INVALID_FIELD_ORDER; // Float, Float2, Float4 etc.
     QudaGammaBasis gammaBasis = QUDA_INVALID_GAMMA_BASIS;
     QudaFieldCreate create = QUDA_INVALID_FIELD_CREATE;
@@ -179,7 +174,6 @@ namespace quda
     ColorSpinorParam() = default;
 
     // used to create cpu params
-
     ColorSpinorParam(void *V, QudaInvertParam &inv_param, const lat_dim_t &X, const bool pc_solution,
                      QudaFieldLocation location = QUDA_CPU_FIELD_LOCATION) :
       LatticeFieldParam(4, X, 0, location, inv_param.cpu_prec),
@@ -188,20 +182,12 @@ namespace quda
              || inv_param.dslash_type == QUDA_LAPLACE_DSLASH) ?
               1 :
               4),
-      nVec(1),
       twistFlavor(inv_param.twist_flavor),
-      siteOrder(QUDA_INVALID_SITE_ORDER),
-      fieldOrder(QUDA_INVALID_FIELD_ORDER),
       gammaBasis(inv_param.gamma_basis),
       create(QUDA_REFERENCE_FIELD_CREATE),
       pc_type(inv_param.dslash_type == QUDA_DOMAIN_WALL_DSLASH ? QUDA_5D_PC : QUDA_4D_PC),
-      v(V),
-      is_composite(false),
-      composite_dim(0),
-      is_component(false),
-      component_id(0)
+      v(V)
     {
-
       if (nDim > QUDA_MAX_DIM) errorQuda("Number of dimensions too great");
       for (int d = 0; d < nDim; d++) x[d] = X[d];
 
diff --git a/include/gauge_field.h b/include/gauge_field.h
index 23fb8939e3..71e1628370 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -37,99 +37,60 @@ namespace quda {
   } // namespace gauge
 
   struct GaugeFieldParam : public LatticeFieldParam {
+    int nColor = 3;
+    int nFace = 0;
 
-    int nColor;
-    int nFace;
+    QudaReconstructType reconstruct = QUDA_RECONSTRUCT_NO;
+    QudaGaugeFieldOrder order = QUDA_INVALID_GAUGE_ORDER;
+    QudaGaugeFixed fixed = QUDA_GAUGE_FIXED_NO;
+    QudaLinkType link_type = QUDA_WILSON_LINKS;
+    QudaTboundary t_boundary = QUDA_INVALID_T_BOUNDARY;
 
-    QudaReconstructType reconstruct;
-    QudaGaugeFieldOrder order;
-    QudaGaugeFixed fixed;
-    QudaLinkType link_type;
-    QudaTboundary t_boundary;
+    double anisotropy = 1.0;
+    double tadpole = 1.0;
+    GaugeField *field = nullptr; // pointer to a pre-allocated field
+    void *gauge = nullptr;       // used when we use a reference to an external field
 
-    double anisotropy;
-    double tadpole;
-    void *gauge; // used when we use a reference to an external field
+    QudaFieldCreate create = QUDA_REFERENCE_FIELD_CREATE; // used to determine the type of field created
 
-    QudaFieldCreate create; // used to determine the type of field created
-
-    QudaFieldGeometry geometry; // whether the field is a scale, vector or tensor
+    QudaFieldGeometry geometry = QUDA_VECTOR_GEOMETRY; // whether the field is a scalar, vector or tensor
 
     // whether we need to compute the fat link maxima
     // FIXME temporary flag until we have a kernel that can do this, then we just do this in copy()
     // always set to false, requires external override
-    bool compute_fat_link_max;
+    bool compute_fat_link_max = false;
 
     /** The staggered phase convention to use */
-    QudaStaggeredPhase staggeredPhaseType;
+    QudaStaggeredPhase staggeredPhaseType = QUDA_STAGGERED_PHASE_NO;
 
     /** Whether the staggered phase factor has been applied */
-    bool staggeredPhaseApplied;
+    bool staggeredPhaseApplied = false;
 
     /** Imaginary chemical potential */
-    double i_mu;
+    double i_mu = 0.0;
 
     /** Offset into MILC site struct to the desired matrix field (only if gauge_order=MILC_SITE_GAUGE_ORDER) */
-    size_t site_offset;
+    size_t site_offset = 0;
 
     /** Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER) */
-    size_t site_size;
+    size_t site_size = 0;
 
     // Default constructor
-    GaugeFieldParam(void *const h_gauge = NULL) :
-      LatticeFieldParam(),
-      nColor(3),
-      nFace(0),
-      reconstruct(QUDA_RECONSTRUCT_NO),
-      order(QUDA_INVALID_GAUGE_ORDER),
-      fixed(QUDA_GAUGE_FIXED_NO),
-      link_type(QUDA_WILSON_LINKS),
-      t_boundary(QUDA_INVALID_T_BOUNDARY),
-      anisotropy(1.0),
-      tadpole(1.0),
-      gauge(h_gauge),
-      create(QUDA_REFERENCE_FIELD_CREATE),
-      geometry(QUDA_VECTOR_GEOMETRY),
-      compute_fat_link_max(false),
-      staggeredPhaseType(QUDA_STAGGERED_PHASE_NO),
-      staggeredPhaseApplied(false),
-      i_mu(0.0),
-      site_offset(0),
-      site_size(0)
-    {
-    }
+    GaugeFieldParam(void *const h_gauge = nullptr) : gauge(h_gauge) { }
 
     GaugeFieldParam(const GaugeField &u);
 
     GaugeFieldParam(const lat_dim_t &x, QudaPrecision precision, QudaReconstructType reconstruct, int pad,
                     QudaFieldGeometry geometry, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_PAD) :
       LatticeFieldParam(4, x, pad, QUDA_INVALID_FIELD_LOCATION, precision, ghostExchange),
-      nColor(3),
-      nFace(0),
       reconstruct(reconstruct),
-      order(QUDA_INVALID_GAUGE_ORDER),
-      fixed(QUDA_GAUGE_FIXED_NO),
-      link_type(QUDA_WILSON_LINKS),
-      t_boundary(QUDA_INVALID_T_BOUNDARY),
-      anisotropy(1.0),
-      tadpole(1.0),
-      gauge(0),
       create(QUDA_NULL_FIELD_CREATE),
-      geometry(geometry),
-      compute_fat_link_max(false),
-      staggeredPhaseType(QUDA_STAGGERED_PHASE_NO),
-      staggeredPhaseApplied(false),
-      i_mu(0.0),
-      site_offset(0),
-      site_size(0)
+      geometry(geometry)
     {
     }
 
     GaugeFieldParam(const QudaGaugeParam &param, void *h_gauge = nullptr, QudaLinkType link_type_ = QUDA_INVALID_LINKS) :
       LatticeFieldParam(param),
-      nColor(3),
-      nFace(0),
-      reconstruct(QUDA_RECONSTRUCT_NO),
       order(param.gauge_order),
       fixed(param.gauge_fix),
       link_type(link_type_ != QUDA_INVALID_LINKS ? link_type_ : param.type),
@@ -137,9 +98,6 @@ namespace quda {
       anisotropy(param.anisotropy),
       tadpole(param.tadpole_coeff),
       gauge(h_gauge),
-      create(QUDA_REFERENCE_FIELD_CREATE),
-      geometry(QUDA_VECTOR_GEOMETRY),
-      compute_fat_link_max(false),
       staggeredPhaseType(param.staggered_phase_type),
       staggeredPhaseApplied(param.staggered_phase_applied),
       i_mu(param.i_mu),
@@ -186,83 +144,103 @@ namespace quda {
 
   class GaugeField : public LatticeField {
 
+  private:
+    /**
+       @brief Create the field as specified by the param
+       @param[in] param Parameter struct describing the field to create
+    */
+    void create(const GaugeFieldParam &param);
+
+    /**
+       @brief Move the contents of a field to this
+       @param[in,out] other Field we are moving from
+    */
+    void move(GaugeField &&other);
+
+    /**
+       @brief Fills the param with this field's meta data (used for
+       creating a cloned field)
+       @param[out] param The parameter struct we are filling
+    */
+    void fill(GaugeFieldParam &) const;
+
   protected:
-    quda_ptr gauge; /** The gauge field allocation */
-    array<quda_ptr, 8> gauge_array; /** Array of pointers to each subset (e.g., QDP or QDPJITorder) */
-      size_t bytes;        // bytes allocated per full field
-      size_t phase_offset; // offset in bytes to gauge phases - useful to keep track of texture alignment
-      size_t phase_bytes;  // bytes needed to store the phases
-      size_t length;
-      size_t real_length;
-      int nColor;
-      int nFace;
-      QudaFieldGeometry geometry; // whether the field is a scale, vector or tensor
-      int site_dim; // the dimensionality of each site (number of matrices per lattice site)
-
-      QudaReconstructType reconstruct;
-      int nInternal; // number of degrees of freedom per link matrix
-      QudaGaugeFieldOrder order;
-      QudaGaugeFixed fixed;
-      QudaLinkType link_type;
-      QudaTboundary t_boundary;
-
-      double anisotropy;
-      double tadpole;
-      double fat_link_max;
-
-      QudaFieldCreate create; // used to determine the type of field created
-
-      mutable array<quda_ptr, 2 * QUDA_MAX_DIM> ghost; // stores the ghost zone of the gauge field (non-native fields only)
-
-      mutable int ghostFace[QUDA_MAX_DIM]; // the size of each face
-
-      /**
-         The staggered phase convention to use
-      */
-      QudaStaggeredPhase staggeredPhaseType;
-
-      /**
-         Whether the staggered phase factor has been applied
-      */
-      bool staggeredPhaseApplied;
-
-      /**
-         @brief Exchange the buffers across all dimensions in a given direction
-         @param[out] recv Receive buffer
-         @param[in] send Send buffer
-         @param[in] dir Direction in which we are sending (forwards OR backwards only)
-      */
-      void exchange(void **recv, void **send, QudaDirection dir) const;
-
-      /**
-         Imaginary chemical potential
-      */
-      double i_mu;
-
-      /**
-         Offset into MILC site struct to the desired matrix field (only if gauge_order=MILC_SITE_GAUGE_ORDER)
-      */
-      size_t site_offset;
-
-      /**
-         Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER)
-      */
-      size_t site_size;
-
-      /**
-         Compute the required extended ghost zone sizes and offsets
-         @param[in] R Radius of the ghost zone
-         @param[in] no_comms_fill If true we create a full halo
-         regardless of partitioning
-         @param[in] bidir Is this a bi-directional exchange - if not
-         then we alias the fowards and backwards offsetss
-      */
-      void createGhostZone(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const;
-
-      /**
-         @brief Set the vol_string and aux_string for use in tuning
-      */
-      void setTuningString();
+    bool init = false;
+    quda_ptr gauge = {};                 /** The gauge field allocation */
+    array<quda_ptr, 8> gauge_array = {}; /** Array of pointers to each subset (e.g., QDP or QDPJITorder) */
+    size_t bytes = 0;                    // bytes allocated per full field
+    size_t phase_offset = 0;             // offset in bytes to gauge phases - useful to keep track of texture alignment
+    size_t phase_bytes = 0;              // bytes needed to store the phases
+    size_t length = 0;
+    size_t real_length = 0;
+    int nColor = 0;
+    int nFace = 0;
+    QudaFieldGeometry geometry = QUDA_INVALID_GEOMETRY; // whether the field is a scalar, vector or tensor
+    int site_dim = 0; // the dimensionality of each site (number of matrices per lattice site)
+
+    QudaReconstructType reconstruct = QUDA_RECONSTRUCT_INVALID;
+    int nInternal = 0; // number of degrees of freedom per link matrix
+    QudaGaugeFieldOrder order = QUDA_INVALID_GAUGE_ORDER;
+    QudaGaugeFixed fixed = QUDA_GAUGE_FIXED_INVALID;
+    QudaLinkType link_type = QUDA_INVALID_LINKS;
+    QudaTboundary t_boundary = QUDA_INVALID_T_BOUNDARY;
+
+    double anisotropy = 0.0;
+    double tadpole = 0.0;
+    double fat_link_max = 0.0;
+
+    mutable array<quda_ptr, 2 *QUDA_MAX_DIM> ghost
+      = {}; // stores the ghost zone of the gauge field (non-native fields only)
+
+    mutable array<int, QUDA_MAX_DIM> ghostFace = {}; // the size of each face
+
+    /**
+       The staggered phase convention to use
+    */
+    QudaStaggeredPhase staggeredPhaseType = QUDA_STAGGERED_PHASE_INVALID;
+
+    /**
+       Whether the staggered phase factor has been applied
+    */
+    bool staggeredPhaseApplied = false;
+
+    /**
+       Imaginary chemical potential
+    */
+    double i_mu = 0.0;
+
+    /**
+       Offset into MILC site struct to the desired matrix field (only if gauge_order=MILC_SITE_GAUGE_ORDER)
+    */
+    size_t site_offset = 0;
+
+    /**
+       Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER)
+    */
+    size_t site_size = 0;
+
+    /**
+       @brief Exchange the buffers across all dimensions in a given direction
+       @param[out] recv Receive buffer
+       @param[in] send Send buffer
+       @param[in] dir Direction in which we are sending (forwards OR backwards only)
+    */
+    void exchange(void **recv, void **send, QudaDirection dir) const;
+
+    /**
+       Compute the required extended ghost zone sizes and offsets
+       @param[in] R Radius of the ghost zone
+       @param[in] no_comms_fill If true we create a full halo
+       regardless of partitioning
+       @param[in] bidir Is this a bi-directional exchange - if not
+       then we alias the forwards and backwards offsets
+    */
+    void createGhostZone(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const;
+
+    /**
+       @brief Set the vol_string and aux_string for use in tuning
+    */
+    void setTuningString();
 
     /**
        @brief Initialize the padded region to 0
@@ -270,8 +248,42 @@ namespace quda {
     void zeroPad();
 
   public:
+    /**
+       @brief Default constructor
+    */
+    GaugeField() = default;
+
+    /**
+       @brief Copy constructor for creating a GaugeField from another GaugeField
+       @param field Instance of GaugeField from which we are cloning
+    */
+    GaugeField(const GaugeField &field) noexcept;
+
+    /**
+       @brief Move constructor for creating a GaugeField from another GaugeField
+       @param field Instance of GaugeField from which we are moving
+    */
+    GaugeField(GaugeField &&field) noexcept;
+
+    /**
+       @brief Constructor for creating a GaugeField from a GaugeFieldParam
+       @param param Contains the metadata for creating the field
+    */
     GaugeField(const GaugeFieldParam &param);
-    virtual ~GaugeField();
+
+    /**
+       @brief Copy assignment operator
+       @param[in] field Instance from which we are copying
+       @return Reference to this field
+     */
+    GaugeField &operator=(const GaugeField &field);
+
+    /**
+       @brief Move assignment operator
+       @param[in] field Instance from which we are moving
+       @return Reference to this field
+     */
+    GaugeField &operator=(GaugeField &&field);
 
     /**
        @brief Create the communication handlers and buffers
@@ -573,6 +585,8 @@ namespace quda {
       @param[in] the host buffer to copy from.
     */
     void copy_from_buffer(void *buffer);
+
+    friend class GaugeFieldParam;
   };
 
   /**
diff --git a/include/malloc_quda.h b/include/malloc_quda.h
index d1a7de9161..8cbc2fbb47 100644
--- a/include/malloc_quda.h
+++ b/include/malloc_quda.h
@@ -197,7 +197,7 @@ namespace quda {
 
   public:
     quda_ptr() = default;
-
+    quda_ptr(quda_ptr &&) = default;
     quda_ptr &operator=(quda_ptr &&);
 
     /**
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 61ea7ab505..8bc61c2035 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -5,67 +5,100 @@
 
 namespace quda {
 
-  GaugeFieldParam::GaugeFieldParam(const GaugeField &u) :
-    LatticeFieldParam(u),
-    nColor(u.Ncolor()),
-    nFace(u.Nface()),
-    reconstruct(u.Reconstruct()),
-    order(u.Order()),
-    fixed(u.GaugeFixed()),
-    link_type(u.LinkType()),
-    t_boundary(u.TBoundary()),
-    anisotropy(u.Anisotropy()),
-    tadpole(u.Tadpole()),
-    gauge(NULL),
-    create(QUDA_NULL_FIELD_CREATE),
-    geometry(u.Geometry()),
-    compute_fat_link_max(false),
-    staggeredPhaseType(u.StaggeredPhase()),
-    staggeredPhaseApplied(u.StaggeredPhaseApplied()),
-    i_mu(u.iMu()),
-    site_offset(u.SiteOffset()),
-    site_size(u.SiteSize())
-  { }
-
-  GaugeField::GaugeField(const GaugeFieldParam &param) :
-    LatticeField(param),
-    gauge(),
-    gauge_array {},
-    bytes(0),
-    phase_offset(0),
-    phase_bytes(0),
-    nColor(param.nColor),
-    nFace(param.nFace),
-    geometry(param.geometry),
-    site_dim(1),
-    reconstruct(param.reconstruct),
-    nInternal(reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : nColor * nColor * 2),
-    order(param.order),
-    fixed(param.fixed),
-    link_type(param.link_type),
-    t_boundary(param.t_boundary),
-    anisotropy(param.anisotropy),
-    tadpole(param.tadpole),
-    fat_link_max(link_type == QUDA_ASQTAD_FAT_LINKS ? 0.0 : 1.0),
-    create(param.create),
-    staggeredPhaseType(param.staggeredPhaseType),
-    staggeredPhaseApplied(param.staggeredPhaseApplied),
-    i_mu(param.i_mu),
-    site_offset(param.site_offset),
-    site_size(param.site_size)
+  GaugeFieldParam::GaugeFieldParam(const GaugeField &u) : LatticeFieldParam(u) { u.fill(*this); }
+
+  GaugeField::GaugeField(const GaugeFieldParam &param) : LatticeField(param)
+  {
+    create(param);
+
+    switch (param.create) {
+    case QUDA_NULL_FIELD_CREATE:
+    case QUDA_REFERENCE_FIELD_CREATE: break; // do nothing
+    case QUDA_ZERO_FIELD_CREATE: zero(); break;
+    case QUDA_COPY_FIELD_CREATE: copy(*param.field); break;
+    default: errorQuda("ERROR: create type(%d) not supported yet", param.create);
+    }
+  }
+
+  GaugeField::GaugeField(const GaugeField &u) noexcept : LatticeField(u)
+  {
+    GaugeFieldParam param;
+    u.fill(param);
+    param.create = QUDA_COPY_FIELD_CREATE;
+    create(param);
+    copy(u);
+  }
+
+  GaugeField::GaugeField(GaugeField &&u) noexcept : LatticeField(std::move(u)) { move(std::move(u)); }
+
+  GaugeField &GaugeField::operator=(const GaugeField &src)
+  {
+    if (&src != this) {
+      if (!init) { // keep current attributes unless unset
+        LatticeField::operator=(src);
+        GaugeFieldParam param;
+        src.fill(param);
+        param.create = QUDA_COPY_FIELD_CREATE;
+        create(param);
+      }
+
+      copy(src);
+    }
+    return *this;
+  }
+
+  GaugeField &GaugeField::operator=(GaugeField &&src)
+  {
+    if (&src != this) {
+      // if field not already initialized then move the field
+      if (!init) {
+        LatticeField::operator=(std::move(src));
+        move(std::move(src));
+      } else {
+        // moving into an already-created field is not supported, so we error out
+        errorQuda("Moving to already created field");
+      }
+    }
+    return *this;
+  }
+
+  void GaugeField::create(const GaugeFieldParam &param)
   {
-    if (siteSubset != QUDA_FULL_SITE_SUBSET) errorQuda("Unexpected siteSubset %d", siteSubset);
-    if (order == QUDA_NATIVE_GAUGE_ORDER) errorQuda("Invalid gauge order %d", order);
-    if (ghost_precision != precision) ghost_precision = precision; // gauge fields require matching precision
-
-    if (link_type != QUDA_COARSE_LINKS && nColor != 3)
-      errorQuda("nColor must be 3, not %d for this link type", nColor);
-    if (nDim != 4)
-      errorQuda("Number of dimensions must be 4 not %d", nDim);
-    if (link_type != QUDA_WILSON_LINKS && anisotropy != 1.0)
+    if (param.siteSubset != QUDA_FULL_SITE_SUBSET) errorQuda("Unexpected siteSubset %d", param.siteSubset);
+    if (param.order == QUDA_NATIVE_GAUGE_ORDER) errorQuda("Invalid gauge order %d", param.order);
+    if (param.GhostPrecision() != param.Precision())
+      errorQuda("Ghost precision %d doesn't match field precision %d", param.GhostPrecision(), param.Precision());
+    if (param.link_type != QUDA_COARSE_LINKS && param.nColor != 3)
+      errorQuda("nColor must be 3, not %d for this link type", param.nColor);
+    if (param.nDim != 4) errorQuda("Number of dimensions must be 4 not %d", param.nDim);
+    if (param.link_type != QUDA_WILSON_LINKS && param.anisotropy != 1.0)
       errorQuda("Anisotropy only supported for Wilson links");
-    if (link_type != QUDA_WILSON_LINKS && fixed == QUDA_GAUGE_FIXED_YES)
+    if (param.link_type != QUDA_WILSON_LINKS && param.fixed == QUDA_GAUGE_FIXED_YES)
       errorQuda("Temporal gauge fixing only supported for Wilson links");
+    if ((param.reconstruct == QUDA_RECONSTRUCT_12 || param.reconstruct == QUDA_RECONSTRUCT_8)
+        && param.link_type != QUDA_SU3_LINKS)
+      errorQuda("Cannot request a 12/8 reconstruct type without SU(3) link type");
+    if (param.reconstruct == QUDA_RECONSTRUCT_10 && param.link_type != QUDA_ASQTAD_MOM_LINKS)
+      errorQuda("10-reconstruction only supported with momentum links");
+
+    nColor = param.nColor;
+    nFace = param.nFace;
+    geometry = param.geometry;
+    reconstruct = param.reconstruct;
+    nInternal = reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : nColor * nColor * 2;
+    order = param.order;
+    fixed = param.fixed;
+    link_type = param.link_type;
+    t_boundary = param.t_boundary;
+    anisotropy = param.anisotropy;
+    tadpole = param.tadpole;
+    fat_link_max = link_type == QUDA_ASQTAD_FAT_LINKS ? 0.0 : 1.0;
+    staggeredPhaseType = param.staggeredPhaseType;
+    staggeredPhaseApplied = param.staggeredPhaseApplied;
+    i_mu = param.i_mu;
+    site_offset = param.site_offset;
+    site_size = param.site_size;
+
     if (geometry == QUDA_SCALAR_GEOMETRY) {
       real_length = volume*nInternal;
       length = 2*stride*nInternal; // two comes from being full lattice
@@ -83,18 +116,6 @@ namespace quda {
       length = 2 * (1 << nDim) * nDim * stride * nInternal; // two comes from being full lattice
     }
 
-    if ((reconstruct == QUDA_RECONSTRUCT_12 || reconstruct == QUDA_RECONSTRUCT_8) && link_type != QUDA_SU3_LINKS) {
-      errorQuda("Cannot request a 12/8 reconstruct type without SU(3) link type");
-    }
-
-    if (reconstruct == QUDA_RECONSTRUCT_10 && link_type != QUDA_ASQTAD_MOM_LINKS) {
-      errorQuda("10-reconstruction only supported with momentum links");
-    }
-
-    if (create != QUDA_NULL_FIELD_CREATE && create != QUDA_ZERO_FIELD_CREATE && create != QUDA_REFERENCE_FIELD_CREATE) {
-      errorQuda("ERROR: create type(%d) not supported yet\n", create);
-    }
-
     switch (geometry) {
     case QUDA_SCALAR_GEOMETRY: site_dim = 1; break;
     case QUDA_VECTOR_GEOMETRY: site_dim = nDim; break;
@@ -147,9 +168,8 @@ namespace quda {
     }
 
     if (isNative()) {
-      if (create != QUDA_REFERENCE_FIELD_CREATE) {
+      if (param.create != QUDA_REFERENCE_FIELD_CREATE) {
         gauge = std::move(quda_ptr(mem_type, bytes));
-        if (create == QUDA_ZERO_FIELD_CREATE) qudaMemset(gauge, 0, bytes);
       } else {
         gauge = std::move(quda_ptr(param.gauge, mem_type));
       }
@@ -157,13 +177,12 @@ namespace quda {
 
       size_t nbytes = volume * nInternal * precision;
       for (int d = 0; d < site_dim; d++) {
-        if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
+        if (param.create != QUDA_REFERENCE_FIELD_CREATE) {
           gauge_array[d] = std::move(quda_ptr(mem_type, nbytes));
-          if (create == QUDA_ZERO_FIELD_CREATE) qudaMemset(gauge_array[d], 0, nbytes);
-        } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
+        } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
           gauge_array[d] = std::move(quda_ptr(static_cast<void **>(param.gauge)[d], mem_type));
         } else {
-          errorQuda("Unsupported creation type %d", create);
+          errorQuda("Unsupported creation type %d", param.create);
         }
       }
 
@@ -172,17 +191,16 @@ namespace quda {
 	       order == QUDA_TIFR_PADDED_GAUGE_ORDER || order == QUDA_MILC_SITE_GAUGE_ORDER) {
       // does not support device
 
-      if (order == QUDA_MILC_SITE_GAUGE_ORDER && create != QUDA_REFERENCE_FIELD_CREATE) {
-	errorQuda("MILC site gauge order only supported for reference fields");
+      if (order == QUDA_MILC_SITE_GAUGE_ORDER && param.create != QUDA_REFERENCE_FIELD_CREATE) {
+        errorQuda("MILC site gauge order only supported for reference fields");
       }
 
-      if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
+      if (param.create != QUDA_REFERENCE_FIELD_CREATE) {
         gauge = std::move(quda_ptr(mem_type, bytes));
-        if (create == QUDA_ZERO_FIELD_CREATE) qudaMemset(gauge, 0, bytes);
-      } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
+      } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
         gauge = std::move(quda_ptr(param.gauge, mem_type));
       } else {
-	errorQuda("Unsupported creation type %d", create);
+        errorQuda("Unsupported creation type %d", param.create);
       }
 
     } else {
@@ -200,15 +218,17 @@ namespace quda {
           if (geometry == QUDA_COARSE_GEOMETRY) qudaMemset(ghost[i + 4], 0, nbytes);
         }
       } else {
-        if (create != QUDA_ZERO_FIELD_CREATE) zeroPad();
+        if (param.create != QUDA_ZERO_FIELD_CREATE) zeroPad();
       }
     }
 
+    init = true;
     setTuningString();
 
     // exchange the boundaries if a non-trivial field
     if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD)
-      if (create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) {
+      if (param.create == QUDA_REFERENCE_FIELD_CREATE
+          && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) {
         exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
       }
 
@@ -216,7 +236,72 @@ namespace quda {
     if (param.compute_fat_link_max) fat_link_max = this->abs_max();
   }
 
-  GaugeField::~GaugeField() { }
+  void GaugeField::move(GaugeField &&src)
+  {
+    // carry over the creation flag so the assignment operators see this field as created and src as empty
+    init = std::exchange(src.init, false);
+    gauge = std::exchange(src.gauge, {});
+    gauge_array = std::exchange(src.gauge_array, {});
+    bytes = std::exchange(src.bytes, 0);
+    phase_offset = std::exchange(src.phase_offset, 0);
+    phase_bytes = std::exchange(src.phase_bytes, 0);
+    length = std::exchange(src.length, 0);
+    real_length = std::exchange(src.real_length, 0);
+    nColor = std::exchange(src.nColor, 0);
+    nFace = std::exchange(src.nFace, 0);
+    geometry = std::exchange(src.geometry, QUDA_INVALID_GEOMETRY);
+    site_dim = std::exchange(src.site_dim, 0);
+    reconstruct = std::exchange(src.reconstruct, QUDA_RECONSTRUCT_INVALID);
+    nInternal = std::exchange(src.nInternal, 0);
+    order = std::exchange(src.order, QUDA_INVALID_GAUGE_ORDER);
+    fixed = std::exchange(src.fixed, QUDA_GAUGE_FIXED_INVALID);
+    link_type = std::exchange(src.link_type, QUDA_INVALID_LINKS);
+    t_boundary = std::exchange(src.t_boundary, QUDA_INVALID_T_BOUNDARY);
+    anisotropy = std::exchange(src.anisotropy, 0.0);
+    tadpole = std::exchange(src.tadpole, 0.0);
+    fat_link_max = std::exchange(src.fat_link_max, 0.0);
+    ghost = std::exchange(src.ghost, {});
+    ghostFace = std::exchange(src.ghostFace, {});
+    staggeredPhaseType = std::exchange(src.staggeredPhaseType, QUDA_STAGGERED_PHASE_INVALID);
+    staggeredPhaseApplied = std::exchange(src.staggeredPhaseApplied, false);
+    i_mu = std::exchange(src.i_mu, 0.0);
+    site_offset = std::exchange(src.site_offset, 0);
+    site_size = std::exchange(src.site_size, 0);
+  }
+
+  void GaugeField::fill(GaugeFieldParam &param) const
+  {
+    LatticeField::fill(param);
+    param.gauge = nullptr;
+    param.nColor = nColor;
+    param.nFace = nFace;
+    param.reconstruct = reconstruct;
+    param.order = order;
+    param.fixed = fixed;
+    param.link_type = link_type;
+    param.t_boundary = t_boundary;
+    param.anisotropy = anisotropy;
+    param.tadpole = tadpole;
+    param.create = QUDA_NULL_FIELD_CREATE;
+    param.geometry = geometry;
+    param.compute_fat_link_max = false;
+    param.staggeredPhaseType = staggeredPhaseType;
+    param.staggeredPhaseApplied = staggeredPhaseApplied;
+    param.i_mu = i_mu;
+    param.site_offset = site_offset;
+    param.site_size = site_size;
+  }
+
+  void GaugeField::setTuningString()
+  {
+    LatticeField::setTuningString();
+    std::stringstream aux_ss;
+    aux_ss << "vol=" << volume << "stride=" << stride << "precision=" << precision << "geometry=" << geometry
+           << "Nc=" << nColor;
+    if (ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED) aux_ss << "r=" << r[0] << r[1] << r[2] << r[3];
+    aux_string = aux_ss.str();
+    if (aux_string.size() >= TuneKey::aux_n / 2) errorQuda("Aux string too large %lu", aux_string.size());
+  }
 
   void GaugeField::zeroPad()
   {
@@ -230,28 +313,6 @@ namespace quda {
         qudaMemset2D(gauge, parity * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad);
       }
     }
-#if 0
-    if (location == QUDA_CUDA_FIELD_LOCATION) {
-      for (int parity = 0; parity < 2; parity++) {
-        qudaMemset2D(data<char *>() + parity * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad);
-      }
-    } else {
-      for (int parity = 0; parity < 2; parity++)
-          for (int p = 0; p < Npad; p++)
-            memset(data<char *>() + parity * (bytes / 2) + (volumeCB + p * stride) * order * precision, 0, pad_bytes);
-      }
-    }
-#endif
-  }
-
-  void GaugeField::setTuningString() {
-    LatticeField::setTuningString();
-    std::stringstream aux_ss;
-    aux_ss << "vol=" << volume << "stride=" << stride << "precision=" << precision << "geometry=" << geometry
-           << "Nc=" << nColor;
-    if (ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED) aux_ss << "r=" << r[0] << r[1] << r[2] << r[3];
-    aux_string = aux_ss.str();
-    if (aux_string.size() >= TuneKey::aux_n / 2) errorQuda("Aux string too large %lu", aux_string.size());
   }
 
   void GaugeField::createGhostZone(const lat_dim_t &R, bool no_comms_fill, bool bidir) const

From bc3dba0bb993915ca679e24df1db317ac6e626d7 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 16 May 2023 15:53:48 -0700
Subject: [PATCH 09/60] Fix some issues with staggered quark smearing

---
 lib/interface_quda.cpp               | 20 ++++++--------------
 tests/staggered_gsmear_test_utils.h  |  4 ++--
 tests/utils/host_utils.h             |  2 +-
 tests/utils/staggered_host_utils.cpp | 11 +++++++----
 4 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 5ed54e37f2..36a252d809 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3945,14 +3945,14 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
 
   checkGaugeParam(param);
 
-  GaugeFieldParam gParam(*param, inlink, QUDA_GENERAL_LINKS);
-  gParam.gauge     = twolink;
+  GaugeFieldParam gParam(*param, inlink, QUDA_ASQTAD_LONG_LINKS);
+  gParam.gauge = twolink;
   GaugeField cpuTwoLink(gParam);  // create the host twolink
   profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
 
   GaugeField *cudaInLinkEx = nullptr;
 
-  if(inlink) {
+  if (inlink) {
     gParam.link_type = param->type;
     gParam.gauge     = inlink;
     GaugeField cpuInLink(gParam);    // create the host sitelink
@@ -3961,19 +3961,13 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
     gParam.reconstruct = param->reconstruct;
     gParam.setPrecision(param->cuda_prec, true);
     gParam.create = QUDA_NULL_FIELD_CREATE;
-    GaugeField *cudaInLink = new GaugeField(gParam);
+    GaugeField cudaInLink(gParam);
     profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
 
     profileGaussianSmear.TPSTART(QUDA_PROFILE_H2D);
-    cudaInLink->copy(cpuInLink);
+    cudaInLink.copy(cpuInLink);
     profileGaussianSmear.TPSTOP(QUDA_PROFILE_H2D);
-    //
-    cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileGaussianSmear);
-    //
-    profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE);
-    delete cudaInLink;
-    profileGaussianSmear.TPSTOP(QUDA_PROFILE_FREE);
-
+    cudaInLinkEx = createExtendedGauge(cudaInLink, R, profileGaussianSmear);
   } else {
     cudaInLinkEx = createExtendedGauge(*gaugePrecise, R, profileGaussianSmear);
   }
@@ -3992,7 +3986,6 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
 
   freeUniqueGaugeQuda(QUDA_SMEARED_LINKS);
   gaugeSmeared = new GaugeField(gsParam);
-
   
   profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
 
@@ -4006,7 +3999,6 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
   profileGaussianSmear.TPSTART(QUDA_PROFILE_D2H);
   cpuTwoLink.copy(*gaugeSmeared);
   profileGaussianSmear.TPSTOP(QUDA_PROFILE_D2H);
-
   profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE);
 
   freeUniqueGaugeQuda(QUDA_SMEARED_LINKS);
diff --git a/tests/staggered_gsmear_test_utils.h b/tests/staggered_gsmear_test_utils.h
index 42bd43e293..7266844798 100644
--- a/tests/staggered_gsmear_test_utils.h
+++ b/tests/staggered_gsmear_test_utils.h
@@ -128,9 +128,9 @@ struct StaggeredGSmearTestWrapper { //
         quda::blas::ax(ftmp, tmp);
         quda::blas::axpy(a, tmp, tmp2);
 
-        staggeredTwoLinkGaussianSmear(spinorRef.Even(), qdp_twolnk, (void **)cpuTwoLink->Ghost(), tmp.Even(),
+        staggeredTwoLinkGaussianSmear(spinorRef.Even(), qdp_twolnk, *cpuTwoLink, tmp.Even(),
                                       &gauge_param, &inv_param, 0, smear_coeff, smear_t0, gauge_param.cpu_prec);
-        staggeredTwoLinkGaussianSmear(spinorRef.Odd(), qdp_twolnk, (void **)cpuTwoLink->Ghost(), tmp.Odd(),
+        staggeredTwoLinkGaussianSmear(spinorRef.Odd(), qdp_twolnk, *cpuTwoLink, tmp.Odd(),
                                       &gauge_param, &inv_param, 1, smear_coeff, smear_t0, gauge_param.cpu_prec);
 
         // blas::xpay(*tmp2, -1.0, *spinorRef);
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index 51cec06d27..88aed1f020 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -59,7 +59,7 @@ void computeLongLinkCPU(void **longlink, void **sitelink, QudaPrecision prec, vo
 void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink,
                          void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik);
 void computeTwoLinkCPU(void **twolink, void **sitelink, QudaGaugeParam *gauge_param);
-void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], void** ghost_twolnk,  quda::ColorSpinorField &in, QudaGaugeParam *qudaGaugeParam, QudaInvertParam *inv_param, const int oddBit, const double width, const int t0, QudaPrecision prec);
+void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], const quda::GaugeField &twolnk,  quda::ColorSpinorField &in, QudaGaugeParam *qudaGaugeParam, QudaInvertParam *inv_param, const int oddBit, const double width, const int t0, QudaPrecision prec);
 template <typename Float>
 void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type);
 void applyGaugeFieldScaling_long(void **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type,
diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp
index ba71e91926..365781c7d0 100644
--- a/tests/utils/staggered_host_utils.cpp
+++ b/tests/utils/staggered_host_utils.cpp
@@ -437,11 +437,14 @@ void staggeredTwoLinkGaussianSmear(sFloat *res, gFloat **twolink, gFloat **ghost
   return;
 }
 
-void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], void **ghost_twolnk,
+void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], const quda::GaugeField &twolnk,
                                    quda::ColorSpinorField &in, QudaGaugeParam * /*qudaGaugeParam*/,
                                    QudaInvertParam * /*inv_param*/, const int oddBit, const double /*width*/,
                                    const int t0, QudaPrecision prec)
 {
+  void *ghost[4];
+  for (int i = 0; i < 4; i++) ghost[i] = twolnk.Ghost()[i].data();
+
   QudaParity otherparity = QUDA_INVALID_PARITY;
   if (oddBit == QUDA_EVEN_PARITY) {
     otherparity = QUDA_ODD_PARITY;
@@ -459,19 +462,19 @@ void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk
 
   if (prec == QUDA_DOUBLE_PRECISION) {
     {
-      staggeredTwoLinkGaussianSmear((double *)out.V(), (double **)qdp_twolnk, (double **)ghost_twolnk, (double *)in.V(),
+      staggeredTwoLinkGaussianSmear((double *)out.V(), (double **)qdp_twolnk, (double **)ghost, (double *)in.V(),
                                     (double **)fwd_nbr_spinor, (double **)back_nbr_spinor, t0, oddBit);
     } 
   } else {
     {
-      staggeredTwoLinkGaussianSmear((float *)out.V(), (float **)qdp_twolnk, (float **)ghost_twolnk, (float *)in.V(),
+      staggeredTwoLinkGaussianSmear((float *)out.V(), (float **)qdp_twolnk, (float **)ghost, (float *)in.V(),
                                     (float **)fwd_nbr_spinor, (float **)back_nbr_spinor, t0, oddBit);
     }
   }
   return;
 }
 #else
-void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &, void **, void** ,  quda::ColorSpinorField&, QudaGaugeParam* , QudaInvertParam* , const int , const double , const int , QudaPrecision )
+void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &, void **, const quda::GaugeField &,  quda::ColorSpinorField&, QudaGaugeParam* , QudaInvertParam* , const int , const double , const int , QudaPrecision )
 {}
 #endif
 

From 13eb7e1d901604e87d4c3a3d6249e97ce0d3885f Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 18 May 2023 14:45:48 -0700
Subject: [PATCH 10/60] Fix HISQ force since unification, and re-enable hisq
 force ctests which were accidentally not being run

---
 tests/CMakeLists.txt                          |  2 -
 tests/hisq_paths_force_test.cpp               | 12 ++---
 tests/host_reference/hisq_force_reference.cpp | 50 +++++++++++--------
 tests/host_reference/hisq_force_reference.h   |  4 +-
 tests/utils/host_utils.cpp                    | 41 ++++++++++++++-
 tests/utils/host_utils.h                      |  1 +
 6 files changed, 76 insertions(+), 34 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 9ca912f8b3..14c1508a82 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1225,9 +1225,7 @@ foreach(prec IN LISTS TEST_PRECS)
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:unitarize_link_test> ${MPIEXEC_POSTFLAGS}
                      --dim 2 4 6 8 --prec ${prec}
                      --gtest_output=xml:unitarize_link_test_${prec}.xml)
-  endif()
 
-  if(QUDA_FORCE_HISQ)
     add_test(NAME hisq_paths_force_${prec}
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:hisq_paths_force_test> ${MPIEXEC_POSTFLAGS}
                      --dim 2 4 6 8 --prec ${prec}
diff --git a/tests/hisq_paths_force_test.cpp b/tests/hisq_paths_force_test.cpp
index 07f7b4e17b..9df6d2ec4c 100644
--- a/tests/hisq_paths_force_test.cpp
+++ b/tests/hisq_paths_force_test.cpp
@@ -333,8 +333,8 @@ static void hisq_force_startup()
 
   // initialize the CPU outer product fields and exchange once
   createStagForOprodCPU(stag_for_oprod, force_prec, qudaGaugeParam.X, *rng);
-  computeLinkOrderedOuterProduct(stag_for_oprod, cpuOprod->data(), force_prec, 1);
-  computeLinkOrderedOuterProduct(stag_for_oprod, cpuLongLinkOprod->data(), force_prec, 3);
+  computeLinkOrderedOuterProduct(stag_for_oprod, *cpuOprod, force_prec, 1);
+  computeLinkOrderedOuterProduct(stag_for_oprod, *cpuLongLinkOprod, force_prec, 3);
 
   copyExtendedGauge(*cpuOprod_ex, *cpuOprod, QUDA_CPU_FIELD_LOCATION);
   copyExtendedGauge(*cpuLongLinkOprod_ex, *cpuLongLinkOprod, QUDA_CPU_FIELD_LOCATION);
@@ -469,9 +469,7 @@ static int hisq_force_test(bool lepage)
                             getTolerance(force_prec), force_prec);
     }
 
-    strong_check_link(reinterpret_cast<void **>(hostVerifyForce->data()),
-                      "GPU results: ", reinterpret_cast<void **>(cpuForce->data()), "CPU reference results:", V,
-                      force_prec);
+    strong_check_link(*hostVerifyForce, "GPU result:", *cpuForce, "CPU reference results:");
     logQuda(QUDA_SUMMARIZE, "Lepage %s staples force test %s\n\n", lepage ? "enabled" : "disabled",
             (1 == res) ? "PASSED" : "FAILED");
   }
@@ -506,9 +504,7 @@ static int hisq_force_test(bool lepage)
                               getTolerance(force_prec), force_prec);
       }
 
-      strong_check_link(reinterpret_cast<void **>(hostVerifyForce->data()),
-                        "GPU results: ", reinterpret_cast<void **>(cpuForce->data()), "CPU reference results:", V,
-                        force_prec);
+      strong_check_link(*hostVerifyForce, "GPU results: ", *cpuForce, "CPU reference results:");
       logQuda(QUDA_SUMMARIZE, "Long link force test %s\n\n", (1 == res) ? "PASSED" : "FAILED");
     }
   }
diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp
index 58c6762e70..d0cfc197a2 100644
--- a/tests/host_reference/hisq_force_reference.cpp
+++ b/tests/host_reference/hisq_force_reference.cpp
@@ -84,9 +84,9 @@ typedef struct {
   double space;
 } danti_hermitmat;
 
-template <typename su3_matrix> su3_matrix *get_su3_matrix(su3_matrix *p, int idx, int dir)
+template <typename su3_matrix> su3_matrix *get_su3_matrix(quda::GaugeField &p, int idx, int dir)
 {
-  su3_matrix *data = ((su3_matrix **)p)[dir];
+  auto data = static_cast<su3_matrix*>(p.data(dir));
   return data + idx;
 }
 
@@ -96,8 +96,8 @@ template <typename su3_vector, typename su3_matrix> void su3_projector(su3_vecto
     for (int j = 0; j < 3; j++) CMUL_J(a->c[i], b->c[j], c->e[i][j]);
 }
 
-template <typename su3_vector, typename su3_matrix>
-void computeLinkOrderedOuterProduct(su3_vector *src, su3_matrix *dest, size_t nhops)
+template <typename su3_matrix, typename su3_vector>
+void computeLinkOrderedOuterProduct(su3_vector *src, quda::GaugeField &dest, size_t nhops)
 {
   int dx[4];
   for (int i = 0; i < V; ++i) {
@@ -106,18 +106,18 @@ void computeLinkOrderedOuterProduct(su3_vector *src, su3_matrix *dest, size_t nh
       dx[dir] = nhops;
       int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
       su3_vector *hw = src + nbr_idx;
-      su3_matrix *p = get_su3_matrix(dest, i, dir);
+      su3_matrix *p = get_su3_matrix<su3_matrix>(dest, i, dir);
       su3_projector(hw, &src[i], p);
     } // dir
   }   // i
 }
 
-void computeLinkOrderedOuterProduct(void *src, void *dst, QudaPrecision precision, size_t nhops)
+void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dst, QudaPrecision precision, size_t nhops)
 {
   if (precision == QUDA_SINGLE_PRECISION) {
-    computeLinkOrderedOuterProduct((fsu3_vector *)src, (fsu3_matrix *)dst, nhops);
+    computeLinkOrderedOuterProduct<fsu3_matrix>((fsu3_vector *)src, dst, nhops);
   } else {
-    computeLinkOrderedOuterProduct((dsu3_vector *)src, (dsu3_matrix *)dst, nhops);
+    computeLinkOrderedOuterProduct<dsu3_matrix>((dsu3_vector *)src, dst, nhops);
   }
 }
 
@@ -1222,12 +1222,15 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda
   act_path_coeff.seven = path_coeff[4];
   act_path_coeff.lepage = path_coeff[5];
 
+  void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)};
+  void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)};
+  void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)};
   if (precision == QUDA_DOUBLE_PRECISION) {
-    doHisqStaplesForceCPU<double>(X_, act_path_coeff, oprod.data<double *>(), link.data<double *>(),
-                                  (double **)tempmat, newOprod->data<double *>());
+    doHisqStaplesForceCPU<double>(X_, act_path_coeff, reinterpret_cast<double*>(oprod_array), reinterpret_cast<double*>(link_array),
+                                  (double **)tempmat, reinterpret_cast<double*>(noprod_array));
   } else if (precision == QUDA_SINGLE_PRECISION) {
-    doHisqStaplesForceCPU<float>(X_, act_path_coeff, oprod.data<float *>(), (float *)link.data<float *>(),
-                                 (float **)tempmat, newOprod->data<float *>());
+    doHisqStaplesForceCPU<float>(X_, act_path_coeff, reinterpret_cast<float*>(oprod_array), reinterpret_cast<float*>(link_array),
+                                 (float **)tempmat, reinterpret_cast<float*>(noprod_array));
   } else {
     errorQuda("Unsupported precision");
   }
@@ -1308,15 +1311,18 @@ void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeFiel
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
   QudaPrecision precision = oprod.Precision();
 
+  void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)};
+  void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)};
+  void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)};
   for (int sig = 0; sig < 4; ++sig) {
     if (precision == QUDA_SINGLE_PRECISION) {
-      computeLongLinkField<float>(X_, oprod.data<float *>(), link.data<float *>(), sig, coeff,
-                                  newOprod->data<float *>());
+      computeLongLinkField<float>(X_, reinterpret_cast<float*>(oprod_array), reinterpret_cast<float*>(link_array),
+                                  sig, coeff, reinterpret_cast<float*>(noprod_array));
     } else if (precision == QUDA_DOUBLE_PRECISION) {
-      computeLongLinkField<double>(X_, oprod.data<double *>(), link.data<double *>(), sig, coeff,
-                                   newOprod->data<double *>());
+      computeLongLinkField<double>(X_, reinterpret_cast<double*>(oprod_array), reinterpret_cast<double*>(link_array),
+                                   sig, coeff, reinterpret_cast<double*>(noprod_array));
     } else {
-      errorQuda("Unrecognised precision\n");
+      errorQuda("Unrecognised precision");
     }
   } // sig
 }
@@ -1366,13 +1372,17 @@ void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda:
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
   QudaPrecision precision = oprod.Precision();
 
+  void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)};
+  void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)};
   for (int sig = 0; sig < 4; ++sig) {
     if (precision == QUDA_SINGLE_PRECISION) {
-      completeForceField<float>(X_, oprod.data<float *>(), link.data<float *>(), sig, mom->data<float *>());
+      completeForceField<float>(X_, reinterpret_cast<float*>(oprod_array), reinterpret_cast<float*>(link_array),
+                                sig, mom->data<float*>());
     } else if (precision == QUDA_DOUBLE_PRECISION) {
-      completeForceField<double>(X_, oprod.data<double *>(), link.data<double *>(), sig, mom->data<double *>());
+      completeForceField<double>(X_, reinterpret_cast<double*>(oprod_array), reinterpret_cast<double*>(link_array),
+                                 sig, mom->data<double*>());
     } else {
-      errorQuda("Unrecognised precision\n");
+      errorQuda("Unrecognised precision");
     }
   } // loop over sig
 }
diff --git a/tests/host_reference/hisq_force_reference.h b/tests/host_reference/hisq_force_reference.h
index fb8b773f84..da6a8b770e 100644
--- a/tests/host_reference/hisq_force_reference.h
+++ b/tests/host_reference/hisq_force_reference.h
@@ -8,11 +8,11 @@
 /**
    @brief Compute a staggered spinor outer product for some offset, CPU version
    @param[in] src Pointer to an appropriately sized host staggered spinor field
-   @param[out] dest Pointer to an appropriately sized output outer product field
+   @param[out] dest Reference to a gauge field for the outer product
    @param[in] precision Precision of data (single or double)
    @param[in] separation Offset for outer product (1 for fat links, 3 for long links)
 */
-void computeLinkOrderedOuterProduct(void *src, void *dest, QudaPrecision precision, size_t separation);
+void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dest, QudaPrecision precision, size_t separation);
 
 /**
    @brief Compute the force contribution from the fat links, CPU version
diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp
index 70421fb118..09ec26f415 100644
--- a/tests/utils/host_utils.cpp
+++ b/tests/utils/host_utils.cpp
@@ -1493,6 +1493,21 @@ static int compare_link(void **linkA, void **linkB, int len, QudaPrecision preci
   return ret;
 }
 
+static int compare_link(const GaugeField &linkA, const GaugeField &linkB)
+{
+  int ret;
+
+  void *a[] = {linkA.data(0), linkA.data(1), linkA.data(2), linkA.data(3)};
+  void *b[] = {linkB.data(0), linkB.data(1), linkB.data(2), linkB.data(3)};
+  if (checkPrecision(linkA, linkB) == QUDA_DOUBLE_PRECISION) {
+    ret = compareLink((double **)a, (double **)b, linkA.Volume());
+  } else {
+    ret = compareLink((float **)a, (float **)b, linkA.Volume());
+  }
+
+  return ret;
+}
+
 // X indexes the lattice site
 static void printLinkElement(void *link, int X, QudaPrecision precision)
 {
@@ -1524,8 +1539,30 @@ int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *
     printfQuda("\n");
   }
 
-  int ret = compare_link(linkA, linkB, len, prec);
-  return ret;
+  return compare_link(linkA, linkB, len, prec);
+}
+
+int strong_check_link(const GaugeField &linkA, const std::string &msgA, const GaugeField &linkB, const std::string &msgB)
+{
+  if (verbosity >= QUDA_VERBOSE) {
+    printfQuda("%s\n", msgA.c_str());
+    printLinkElement(linkA.data(0), 0, linkA.Precision()); // use each field's own precision; this overload has no prec argument
+    printfQuda("\n");
+    printLinkElement(linkA.data(0), 1, linkA.Precision());
+    printfQuda("...\n");
+    printLinkElement(linkA.data(3), linkA.Volume() - 1, linkA.Precision());
+    printfQuda("\n");
+
+    printfQuda("\n%s\n", msgB.c_str());
+    printLinkElement(linkB.data(0), 0, linkB.Precision());
+    printfQuda("\n");
+    printLinkElement(linkB.data(0), 1, linkB.Precision());
+    printfQuda("...\n");
+    printLinkElement(linkB.data(3), linkB.Volume() - 1, linkB.Precision());
+    printfQuda("\n");
+  }
+
+  return compare_link(linkA, linkB);
 }
 
 void createMomCPU(void *mom, QudaPrecision precision)
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index 88aed1f020..d6eb26304f 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -183,6 +183,7 @@ double compare_floats_v2(void *a, void *b, int len, double epsilon, QudaPrecisio
 void check_gauge(void **, void **, double epsilon, QudaPrecision precision);
 
 int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec);
+int strong_check_link(const quda::GaugeField &linkA, const std::string &msgA, const quda::GaugeField &linkB, const std::string &msgB);
 int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec);
 
 /**

From e538fa028164647f2079c76f932adf9ed30f9d43 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 18 May 2023 14:56:05 -0700
Subject: [PATCH 11/60] Commenced use of new GaugeField features (default
 constructor, move and copy assignment) to clean up interface_quda.cpp.  Added
 new profile stack to allow for autoprofiling while also dramatically reducing
 LOC in the interface.  Work in progress

---
 include/gauge_field.h       |  15 +-
 include/timer.h             |   6 +
 lib/gauge_field.cpp         |  29 ++-
 lib/gauge_random.cu         |   8 +
 lib/gauge_update_quda.cu    |   3 +
 lib/interface_quda.cpp      | 440 +++++++++---------------------------
 lib/momentum.cu             |   7 +
 lib/staggered_oprod.cu      |   3 +
 lib/targets/cuda/malloc.cpp |   9 +
 lib/timer.cpp               |  22 ++
 10 files changed, 200 insertions(+), 342 deletions(-)

diff --git a/include/gauge_field.h b/include/gauge_field.h
index 71e1628370..52a4a40b06 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -40,11 +40,11 @@ namespace quda {
     int nColor = 3;
     int nFace = 0;
 
-    QudaReconstructType reconstruct = QUDA_RECONSTRUCT_NO;
     QudaGaugeFieldOrder order = QUDA_INVALID_GAUGE_ORDER;
     QudaGaugeFixed fixed = QUDA_GAUGE_FIXED_NO;
     QudaLinkType link_type = QUDA_WILSON_LINKS;
     QudaTboundary t_boundary = QUDA_INVALID_T_BOUNDARY;
+    QudaReconstructType reconstruct = QUDA_RECONSTRUCT_NO;
 
     double anisotropy = 1.0;
     double tadpole = 1.0;
@@ -95,6 +95,9 @@ namespace quda {
       fixed(param.gauge_fix),
       link_type(link_type_ != QUDA_INVALID_LINKS ? link_type_ : param.type),
       t_boundary(param.t_boundary),
+      // if we have momentum field and not using TIFR field, then we always have recon-10
+      reconstruct(link_type == QUDA_ASQTAD_MOM_LINKS && order != QUDA_TIFR_GAUGE_ORDER && order != QUDA_TIFR_PADDED_GAUGE_ORDER ?
+                  QUDA_RECONSTRUCT_10 : QUDA_RECONSTRUCT_NO),
       anisotropy(param.anisotropy),
       tadpole(param.tadpole_coeff),
       gauge(h_gauge),
@@ -556,6 +559,16 @@ namespace quda {
     */
     static GaugeField* Create(const GaugeFieldParam &param);
 
+    /**
+       @brief Create a field that aliases this field's storage.  The
+       alias field can use a different precision than this field,
+       though it cannot be greater.  This functionality is useful for
+       the case where we have multiple temporaries in different
+       precisions, but do not need them simultaneously.  Use this functionality with caution.
+       @param[in] param Parameters for the alias field
+    */
+    GaugeField create_alias(const GaugeFieldParam &param = GaugeFieldParam());
+
     /**
       @brief If managed memory and prefetch is enabled, prefetch
       the gauge field and buffers to the CPU or the GPU
diff --git a/include/timer.h b/include/timer.h
index 4c1557b7ce..20b9df45ff 100644
--- a/include/timer.h
+++ b/include/timer.h
@@ -296,6 +296,12 @@ namespace quda {
 
   static TimeProfile dummy("dummy");
 
+  void pushProfile(TimeProfile &profile);
+
+  void popProfile();
+
+  TimeProfile& getProfile();
+
 } // namespace quda
 
 #undef PUSH_RANGE
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 8bc61c2035..e8c4994670 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -227,8 +227,7 @@ namespace quda {
 
     // exchange the boundaries if a non-trivial field
     if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD)
-      if (param.create == QUDA_REFERENCE_FIELD_CREATE
-          && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) {
+      if (param.create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) {
         exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
       }
 
@@ -939,6 +938,13 @@ namespace quda {
 
   void GaugeField::copy(const GaugeField &src)
   {
+    auto &profile = getProfile();
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
+      profile.TPSTART(QUDA_PROFILE_D2H);
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
+      profile.TPSTART(QUDA_PROFILE_H2D);
+    }
+
     if (this == &src) return;
 
     checkField(src);
@@ -1104,7 +1110,12 @@ namespace quda {
     staggeredPhaseApplied = src.StaggeredPhaseApplied();
     staggeredPhaseType = src.StaggeredPhase();
 
-    qudaDeviceSynchronize(); // include sync here for accurate host-device profiling
+    if (src.Location() != location) qudaDeviceSynchronize(); // include sync here for accurate host-device profiling
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
+      profile.TPSTOP(QUDA_PROFILE_D2H);
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
+      profile.TPSTOP(QUDA_PROFILE_H2D);
+    }
   }
 
   std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param)
@@ -1195,11 +1206,19 @@ namespace quda {
 
   GaugeField* GaugeField::Create(const GaugeFieldParam &param) { return new GaugeField(param); }
 
+  GaugeField GaugeField::create_alias(const GaugeFieldParam &param_)
+  {
+    if (param_.init && param_.Precision() > precision)
+      errorQuda("Cannot create an alias to source with lower precision than the alias");
+    GaugeFieldParam param = param_.init ? param_ : GaugeFieldParam(*this);
+    param.create = QUDA_REFERENCE_FIELD_CREATE;
+    return GaugeField(param);
+  }
+
   // helper for creating extended gauge fields
   GaugeField *createExtendedGauge(GaugeField &in, const lat_dim_t &R, TimeProfile &profile,
                                   bool redundant_comms, QudaReconstructType recon)
   {
-    profile.TPSTART(QUDA_PROFILE_INIT);
     GaugeFieldParam gParamEx(in);
     //gParamEx.location = QUDA_CUDA_FIELD_LOCATION;
     gParamEx.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
@@ -1219,8 +1238,6 @@ namespace quda {
     // copy input field into the extended device gauge field
     copyExtendedGauge(*out, in, QUDA_CUDA_FIELD_LOCATION); // wrong location if both fields cpu
 
-    profile.TPSTOP(QUDA_PROFILE_INIT);
-
     // now fill up the halos
     out->exchangeExtendedGhost(R, profile, redundant_comms);
 
diff --git a/lib/gauge_random.cu b/lib/gauge_random.cu
index 0e056d305b..f3bfe8e22c 100644
--- a/lib/gauge_random.cu
+++ b/lib/gauge_random.cu
@@ -4,6 +4,7 @@
 #include <instantiate.h>
 #include <tunable_nd.h>
 #include <kernels/gauge_random.cuh>
+#include "timer.h"
 
 namespace quda {
 
@@ -55,19 +56,26 @@ namespace quda {
     if (U.LinkType() != QUDA_SU3_LINKS && U.LinkType() != QUDA_MOMENTUM_LINKS)
       errorQuda("Unexpected link type %d", U.LinkType());
 
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<GaugeGauss, ReconstructFull>(U, rng, sigma);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
 
     // ensure multi-gpu consistency if required
+    getProfile().TPSTART(QUDA_PROFILE_COMMS);
     if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_EXTENDED) {
       U.exchangeExtendedGhost(U.R());
     } else if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) {
       U.exchangeGhost();
     }
+    getProfile().TPSTOP(QUDA_PROFILE_COMMS);
   }
 
   void gaugeGauss(GaugeField &U, unsigned long long seed, double sigma)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMMS);
     RNG randstates(U, seed);
+    getProfile().TPSTOP(QUDA_PROFILE_COMMS);
+
     gaugeGauss(U, randstates, sigma);
   }
 
diff --git a/lib/gauge_update_quda.cu b/lib/gauge_update_quda.cu
index 0fdcb17387..78c4b47f4a 100644
--- a/lib/gauge_update_quda.cu
+++ b/lib/gauge_update_quda.cu
@@ -2,6 +2,7 @@
 #include <tunable_nd.h>
 #include <instantiate.h>
 #include <kernels/gauge_update.cuh>
+#include "timer.h"
 
 namespace quda {
 
@@ -61,11 +62,13 @@ namespace quda {
 
   void updateGaugeField(GaugeField &out, double dt, const GaugeField& in, const GaugeField& mom, bool conj_mom, bool exact)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     checkPrecision(out, in, mom);
     checkLocation(out, in, mom);
     checkReconstruct(out, in);
     if (mom.Reconstruct() != QUDA_RECONSTRUCT_10) errorQuda("Reconstruction type %d not supported", mom.Reconstruct());
     instantiate<UpdateGaugeField,ReconstructNo12>(out, in, mom, dt, conj_mom, exact);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 36a252d809..a8351fd35e 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -101,7 +101,7 @@ CloverField *cloverPrecondition = nullptr;
 CloverField *cloverRefinement = nullptr;
 CloverField *cloverEigensolver = nullptr;
 
-GaugeField *momResident = nullptr;
+GaugeField momResident;
 GaugeField *extendedGaugeResident = nullptr;
 
 std::vector<ColorSpinorField> solutionResident;
@@ -1379,8 +1379,6 @@ void endQuda(void)
 
   solutionResident.clear();
 
-  if(momResident) delete momResident;
-
   LatticeField::freeGhostBuffer();
   ColorSpinorField::freeGhostBuffer();
   FieldTmp<ColorSpinorField>::destroy();
@@ -4011,72 +4009,38 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
 int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int* path_length,
 			  double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam)
 {
-  profileGaugeForce.TPSTART(QUDA_PROFILE_TOTAL);
-  profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
-
+  pushProfile(profileGaugeForce);
   checkGaugeParam(qudaGaugeParam);
 
   GaugeFieldParam gParam(*qudaGaugeParam, siteLink);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  gParam.site_offset = qudaGaugeParam->gauge_offset;
-  gParam.site_size = qudaGaugeParam->site_size;
-  GaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new GaugeField(gParam) : nullptr;
-
-  GaugeField* cudaSiteLink = nullptr;
-
-  if (qudaGaugeParam->use_resident_gauge) {
-    if (!gaugePrecise) errorQuda("No resident gauge field to use");
-    cudaSiteLink = gaugePrecise;
-  } else {
-    gParam.create = QUDA_NULL_FIELD_CREATE;
-    gParam.reconstruct = qudaGaugeParam->reconstruct;
-    gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
-    gParam.location = QUDA_CUDA_FIELD_LOCATION;
-
-    cudaSiteLink = new GaugeField(gParam);
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
-
-    profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
-    cudaSiteLink->copy(*cpuSiteLink);
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
+  GaugeField cpuSiteLink = !qudaGaugeParam->use_resident_gauge ? GaugeField(gParam) : GaugeField();
 
-    profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
-  }
+  if (qudaGaugeParam->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use");
+  gParam.create = QUDA_COPY_FIELD_CREATE;
+  gParam.field = &cpuSiteLink;
+  gParam.reconstruct = qudaGaugeParam->reconstruct;
+  gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
+  gParam.location = QUDA_CUDA_FIELD_LOCATION;
+  GaugeField cudaSiteLink = qudaGaugeParam->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
 
   GaugeFieldParam gParamMom(*qudaGaugeParam, mom, QUDA_ASQTAD_MOM_LINKS);
   gParamMom.location = QUDA_CPU_FIELD_LOCATION;
-  if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER)
-    gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
-  else
-    gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
 
-  gParamMom.site_offset = qudaGaugeParam->mom_offset;
-  gParamMom.site_size = qudaGaugeParam->site_size;
-  GaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new GaugeField(gParamMom) : nullptr;
+  GaugeField cpuMom = !qudaGaugeParam->use_resident_mom ? GaugeField(gParamMom) : GaugeField();
 
-  GaugeField* cudaMom = nullptr;
-  if (qudaGaugeParam->use_resident_mom) {
-    if (!momResident) errorQuda("No resident momentum field to use");
-    cudaMom = momResident;
-    if (qudaGaugeParam->overwrite_mom) cudaMom->zero();
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
-  } else {
-    gParamMom.location = QUDA_CUDA_FIELD_LOCATION;
-    gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
-    gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
-    gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
-    gParamMom.setPrecision(qudaGaugeParam->cuda_prec, true);
-    gParamMom.create = QUDA_ZERO_FIELD_CREATE;
-    cudaMom = new GaugeField(gParamMom);
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
-    if (!qudaGaugeParam->overwrite_mom) {
-      profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
-      cudaMom->copy(*cpuMom);
-      profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
-    }
-  }
+  if (qudaGaugeParam->use_resident_mom && !momResident.Volume()) errorQuda("No resident momentum field to use");
+  gParamMom.location = QUDA_CUDA_FIELD_LOCATION;
+  gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_COPY_FIELD_CREATE;
+  gParamMom.field = &cpuMom;
+  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
+  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
+  gParamMom.setPrecision(qudaGaugeParam->cuda_prec, true);
 
-  GaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugeForce);
+  GaugeField cudaMom = qudaGaugeParam->use_resident_mom ? momResident.create_alias() : GaugeField(gParamMom);
+  if (qudaGaugeParam->use_resident_mom && qudaGaugeParam->overwrite_mom) cudaMom.zero();
+
+  GaugeField *cudaGauge = createExtendedGauge(cudaSiteLink, R, profileGaugeForce);
   // apply / remove phase as appropriate
   if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase();
 
@@ -4095,41 +4059,26 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
   // actually do the computation
   profileGaugeForce.TPSTART(QUDA_PROFILE_COMPUTE);
   if (!forceMonitor()) {
-    gaugeForce(*cudaMom, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
+    gaugeForce(cudaMom, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
   } else {
     // if we are monitoring the force, separate the force computation from the momentum update
-    GaugeFieldParam gParam(*cudaMom);
+    GaugeFieldParam gParam(cudaMom);
     gParam.create = QUDA_ZERO_FIELD_CREATE;
-    GaugeField *force = GaugeField::Create(gParam);
-    gaugeForce(*force, *cudaGauge, 1.0, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
-    updateMomentum(*cudaMom, eb3, *force, "gauge");
-    delete force;
+    GaugeField force(gParam);
+    gaugeForce(force, *cudaGauge, 1.0, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
+    updateMomentum(cudaMom, eb3, force, "gauge");
   }
   profileGaugeForce.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-  if (qudaGaugeParam->return_result_mom) {
-    profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
-    cpuMom->copy(*cudaMom);
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
-  }
+  if (qudaGaugeParam->return_result_mom) cpuMom.copy(cudaMom);
 
-  profileGaugeForce.TPSTART(QUDA_PROFILE_FREE);
   if (qudaGaugeParam->make_resident_gauge) {
-    if (gaugePrecise && gaugePrecise != cudaSiteLink) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-    gaugePrecise = cudaSiteLink;
-  } else {
-    delete cudaSiteLink;
-  }
-
-  if (qudaGaugeParam->make_resident_mom) {
-    if (momResident && momResident != cudaMom) delete momResident;
-    momResident = cudaMom;
-  } else {
-    delete cudaMom;
+    if (gaugePrecise && !qudaGaugeParam->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    std::exchange(*gaugePrecise, cudaSiteLink);
   }
 
-  if (cpuSiteLink) delete cpuSiteLink;
-  if (cpuMom) delete cpuMom;
+  if (qudaGaugeParam->make_resident_mom && !qudaGaugeParam->use_resident_gauge) std::exchange(momResident, cudaMom);
+  else momResident = GaugeField();
 
   if (qudaGaugeParam->make_resident_gauge) {
     if (extendedGaugeResident) delete extendedGaugeResident;
@@ -4137,24 +4086,19 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
   } else {
     delete cudaGauge;
   }
-  profileGaugeForce.TPSTOP(QUDA_PROFILE_FREE);
 
-  profileGaugeForce.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
   return 0;
 }
 
 int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff,
                          int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam)
 {
-  profileGaugePath.TPSTART(QUDA_PROFILE_TOTAL);
-  profileGaugePath.TPSTART(QUDA_PROFILE_INIT);
-
+  pushProfile(profileGaugePath);
   checkGaugeParam(qudaGaugeParam);
 
   GaugeFieldParam gParam(*qudaGaugeParam, siteLink);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  gParam.site_offset = qudaGaugeParam->gauge_offset;
-  gParam.site_size = qudaGaugeParam->site_size;
   GaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new GaugeField(gParam) : nullptr;
 
   GaugeField *cudaSiteLink = nullptr;
@@ -4169,30 +4113,19 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
     gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
 
     cudaSiteLink = new GaugeField(gParam);
-    profileGaugePath.TPSTOP(QUDA_PROFILE_INIT);
-
-    profileGaugePath.TPSTART(QUDA_PROFILE_H2D);
     cudaSiteLink->copy(*cpuSiteLink);
-    profileGaugePath.TPSTOP(QUDA_PROFILE_H2D);
-
-    profileGaugePath.TPSTART(QUDA_PROFILE_INIT);
   }
 
   GaugeFieldParam gParamOut(*qudaGaugeParam, out);
   gParamOut.location = QUDA_CPU_FIELD_LOCATION;
-  gParamOut.site_offset = qudaGaugeParam->gauge_offset;
-  gParamOut.site_size = qudaGaugeParam->site_size;
   GaugeField *cpuOut = new GaugeField(gParamOut);
   gParamOut.location = QUDA_CUDA_FIELD_LOCATION;
   gParamOut.create = qudaGaugeParam->overwrite_gauge ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
   gParamOut.reconstruct = QUDA_RECONSTRUCT_NO;
   gParamOut.setPrecision(qudaGaugeParam->cuda_prec, true);
   GaugeField *cudaOut = new GaugeField(gParamOut);
-  profileGaugePath.TPSTOP(QUDA_PROFILE_INIT);
   if (!qudaGaugeParam->overwrite_gauge) {
-    profileGaugePath.TPSTART(QUDA_PROFILE_H2D);
     cudaOut->copy(*cpuOut);
-    profileGaugePath.TPSTOP(QUDA_PROFILE_H2D);
   }
 
   GaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugePath);
@@ -4216,11 +4149,8 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
   gaugePath(*cudaOut, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
   profileGaugePath.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-  profileGaugePath.TPSTART(QUDA_PROFILE_D2H);
   cpuOut->copy(*cudaOut);
-  profileGaugePath.TPSTOP(QUDA_PROFILE_D2H);
 
-  profileGaugePath.TPSTART(QUDA_PROFILE_FREE);
   if (qudaGaugeParam->make_resident_gauge) {
     if (gaugePrecise && gaugePrecise != cudaSiteLink) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
     gaugePrecise = cudaSiteLink;
@@ -4235,66 +4165,46 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
 
   if (cpuSiteLink) delete cpuSiteLink;
   if (cpuOut) delete cpuOut;
-  profileGaugePath.TPSTOP(QUDA_PROFILE_FREE);
 
-  profileGaugePath.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
   return 0;
 }
 
 void momResidentQuda(void *mom, QudaGaugeParam *param)
 {
-  profileGaugeForce.TPSTART(QUDA_PROFILE_TOTAL);
-  profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
-
+  pushProfile(profileGaugeForce); // starts the TOTAL timer and makes profileGaugeForce the active profile
   checkGaugeParam(param);
 
   GaugeFieldParam gParamMom(*param, mom, QUDA_ASQTAD_MOM_LINKS);
   gParamMom.location = QUDA_CPU_FIELD_LOCATION;
-  if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER)
-    gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
-  else
-    gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
-  gParamMom.site_offset = param->mom_offset;
-  gParamMom.site_size = param->site_size;
 
   GaugeField cpuMom(gParamMom);
 
   if (param->make_resident_mom && !param->return_result_mom) {
-    if (momResident) delete momResident;
     gParamMom.location = QUDA_CUDA_FIELD_LOCATION;
     gParamMom.create = QUDA_NULL_FIELD_CREATE;
     gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
     gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
     gParamMom.setPrecision(param->cuda_prec, true);
     gParamMom.create = QUDA_ZERO_FIELD_CREATE;
-    momResident = new GaugeField(gParamMom);
+    momResident = GaugeField(gParamMom); // momResident is now held by value; assignment replaces any previous contents
   } else if (param->return_result_mom && !param->make_resident_mom) {
-    if (!momResident) errorQuda("No resident momentum to return");
+    if (!momResident.Volume()) errorQuda("No resident momentum to return"); // zero volume is treated as "nothing resident"
   } else {
     errorQuda("Unexpected combination make_resident_mom = %d return_result_mom = %d", param->make_resident_mom,
               param->return_result_mom);
   }
 
-  profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
-
   if (param->make_resident_mom) {
     // we are downloading the momentum from the host
-    profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
-    momResident->copy(cpuMom);
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
+    momResident.copy(cpuMom); // explicit H2D timer calls are dropped by this patch
   } else if (param->return_result_mom) {
     // we are uploading the momentum to the host
-    profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
-    cpuMom.copy(*momResident);
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
-
-    profileGaugeForce.TPSTART(QUDA_PROFILE_FREE);
-    delete momResident;
-    momResident = nullptr;
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_FREE);
+    cpuMom.copy(momResident);
+    momResident = GaugeField(); // reset to a default-constructed field once the result has been returned
   }
 
-  profileGaugeForce.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile(); // stops the TOTAL timer of the profile pushed at function entry
 }
 
 void createCloverQuda(QudaInvertParam* invertParam)
@@ -4381,8 +4291,7 @@ void destroyGaugeFieldQuda(void *gauge)
 void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, void **, QudaGaugeParam *gauge_param,
                                QudaInvertParam *inv_param)
 {
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_TOTAL);
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT);
+  pushProfile(profileStaggeredForce);
 
   GaugeFieldParam gParam(*gauge_param, h_mom, QUDA_ASQTAD_MOM_LINKS);
 
@@ -4393,12 +4302,14 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   GaugeField cpuMom(gParam);
 
   // create the device momentum field
+  if (gauge_param->use_resident_mom && !momResident.Volume()) errorQuda("Cannot use resident momentum field since none appears resident");
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
-  gParam.create = QUDA_ZERO_FIELD_CREATE; // FIXME
+  gParam.create = QUDA_COPY_FIELD_CREATE;
+  gParam.field = &cpuMom;
   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
-  GaugeField *cudaMom = !gauge_param->use_resident_mom ? new GaugeField(gParam) : nullptr;
+  GaugeField cudaMom = gauge_param->use_resident_mom ? momResident.create_alias() : GaugeField(gParam);
 
   // create temporary field for quark-field outer product
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
@@ -4407,6 +4318,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   GaugeField cudaForce(gParam);
   GaugeField *cudaForce_[2] = {&cudaForce};
 
+  profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT);
   ColorSpinorParam qParam;
   qParam.location = QUDA_CUDA_FIELD_LOCATION;
   qParam.nColor = 3;
@@ -4421,25 +4333,11 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   qParam.x[4] = 1;
   qParam.create = QUDA_NULL_FIELD_CREATE;
   qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
-
   profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT);
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_H2D);
-
-  if (gauge_param->use_resident_mom) {
-    if (!momResident) errorQuda("Cannot use resident momentum field since none appears resident");
-    cudaMom = momResident;
-  } else {
-    // download the initial momentum (FIXME make an option just to return?)
-    cudaMom->copy(cpuMom);
-  }
 
   // resident gauge field is required
-  if (!gauge_param->use_resident_gauge || !gaugePrecise)
-    errorQuda("Resident gauge field is required");
-
-  if (!gaugePrecise->StaggeredPhaseApplied()) {
-    errorQuda("Gauge field requires the staggered phase factors to be applied");
-  }
+  if (!gauge_param->use_resident_gauge || !gaugePrecise) errorQuda("Resident gauge field is required");
+  if (!gaugePrecise->StaggeredPhaseApplied()) errorQuda("Gauge field requires the staggered phase factors to be applied");
 
   // check if staggered phase is the desired one
   if (gauge_param->staggered_phase_type != gaugePrecise->StaggeredPhase()) {
@@ -4447,12 +4345,11 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
               gauge_param->staggered_phase_type, gaugePrecise->StaggeredPhase());
   }
 
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_H2D);
   profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT);
 
   const int nvector = inv_param->num_offset;
   std::vector<ColorSpinorField*> X(nvector);
-  for ( int i=0; i<nvector; i++) X[i] = ColorSpinorField::Create(qParam);
+  for (int i=0; i<nvector; i++) X[i] = ColorSpinorField::Create(qParam);
 
   if (inv_param->use_resident_solution) {
     if (solutionResident.size() < (unsigned int)nvector)
@@ -4484,15 +4381,13 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   }
 
   profileStaggeredForce.TPSTOP(QUDA_PROFILE_PREAMBLE);
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE);
 
+  profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE);
 #if 0
   if (inv_param->use_resident_solution) solutionResident.clear();
 #endif
   delete dirac;
-
   profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE);
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_COMPUTE);
 
   // compute quark-field outer product
   for (int i=0; i<nvector; i++) {
@@ -4506,31 +4401,20 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
 
   // mom += delta * [U * force]TA
   applyU(cudaForce, *gaugePrecise);
-  updateMomentum(*cudaMom, dt * delta, cudaForce, "staggered");
+  updateMomentum(cudaMom, dt * delta, cudaForce, "staggered");
   qudaDeviceSynchronize();
 
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_COMPUTE);
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_D2H);
+  // copy the momentum field back to the host
+  if (gauge_param->return_result_mom) cpuMom.copy(cudaMom);
 
-  if (gauge_param->return_result_mom) {
-    // copy the momentum field back to the host
-    cpuMom.copy(*cudaMom);
-  }
+  if (gauge_param->make_resident_mom && !gauge_param->use_resident_mom) std::exchange(momResident, cudaMom);
+  else if (!gauge_param->make_resident_mom) momResident = GaugeField(); // only drop the resident field when the caller did not ask to keep it
 
-  if (gauge_param->make_resident_mom) {
-    // make the momentum field resident
-    momResident = cudaMom;
-  } else {
-    delete cudaMom;
-  }
-
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_D2H);
   profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE);
-
   for (int i=0; i<nvector; i++) delete X[i];
-
   profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE);
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_TOTAL);
+
+  popProfile();
 }
 
 void computeHISQForceQuda(void* const milc_momentum,
@@ -4546,9 +4430,10 @@ void computeHISQForceQuda(void* const milc_momentum,
                           double **coeff,
                           QudaGaugeParam* gParam)
 {
+  pushProfile(profileHISQForce);
+
   using namespace quda;
   using namespace quda::fermion_force;
-  profileHISQForce.TPSTART(QUDA_PROFILE_TOTAL);
   if (gParam->gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported input field order %d", gParam->gauge_order);
 
   checkGaugeParam(gParam);
@@ -4768,9 +4653,7 @@ void computeHISQForceQuda(void* const milc_momentum,
   GaugeField *cudaWLink = new GaugeField(wParam);
   profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
-  profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
   cudaWLink->copy(cpuWLink);
-  profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
 
   cudaWLink->exchangeExtendedGhost(cudaWLink->R(), profileHISQForce);
 
@@ -4816,9 +4699,7 @@ void computeHISQForceQuda(void* const milc_momentum,
   GaugeField *cudaVLink = new GaugeField(vParam);
   profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
-  profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
   cudaVLink->copy(cpuVLink);
-  profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
   cudaVLink->exchangeExtendedGhost(cudaVLink->R(), profileHISQForce);
 
   profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
@@ -4851,9 +4732,7 @@ void computeHISQForceQuda(void* const milc_momentum,
   GaugeField *cudaULink = new GaugeField(uParam);
   profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
-  profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
   cudaULink->copy(cpuULink);
-  profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
   cudaULink->exchangeExtendedGhost(cudaULink->R(), profileHISQForce);
 
   // Compute Fat7-staple term
@@ -4873,8 +4752,8 @@ void computeHISQForceQuda(void* const milc_momentum,
   hisqCompleteForce(*cudaOutForce, *cudaULink);
 
   if (gParam->use_resident_mom) {
-    if (!momResident) errorQuda("No resident momentum field to use");
-    updateMomentum(*momResident, dt, *cudaOutForce, "hisq");
+    if (!momResident.Volume()) errorQuda("No resident momentum field to use"); // zero volume means nothing is resident
+    updateMomentum(momResident, dt, *cudaOutForce, "hisq");
   } else {
     updateMomentum(*cudaMom, dt, *cudaOutForce, "hisq");
   }
@@ -4883,27 +4762,16 @@ void computeHISQForceQuda(void* const milc_momentum,
 
   if (gParam->return_result_mom) {
     // Close the paths, make anti-hermitian, and store in compressed format
-    if (gParam->return_result_mom) {
-      profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
-      cpuMom->copy(*cudaMom);
-      profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
-    }
+    if (gParam->return_result_mom) cpuMom->copy(*cudaMom);
   }
 
-  profileHISQForce.TPSTART(QUDA_PROFILE_FREE);
-
   if (cpuMom) delete cpuMom;
-
-  if (!gParam->make_resident_mom) {
-    delete momResident;
-    momResident = nullptr;
-  }
+  if (!gParam->make_resident_mom) momResident = GaugeField();
   if (cudaMom) delete cudaMom;
   delete cudaOutForce;
   delete cudaULink;
-  profileHISQForce.TPSTOP(QUDA_PROFILE_FREE);
 
-  profileHISQForce.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
 }
 
 void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double *coeff, double kappa2, double ck,
@@ -5085,106 +4953,55 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   profileCloverForce.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
-void updateGaugeFieldQuda(void* gauge,
-			  void* momentum,
-			  double dt,
-			  int conj_mom,
-			  int exact,
-			  QudaGaugeParam* param)
+void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, int exact, QudaGaugeParam* param)
 {
-  profileGaugeUpdate.TPSTART(QUDA_PROFILE_TOTAL);
-
+  pushProfile(profileGaugeUpdate);
   checkGaugeParam(param);
 
-  profileGaugeUpdate.TPSTART(QUDA_PROFILE_INIT);
-
   // create the host fields
   GaugeFieldParam gParam(*param, gauge, QUDA_SU3_LINKS);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  gParam.site_offset = param->gauge_offset;
-  gParam.site_size = param->site_size;
   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
-  GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr;
+  GaugeField cpuGauge = need_cpu ? GaugeField(gParam) : GaugeField();
 
-  GaugeFieldParam gParamMom(*param, momentum);
-  gParamMom.reconstruct = (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ?
-   QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;
-  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
-  gParamMom.site_offset = param->mom_offset;
-  gParamMom.site_size = param->site_size;
-  GaugeField *cpuMom = !param->use_resident_mom ? new GaugeField(gParamMom) : nullptr;
+  GaugeFieldParam gParamMom(*param, momentum, QUDA_ASQTAD_MOM_LINKS);
+  GaugeField cpuMom = !param->use_resident_mom ? GaugeField(gParamMom) : GaugeField();
 
   // create the device fields
+  if (param->use_resident_mom && !momResident.Volume()) errorQuda("No resident mom field allocated");
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-  gParam.create = QUDA_NULL_FIELD_CREATE;
+  gParam.create = QUDA_COPY_FIELD_CREATE;
+  gParam.field = &cpuMom;
   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.pad = 0;
-  GaugeField *cudaMom = !param->use_resident_mom ? new GaugeField(gParam) : nullptr;
+  GaugeField cudaMom = param->use_resident_mom ? momResident.create_alias() : GaugeField(gParam);
 
+  if (param->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field allocated");
   gParam.link_type = QUDA_SU3_LINKS;
   gParam.reconstruct = param->reconstruct;
-  GaugeField *cudaInGauge = !param->use_resident_gauge ? new GaugeField(gParam) : nullptr;
-  auto *cudaOutGauge = new GaugeField(gParam);
-
-  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_INIT);
-
-  profileGaugeUpdate.TPSTART(QUDA_PROFILE_H2D);
-
-  if (!param->use_resident_gauge) {   // load fields onto the device
-    cudaInGauge->copy(*cpuGauge);
-  } else { // or use resident fields already present
-    if (!gaugePrecise) errorQuda("No resident gauge field allocated");
-    cudaInGauge = gaugePrecise;
-    gaugePrecise = nullptr;
-  }
-
-  if (!param->use_resident_mom) {
-    cudaMom->copy(*cpuMom);
-  } else {
-    if (!momResident) errorQuda("No resident mom field allocated");
-    cudaMom = momResident;
-    momResident = nullptr;
-  }
-
-  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_H2D);
+  gParam.field = &cpuGauge;
+  GaugeField u_in = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
+  gParam.create = QUDA_NULL_FIELD_CREATE;
+  GaugeField u_out(gParam);
 
   // perform the update
-  profileGaugeUpdate.TPSTART(QUDA_PROFILE_COMPUTE);
-  updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom,
-      (bool)conj_mom, (bool)exact);
-  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_COMPUTE);
+  updateGaugeField(u_out, dt, u_in, cudaMom, (bool)conj_mom, (bool)exact);
 
-  if (param->return_result_gauge) {
-    // copy the gauge field back to the host
-    profileGaugeUpdate.TPSTART(QUDA_PROFILE_D2H);
-    cpuGauge->copy(*cudaOutGauge);
-    profileGaugeUpdate.TPSTOP(QUDA_PROFILE_D2H);
-  }
+  // copy the gauge field back to the host
+  if (param->return_result_gauge) cpuGauge.copy(u_out);
 
-  profileGaugeUpdate.TPSTART(QUDA_PROFILE_FREE);
   if (param->make_resident_gauge) {
-    if (gaugePrecise != nullptr) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-    gaugePrecise = cudaOutGauge;
-  } else {
-    delete cudaOutGauge;
-  }
-
-  if (param->make_resident_mom) {
-    if (momResident != nullptr && momResident != cudaMom) delete momResident;
-    momResident = cudaMom;
-  } else {
-    delete cudaMom;
+    if (!param->use_resident_gauge) { if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); gaugePrecise = new GaugeField(); } // guarantee a live target: gaugePrecise may be null or just freed
+    std::exchange(*gaugePrecise, u_out);
   }
 
-  delete cudaInGauge;
-  if (cpuMom) delete cpuMom;
-  if (cpuGauge) delete cpuGauge;
+  if (param->make_resident_mom && !param->use_resident_mom) std::exchange(momResident, cudaMom);
+  else if (!param->make_resident_mom) momResident = GaugeField(); // keep an already-resident momentum when make_resident_mom is set
 
-  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_FREE);
-  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
 }
 
  void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) {
@@ -5196,8 +5013,6 @@ void updateGaugeFieldQuda(void* gauge,
    // create the gauge field
    GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS);
    gParam.location = QUDA_CPU_FIELD_LOCATION;
-   gParam.site_offset = param->gauge_offset;
-   gParam.site_size = param->site_size;
    bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
    GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr;
 
@@ -5313,96 +5128,55 @@ void updateGaugeFieldQuda(void* gauge,
 // evaluate the momentum action
 double momActionQuda(void* momentum, QudaGaugeParam* param)
 {
-  profileMomAction.TPSTART(QUDA_PROFILE_TOTAL);
-
-  profileMomAction.TPSTART(QUDA_PROFILE_INIT);
+  pushProfile(profileMomAction);
   checkGaugeParam(param);
 
   // create the momentum fields
   GaugeFieldParam gParam(*param, momentum, QUDA_ASQTAD_MOM_LINKS);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  gParam.reconstruct = (gParam.order == QUDA_TIFR_GAUGE_ORDER || gParam.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ?
-    QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;
-  gParam.site_offset = param->mom_offset;
-  gParam.site_size = param->site_size;
-
-  GaugeField *cpuMom = !param->use_resident_mom ? new GaugeField(gParam) : nullptr;
+  GaugeField cpuMom = !param->use_resident_mom ? GaugeField(gParam) : GaugeField();
 
   // create the device fields
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-  gParam.create = QUDA_NULL_FIELD_CREATE;
+  gParam.field = &cpuMom;
+  gParam.create = QUDA_COPY_FIELD_CREATE;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
   gParam.setPrecision(param->cuda_prec, true);
 
-  GaugeField *cudaMom = !param->use_resident_mom ? new GaugeField(gParam) : nullptr;
-
-  profileMomAction.TPSTOP(QUDA_PROFILE_INIT);
-
-  profileMomAction.TPSTART(QUDA_PROFILE_H2D);
-  if (!param->use_resident_mom) {
-    cudaMom->copy(*cpuMom);
-  } else {
-    if (!momResident) errorQuda("No resident mom field allocated");
-    cudaMom = momResident;
-  }
-  profileMomAction.TPSTOP(QUDA_PROFILE_H2D);
+  if (param->use_resident_mom && !momResident.Volume()) errorQuda("No resident mom field allocated");
+  GaugeField cudaMom = param->use_resident_mom ? momResident.create_alias() : GaugeField(gParam);
 
   // perform the update
-  profileMomAction.TPSTART(QUDA_PROFILE_COMPUTE);
-  double action = computeMomAction(*cudaMom);
-  profileMomAction.TPSTOP(QUDA_PROFILE_COMPUTE);
-
-  profileMomAction.TPSTART(QUDA_PROFILE_FREE);
-  if (param->make_resident_mom) {
-    if (momResident != nullptr && momResident != cudaMom) delete momResident;
-    momResident = cudaMom;
-  } else {
-    delete cudaMom;
-    momResident = nullptr;
-  }
-  if (cpuMom) {
-    delete cpuMom;
-  }
+  double action = computeMomAction(cudaMom);
 
-  profileMomAction.TPSTOP(QUDA_PROFILE_FREE);
-  profileMomAction.TPSTOP(QUDA_PROFILE_TOTAL);
+  if (param->make_resident_mom && !param->use_resident_mom) std::exchange(momResident, cudaMom); // test the mom flag, not the gauge flag
+  else if (!param->make_resident_mom) momResident = GaugeField(); // keep an already-resident momentum when make_resident_mom is set
 
+  popProfile();
   return action;
 }
 
 void gaussGaugeQuda(unsigned long long seed, double sigma)
 {
-  profileGauss.TPSTART(QUDA_PROFILE_TOTAL);
+  pushProfile(profileGauss); // starts the TOTAL timer and makes profileGauss the active profile
 
   if (!gaugePrecise) errorQuda("Cannot generate Gauss GaugeField as there is no resident gauge field");
-
-  GaugeField *data = gaugePrecise;
-
-  profileGauss.TPSTART(QUDA_PROFILE_COMPUTE);
-  quda::gaugeGauss(*data, seed, sigma);
-  profileGauss.TPSTOP(QUDA_PROFILE_COMPUTE);
+  quda::gaugeGauss(*gaugePrecise, seed, sigma); // operates directly on the resident field; explicit COMPUTE timer calls are dropped here
 
   if (extendedGaugeResident) {
     extendedGaugeResident->copy(*gaugePrecise);
     extendedGaugeResident->exchangeExtendedGhost(R, profileGauss, redundant_comms);
   }
 
-  profileGauss.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile(); // stops the TOTAL timer of the profile pushed at function entry
 }
 
 void gaussMomQuda(unsigned long long seed, double sigma)
 {
-  profileGauss.TPSTART(QUDA_PROFILE_TOTAL);
-
-  if (!momResident) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field");
-
-  GaugeField *data = momResident;
-
-  profileGauss.TPSTART(QUDA_PROFILE_COMPUTE);
-  quda::gaugeGauss(*data, seed, sigma);
-  profileGauss.TPSTOP(QUDA_PROFILE_COMPUTE);
-
-  profileGauss.TPSTOP(QUDA_PROFILE_TOTAL);
+  pushProfile(profileGauss); // starts the TOTAL timer and makes profileGauss the active profile
+  if (!momResident.Volume()) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field"); // zero volume means nothing is resident
+  quda::gaugeGauss(momResident, seed, sigma); // momResident is passed by value-object, no longer through a pointer
+  popProfile(); // stops the TOTAL timer of the profile pushed above
 }
 
 /*
@@ -5810,8 +5584,6 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u
 
   GaugeFieldParam gParam(*param, gauge);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  gParam.site_offset = param->gauge_offset;
-  gParam.site_size = param->site_size;
   auto *cpuGauge = new GaugeField(gParam);
 
   gParam.create = QUDA_NULL_FIELD_CREATE;
@@ -5888,8 +5660,6 @@ int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir,  const
 
   GaugeFieldParam gParam(*param, gauge);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  gParam.site_offset = param->gauge_offset;
-  gParam.site_size = param->site_size;
   auto *cpuGauge = new GaugeField(gParam);
 
   gParam.create = QUDA_NULL_FIELD_CREATE;
diff --git a/lib/momentum.cu b/lib/momentum.cu
index 78a981e509..7a574687ca 100644
--- a/lib/momentum.cu
+++ b/lib/momentum.cu
@@ -9,6 +9,7 @@
 #include <tunable_reduction.h>
 #include <tunable_nd.h>
 #include <kernels/momentum.cuh>
+#include "timer.h"
 
 namespace quda {
 
@@ -92,9 +93,11 @@ namespace quda {
   };
 
   double computeMomAction(const GaugeField& mom) {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (!mom.isNative()) errorQuda("Unsupported output ordering: %d\n", mom.Order());
     double action = 0.0;
     instantiate<ActionMom, Reconstruct10>(mom, action);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     return action;
   }
 
@@ -132,11 +135,13 @@ namespace quda {
 
   void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (mom.Reconstruct() != QUDA_RECONSTRUCT_10)
       errorQuda("Momentum field with reconstruct %d not supported", mom.Reconstruct());
 
     checkPrecision(mom, force);
     instantiate<UpdateMom, ReconstructMom>(force, mom, coeff, fname);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
   template <typename Float, int nColor, QudaReconstructType recon>
@@ -173,9 +178,11 @@ namespace quda {
 
   void applyU(GaugeField &force, GaugeField &U)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (!force.isNative()) errorQuda("Unsupported output ordering: %d\n", force.Order());
     checkPrecision(force, U);
     instantiate<UApply, ReconstructNo12>(U, force);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/staggered_oprod.cu b/lib/staggered_oprod.cu
index 8f4943061b..1f9cbcccf7 100644
--- a/lib/staggered_oprod.cu
+++ b/lib/staggered_oprod.cu
@@ -2,6 +2,7 @@
 #include <tunable_nd.h>
 #include <instantiate.h>
 #include <kernels/staggered_outer_product.cuh>
+#include "timer.h"
 
 namespace quda {
 
@@ -106,6 +107,7 @@ namespace quda {
 #ifdef GPU_STAGGERED_DIRAC
   void computeStaggeredOprod(GaugeField *out[], ColorSpinorField& in, const double coeff[], int nFace)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (nFace == 1) {
       computeStaggeredOprod(*out[0], *out[0], in.Even(), in.Odd(), 0, coeff, nFace);
       double coeff_[2] = {-coeff[0],0.0}; // need to multiply by -1 on odd sites
@@ -116,6 +118,7 @@ namespace quda {
     } else {
       errorQuda("Invalid nFace=%d", nFace);
     }
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else // GPU_STAGGERED_DIRAC not defined
   void computeStaggeredOprod(GaugeField *[], ColorSpinorField &, const double [], int)
diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp
index d4f4de254d..1f78d936bc 100644
--- a/lib/targets/cuda/malloc.cpp
+++ b/lib/targets/cuda/malloc.cpp
@@ -7,6 +7,7 @@
 #include <quda_internal.h>
 #include <device.h>
 #include <shmem_helper.cuh>
+#include "timer.h"
 
 #ifdef USE_QDPJIT
 #include "qdp_cache.h"
@@ -795,6 +796,7 @@ namespace quda
     size(size),
     pool(pool)
   {
+    getProfile().TPSTART(QUDA_PROFILE_INIT);
     if (pool && (type != QUDA_MEMORY_DEVICE && type != QUDA_MEMORY_HOST_PINNED && type != QUDA_MEMORY_HOST))
       errorQuda("Memory pool not available for memory type %d", type);
 
@@ -823,11 +825,13 @@ namespace quda
       default: errorQuda("Unknown memory type %d", type);
       }
     }
+    getProfile().TPSTOP(QUDA_PROFILE_INIT);
   }
 
   quda_ptr::quda_ptr(void *ptr, QudaMemoryType type) :
     type(type)
   {
+    getProfile().TPSTART(QUDA_PROFILE_INIT);
     switch (type) {
     case QUDA_MEMORY_DEVICE:
     case QUDA_MEMORY_DEVICE_PINNED:
@@ -845,6 +849,7 @@ namespace quda
       break;
     default: errorQuda("Unsupported memory type %d", type);
     }
+    getProfile().TPSTOP(QUDA_PROFILE_INIT);
   }
 
   quda_ptr& quda_ptr::operator=(quda_ptr &&other)
@@ -861,6 +866,8 @@ namespace quda
 
   quda_ptr::~quda_ptr()
   {
+    getProfile().TPSTART(QUDA_PROFILE_FREE);
+
     if (size > 0) {
       switch (type) {
       case QUDA_MEMORY_DEVICE:        pool ? pool_device_free(device) : device_free(device); break;
@@ -874,6 +881,8 @@ namespace quda
 
     device = nullptr;
     host = nullptr;
+
+    getProfile().TPSTOP(QUDA_PROFILE_FREE);
   }
 
   bool quda_ptr::is_device() const
diff --git a/lib/timer.cpp b/lib/timer.cpp
index e8e427fd74..c4e924ee6e 100644
--- a/lib/timer.cpp
+++ b/lib/timer.cpp
@@ -1,3 +1,4 @@
+#include <stack>
 #include <quda_internal.h>
 #include <timer.h>
 
@@ -113,4 +114,25 @@ namespace quda {
     }
   }
 
+  static std::stack<TimeProfile*> tpstack; // file-local stack of active profiles; top() is what getProfile() returns
+
+  void pushProfile(TimeProfile &profile) // start profile's TOTAL timer and make it the active profile
+  {
+    profile.TPSTART(QUDA_PROFILE_TOTAL);
+    tpstack.push(&profile); // stores the address only; the stack does not own the profile
+  }
+
+  void popProfile() // remove the active profile and stop its TOTAL timer; errors out if nothing was pushed
+  {
+    if (tpstack.empty()) errorQuda("popProfile() called with empty stack");
+    auto &profile = *(tpstack.top());
+    tpstack.pop(); // pop before stopping, so the profile is no longer active while TPSTOP runs
+    profile.TPSTOP(QUDA_PROFILE_TOTAL);
+  }
+
+  TimeProfile& getProfile() // active profile, or the fallback object when no profile has been pushed
+  {
+    if (tpstack.empty()) return dummy; // `dummy` is declared outside this hunk -- NOTE(review): presumably a throw-away TimeProfile, confirm
+    return *(tpstack.top());
+  }
 }

From 3db98e143a2fde6ff8e4720414e2f72230149952 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 19 May 2023 10:24:56 -0700
Subject: [PATCH 12/60] Continued to add auto profiling support and GaugeField
 cleanup to various QUDA interfaces.  Add ref counting support to the
 profiling, to allow for multiple starts without throwing an error: if a timer
 has already been started we simply increment the ref counter and return. 
 Profiling now performs a device sync if the type is H2D, D2H or COMPUTE: this
 negates the need to use explicit synchronization and ensures accurate
 profiling

---
 include/gauge_tools.h        |   3 +-
 include/quda.h               |   7 +-
 include/quda_internal.h      |   1 +
 include/timer.h              |  17 +-
 lib/clover_field.cpp         |   2 -
 lib/color_spinor_field.cpp   |   1 -
 lib/contract.cu              |   2 +
 lib/gauge_ape.cu             |   2 +
 lib/gauge_field.cpp          |   2 +-
 lib/gauge_fix_fft.cu         |   2 +
 lib/gauge_fix_ovr.cu         |   2 +
 lib/gauge_force.cu           |   4 +
 lib/gauge_loop_trace.cu      |   2 +
 lib/gauge_observable.cpp     |  19 +-
 lib/gauge_plaq.cu            |   2 +
 lib/gauge_qcharge.cu         |   4 +
 lib/gauge_stout.cu           |   4 +
 lib/gauge_wilson_flow.cu     |   2 +
 lib/hisq_paths_force_quda.cu |  37 ++-
 lib/interface_quda.cpp       | 506 ++++++++++++-----------------------
 lib/milc_interface.cpp       |  19 +-
 lib/staggered_oprod.cu       |   1 -
 lib/unitarize_force_quda.cu  |   2 +
 lib/unitarize_links_quda.cu  |   4 +
 24 files changed, 249 insertions(+), 398 deletions(-)

diff --git a/include/gauge_tools.h b/include/gauge_tools.h
index 503c20bc9f..9b7d68db37 100644
--- a/include/gauge_tools.h
+++ b/include/gauge_tools.h
@@ -9,9 +9,8 @@ namespace quda
    * @param[in] Gauge field upon which we are measuring.
    * @param[in,out] param Parameter struct that defines which
    * observables we are making and the resulting observables.
-   * @param[in] profile TimeProfile instance used for profiling.
    */
-  void gaugeObservables(GaugeField &u, QudaGaugeObservableParam &param, TimeProfile &profile);
+  void gaugeObservables(GaugeField &u, QudaGaugeObservableParam &param);
 
   /**
    * @brief Project the input gauge field onto the SU(3) group.  This
diff --git a/include/quda.h b/include/quda.h
index b697ef7400..cb22e50033 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -1673,12 +1673,11 @@ extern "C" {
    * @param[in] reunit_interval, reunitarize gauge field when iteration count is a multiple of this
    * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value
    * @param[in] param The parameters of the external fields and the computation settings
-   * @param[out] timeinfo
    */
   int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps,
                                 const unsigned int verbose_interval, const double relax_boost, const double tolerance,
                                 const unsigned int reunit_interval, const unsigned int stopWtheta,
-                                QudaGaugeParam *param, double *timeinfo);
+                                QudaGaugeParam *param);
 
   /**
    * @brief Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
@@ -1692,12 +1691,10 @@ extern "C" {
    * iteration reachs the maximum number of steps defined by Nsteps
    * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value
    * @param[in] param The parameters of the external fields and the computation settings
-   * @param[out] timeinfo
    */
   int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps,
                                 const unsigned int verbose_interval, const double alpha, const unsigned int autotune,
-                                const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param,
-                                double *timeinfo);
+                                const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param);
 
   /**
    * @brief Strided Batched GEMM
diff --git a/include/quda_internal.h b/include/quda_internal.h
index 756d5822e0..dd8a6c8177 100644
--- a/include/quda_internal.h
+++ b/include/quda_internal.h
@@ -49,6 +49,7 @@
 #include <object.h>
 #include <device.h>
 #include <array.h>
+#include "timer.h"
 
 namespace quda {
 
diff --git a/include/timer.h b/include/timer.h
index 20b9df45ff..2475fee154 100644
--- a/include/timer.h
+++ b/include/timer.h
@@ -65,15 +65,16 @@ namespace quda {
       }
     }
 
+    int ref_count = 0;
+
     /**
        @brief Start the timer
      */
-    void start(const char *func = nullptr, const char *file = nullptr, int line = 0)
+    void start(const char * = nullptr, const char * = nullptr, int = 0)
     {
-      if (running) {
-        printfQuda("ERROR: Cannot start an already running timer (%s:%d in %s())", file ? file : "", line,
-                   func ? func : "");
-        errorQuda("Aborting");
+      if (running) { // if the timer has already started, we increment the ref counter and return
+        ref_count++;
+        return;
       }
       if (!device) {
         gettimeofday(&host_start, NULL);
@@ -110,6 +111,10 @@ namespace quda {
      */
     void stop(const char *func = nullptr, const char *file = nullptr, int line = 0)
     {
+      if (ref_count > 0) {
+        ref_count--;
+        return;
+      }
       peek(func, file, line);
       time += last_interval;
       count++;
@@ -271,6 +276,8 @@ namespace quda {
     }
 
     void Stop_(const char *func, const char *file, int line, QudaProfileType idx) {
+      if (idx == QUDA_PROFILE_COMPUTE || idx == QUDA_PROFILE_H2D || idx == QUDA_PROFILE_D2H)
+        qudaDeviceSynchronize(); // ensure accurate profiling
       profile[idx].stop(func, file, line);
       POP_RANGE
 
diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp
index bb952ba324..46394c332b 100644
--- a/lib/clover_field.cpp
+++ b/lib/clover_field.cpp
@@ -257,8 +257,6 @@ namespace quda {
         pool_device_free(packClover);
       }
     }
-
-    qudaDeviceSynchronize();
   }
 
   void CloverField::copy(const CloverField &src)
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 73417b4462..96df00ba55 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -482,7 +482,6 @@ namespace quda
             pool_device_free(buffer);
         }
       }
-      qudaDeviceSynchronize(); // include sync here for accurate host-device profiling
 
     } else if (Location() == QUDA_CPU_FIELD_LOCATION && src.Location() == QUDA_CUDA_FIELD_LOCATION) { // D2H
 
diff --git a/lib/contract.cu b/lib/contract.cu
index 491652ae9c..74206419c6 100644
--- a/lib/contract.cu
+++ b/lib/contract.cu
@@ -58,12 +58,14 @@ public:
 #ifdef GPU_CONTRACT
   void contractQuda(const ColorSpinorField &x, const ColorSpinorField &y, void *result, const QudaContractType cType)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     checkPrecision(x, y);
     if (x.GammaBasis() != QUDA_DEGRAND_ROSSI_GAMMA_BASIS || y.GammaBasis() != QUDA_DEGRAND_ROSSI_GAMMA_BASIS)
       errorQuda("Unexpected gamma basis x=%d y=%d", x.GammaBasis(), y.GammaBasis());
     if (x.Nspin() != 4 || y.Nspin() != 4) errorQuda("Unexpected number of spins x=%d y=%d", x.Nspin(), y.Nspin());
 
     instantiate<Contraction>(x, y, result, cType);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else
   void contractQuda(const ColorSpinorField &, const ColorSpinorField &, void *, const QudaContractType)
diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu
index 5ace8e5a29..248b7d1d6c 100644
--- a/lib/gauge_ape.cu
+++ b/lib/gauge_ape.cu
@@ -57,7 +57,9 @@ namespace quda {
 
     copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
     in.exchangeExtendedGhost(in.R(), false);
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<GaugeAPE>(out, in, alpha);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     out.exchangeExtendedGhost(out.R(), false);
   }
 
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index e8c4994670..0003663e25 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -995,6 +995,7 @@ namespace quda {
             } else {
               errorQuda("Ghost copy not supported here");
             }
+            qudaDeviceSynchronize(); // synchronize to ensure visibility on the host
           } else {
             void *buffer = create_gauge_buffer(bytes, order, geometry);
             size_t ghost_bytes[8];
@@ -1110,7 +1111,6 @@ namespace quda {
     staggeredPhaseApplied = src.StaggeredPhaseApplied();
     staggeredPhaseType = src.StaggeredPhase();
 
-    if (src.Location() != location) qudaDeviceSynchronize(); // include sync here for accurate host-device profiling
     if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
       profile.TPSTOP(QUDA_PROFILE_D2H);
     } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
diff --git a/lib/gauge_fix_fft.cu b/lib/gauge_fix_fft.cu
index b85f5b4457..1de3980332 100644
--- a/lib/gauge_fix_fft.cu
+++ b/lib/gauge_fix_fft.cu
@@ -389,8 +389,10 @@ namespace quda {
   void gaugeFixingFFT(GaugeField& data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha,
                       const int autotune, const double tolerance, const int stopWtheta)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (comm_partitioned()) errorQuda("Gauge Fixing with FFTs in multi-GPU support NOT implemented yet!");
     instantiate<GaugeFixingFFT>(data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 }
diff --git a/lib/gauge_fix_ovr.cu b/lib/gauge_fix_ovr.cu
index 814b65427d..064ed5b158 100644
--- a/lib/gauge_fix_ovr.cu
+++ b/lib/gauge_fix_ovr.cu
@@ -502,7 +502,9 @@ namespace quda {
   void gaugeFixingOVR(GaugeField& data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost,
                       const double tolerance, const int reunit_interval, const int stopWtheta)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<GaugeFixingOVR>(data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 }   //namespace quda
diff --git a/lib/gauge_force.cu b/lib/gauge_force.cu
index 2558dadcac..5e43fa64e6 100644
--- a/lib/gauge_force.cu
+++ b/lib/gauge_force.cu
@@ -48,6 +48,7 @@ namespace quda {
   void gaugeForce(GaugeField& mom, const GaugeField& u, double epsilon, std::vector<int**>& input_path,
                   std::vector<int>& length, std::vector<double>& path_coeff, int num_paths, int path_max_length)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     checkPrecision(mom, u);
     checkLocation(mom, u);
     if (mom.Reconstruct() != QUDA_RECONSTRUCT_10) errorQuda("Reconstruction type %d not supported", mom.Reconstruct());
@@ -57,11 +58,13 @@ namespace quda {
     // gauge field must be passed as first argument so we peel off its reconstruct type
     instantiate<GaugeForce_>(u, mom, epsilon, p);
     p.free();
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
   
   void gaugePath(GaugeField& out, const GaugeField& u, double coeff, std::vector<int**>& input_path,
 		 std::vector<int>& length, std::vector<double>& path_coeff, int num_paths, int path_max_length)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     checkPrecision(out, u);
     checkLocation(out, u);
     if (out.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("Reconstruction type %d not supported", out.Reconstruct());
@@ -71,6 +74,7 @@ namespace quda {
     // gauge field must be passed as first argument so we peel off its reconstruct type
     instantiate<GaugePath>(u, out, coeff, p);
     p.free();
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/gauge_loop_trace.cu b/lib/gauge_loop_trace.cu
index faaaa97d99..0b1af50ba4 100644
--- a/lib/gauge_loop_trace.cu
+++ b/lib/gauge_loop_trace.cu
@@ -55,6 +55,7 @@ namespace quda {
   void gaugeLoopTrace(const GaugeField& u, std::vector<Complex>& loop_traces, double factor, std::vector<int**>& input_path,
 		 std::vector<int>& length, std::vector<double>& path_coeff, int num_paths, int path_max_length)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     paths<1> p(input_path, length, path_coeff, num_paths, path_max_length);
 
     std::vector<array<double, 2>> tr_array(loop_traces.size());
@@ -65,6 +66,7 @@ namespace quda {
     for (auto i = 0u; i < tr_array.size(); i++) { loop_traces[i] = Complex(tr_array[i][0], tr_array[i][1]); }
 
     p.free();
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/gauge_observable.cpp b/lib/gauge_observable.cpp
index b825a2ad81..041dc6164d 100644
--- a/lib/gauge_observable.cpp
+++ b/lib/gauge_observable.cpp
@@ -5,9 +5,9 @@
 namespace quda
 {
 
-  void gaugeObservables(GaugeField &u, QudaGaugeObservableParam &param, TimeProfile &profile)
+  void gaugeObservables(GaugeField &u, QudaGaugeObservableParam &param)
   {
-    profile.TPSTART(QUDA_PROFILE_COMPUTE);
+    auto &profile = getProfile();
     if (param.su_project) {
       int *num_failures_h = static_cast<int *>(pool_pinned_malloc(sizeof(int)));
       int *num_failures_d = static_cast<int *>(get_mapped_device_pointer(num_failures_h));
@@ -24,7 +24,6 @@ namespace quda
       param.plaquette[1] = plaq.y;
       param.plaquette[2] = plaq.z;
     }
-    profile.TPSTOP(QUDA_PROFILE_COMPUTE);
 
     if (param.compute_polyakov_loop) { gaugePolyakovLoop(param.ploop, u, 3, profile); }
 
@@ -45,10 +44,8 @@ namespace quda
       std::vector<Complex> loop_traces(param.num_paths);
 
       // actually do the computation
-      profile.TPSTART(QUDA_PROFILE_COMPUTE);
       gaugeLoopTrace(u, loop_traces, param.factor, input_path_v, path_length_v, loop_coeff_v, param.num_paths,
                      param.max_length);
-      profile.TPSTOP(QUDA_PROFILE_COMPUTE);
 
       for (int i = 0; i < param.num_paths; i++) { memcpy(param.traces + i, &loop_traces[i], sizeof(Complex)); }
     }
@@ -57,7 +54,6 @@ namespace quda
     if (!param.compute_qcharge && !param.compute_qcharge_density) return;
 
     // create the Fmunu field
-    profile.TPSTART(QUDA_PROFILE_INIT);
     // u is an extended field we need to shrink for the Fmunu field
     lat_dim_t x;
     for (int i = 0; i < 4; i++) x[i] = u.X()[i] - 2 * u.R()[i];
@@ -67,15 +63,10 @@ namespace quda
     tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
     tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
     GaugeField gaugeFmunu(tensorParam);
-    profile.TPSTOP(QUDA_PROFILE_INIT);
 
-    profile.TPSTART(QUDA_PROFILE_COMPUTE);
     computeFmunu(gaugeFmunu, u);
-    profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-    profile.TPSTOP(QUDA_PROFILE_TOTAL);
 
     if (param.compute_qcharge || param.compute_qcharge_density) {
-      profile.TPSTART(QUDA_PROFILE_TOTAL);
       profile.TPSTART(QUDA_PROFILE_INIT);
       if (param.compute_qcharge_density && !param.qcharge_density)
         errorQuda("Charge density requested, but destination field not defined");
@@ -83,23 +74,17 @@ namespace quda
       void *d_qDensity = param.compute_qcharge_density ? pool_device_malloc(size) : nullptr;
       profile.TPSTOP(QUDA_PROFILE_INIT);
 
-      profile.TPSTART(QUDA_PROFILE_COMPUTE);
-
       if (param.compute_qcharge_density)
         computeQChargeDensity(param.energy, param.qcharge, d_qDensity, gaugeFmunu);
       else
         computeQCharge(param.energy, param.qcharge, gaugeFmunu);
 
-      profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-
       if (param.compute_qcharge_density) {
         profile.TPSTART(QUDA_PROFILE_D2H);
         qudaMemcpy(param.qcharge_density, d_qDensity, size, qudaMemcpyDeviceToHost);
         profile.TPSTOP(QUDA_PROFILE_D2H);
 
-        profile.TPSTART(QUDA_PROFILE_FREE);
         pool_device_free(d_qDensity);
-        profile.TPSTOP(QUDA_PROFILE_FREE);
       }
     }
   }
diff --git a/lib/gauge_plaq.cu b/lib/gauge_plaq.cu
index 7ad5c0399e..ee48d2e3d2 100644
--- a/lib/gauge_plaq.cu
+++ b/lib/gauge_plaq.cu
@@ -37,9 +37,11 @@ namespace quda {
 
   double3 plaquette(const GaugeField &U)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     array<double, 2> plq{0.0, 0.0};
     instantiate<GaugePlaq, ReconstructGauge>(U, plq);
     double3 plaq = make_double3(0.5*(plq[0] + plq[1]), plq[0], plq[1]);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     return plaq;
   }
 
diff --git a/lib/gauge_qcharge.cu b/lib/gauge_qcharge.cu
index d847b219ae..3b4e584b02 100644
--- a/lib/gauge_qcharge.cu
+++ b/lib/gauge_qcharge.cu
@@ -62,12 +62,16 @@ namespace quda
 
   void computeQCharge(double energy[3], double &qcharge, const GaugeField &Fmunu)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<QCharge, ReconstructNone>(Fmunu, energy, qcharge, nullptr, false);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
   void computeQChargeDensity(double energy[3], double &qcharge, void *qdensity, const GaugeField &Fmunu)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<QCharge, ReconstructNone>(Fmunu, energy, qcharge, qdensity, true);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu
index d8d40af42f..f537ca60ea 100644
--- a/lib/gauge_stout.cu
+++ b/lib/gauge_stout.cu
@@ -72,7 +72,9 @@ namespace quda {
 
     copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
     in.exchangeExtendedGhost(in.R(), false);
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<GaugeSTOUT>(out, in, false, rho);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     out.exchangeExtendedGhost(out.R(), false);
   }
 
@@ -84,7 +86,9 @@ namespace quda {
 
     copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
     in.exchangeExtendedGhost(in.R(), false);
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<GaugeSTOUT>(out, in, true, rho, epsilon);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     out.exchangeExtendedGhost(out.R(), false);
   }
 
diff --git a/lib/gauge_wilson_flow.cu b/lib/gauge_wilson_flow.cu
index a3ce38ba81..d92fb0a68c 100644
--- a/lib/gauge_wilson_flow.cu
+++ b/lib/gauge_wilson_flow.cu
@@ -38,6 +38,7 @@ namespace quda {
       wflow_type(wflow_type),
       step_type(step_type)
     {
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       strcat(aux, comm_dim_partitioned_string());
       switch (wflow_type) {
       case QUDA_GAUGE_SMEAR_WILSON_FLOW: strcat(aux,",computeWFlowStepWilson"); break;
@@ -52,6 +53,7 @@ namespace quda {
       }
 
       apply(device::get_default_stream());
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     }
 
     template <QudaGaugeSmearType wflow_type, WFlowStepType step_type> using Arg =
diff --git a/lib/hisq_paths_force_quda.cu b/lib/hisq_paths_force_quda.cu
index 320000dc75..e6e30f90bc 100644
--- a/lib/hisq_paths_force_quda.cu
+++ b/lib/hisq_paths_force_quda.cu
@@ -547,6 +547,7 @@ namespace quda {
 #ifdef GPU_STAGGERED_DIRAC
     void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff_array[6])
     {
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       checkNative(link, oprod, newOprod);
       checkLocation(newOprod, oprod, link);
       checkPrecision(oprod, link, newOprod);
@@ -557,32 +558,24 @@ namespace quda {
       gauge_param.geometry = QUDA_SCALAR_GEOMETRY;
       gauge_param.setPrecision(gauge_param.Precision(), true);
 
-      auto P3 = GaugeField::Create(gauge_param);
-
-      auto Pmu = GaugeField::Create(gauge_param);
-      auto P5 = GaugeField::Create(gauge_param);
-      auto Pnumu = GaugeField::Create(gauge_param);
-      auto Qnumu = GaugeField::Create(gauge_param);
+      auto P3 = GaugeField(gauge_param);
+      auto Pmu = GaugeField(gauge_param);
+      auto P5 = GaugeField(gauge_param);
+      auto Pnumu = GaugeField(gauge_param);
+      auto Qnumu = GaugeField(gauge_param);
 
       // need double buffers for these fields to fuse "side link" terms with
       // subsequent "middle link" terms in a different direction
-      auto Pmu_next = GaugeField::Create(gauge_param);
-      auto Pnumu_next = GaugeField::Create(gauge_param);
-      auto Qnumu_next = GaugeField::Create(gauge_param);
+      auto Pmu_next = GaugeField(gauge_param);
+      auto Pnumu_next = GaugeField(gauge_param);
+      auto Qnumu_next = GaugeField(gauge_param);
 
-      instantiateGaugeStaggered<HisqStaplesForce>(link, *P3, GaugeField_ref(*Pmu),
-        GaugeField_ref(*P5), GaugeField_ref(*Pnumu), GaugeField_ref(*Qnumu),
-        GaugeField_ref(*Pmu_next), GaugeField_ref(*Pnumu_next), GaugeField_ref(*Qnumu_next),
+      instantiateGaugeStaggered<HisqStaplesForce>(link, P3, GaugeField_ref(Pmu),
+        GaugeField_ref(P5), GaugeField_ref(Pnumu), GaugeField_ref(Qnumu),
+        GaugeField_ref(Pmu_next), GaugeField_ref(Pnumu_next), GaugeField_ref(Qnumu_next),
         newOprod, oprod, path_coeff_array);
 
-      delete Pmu;
-      delete P3;
-      delete P5;
-      delete Pnumu;
-      delete Qnumu;
-      delete Pmu_next;
-      delete Pnumu_next;
-      delete Qnumu_next;
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     }
 #else
     void hisqStaplesForce(GaugeField &, const GaugeField &, const GaugeField &, const double[6])
@@ -651,10 +644,12 @@ namespace quda {
 #ifdef GPU_STAGGERED_DIRAC
     void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oldOprod, const GaugeField &link, double coeff)
     {
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       checkNative(link, oldOprod, newOprod);
       checkLocation(newOprod, oldOprod, link);
       checkPrecision(newOprod, link, oldOprod);
       instantiateGaugeStaggered<HisqLongLinkForce>(link, newOprod, oldOprod, coeff);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     }
 #else
     void hisqLongLinkForce(GaugeField &, const GaugeField &, const GaugeField &, double)
@@ -725,10 +720,12 @@ namespace quda {
 #ifdef GPU_STAGGERED_DIRAC
     void hisqCompleteForce(GaugeField &force, const GaugeField &link)
     {
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       checkNative(link, force);
       checkLocation(force, link);
       checkPrecision(link, force);
       instantiateGaugeStaggered<HisqCompleteForce>(link, force);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     }
 #else
     void hisqCompleteForce(GaugeField &, const GaugeField &)
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index a8351fd35e..23d06d3564 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -4057,7 +4057,6 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
   for (int d = 0; d < 4; d++) { input_path_v[d] = input_path_buf[d]; }
 
   // actually do the computation
-  profileGaugeForce.TPSTART(QUDA_PROFILE_COMPUTE);
   if (!forceMonitor()) {
     gaugeForce(cudaMom, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
   } else {
@@ -4068,16 +4067,17 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
     gaugeForce(force, *cudaGauge, 1.0, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
     updateMomentum(cudaMom, eb3, force, "gauge");
   }
-  profileGaugeForce.TPSTOP(QUDA_PROFILE_COMPUTE);
 
   if (qudaGaugeParam->return_result_mom) cpuMom.copy(cudaMom);
 
   if (qudaGaugeParam->make_resident_gauge) {
     if (gaugePrecise && !qudaGaugeParam->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
     std::exchange(*gaugePrecise, cudaSiteLink);
   }
 
-  if (qudaGaugeParam->make_resident_mom && !qudaGaugeParam->use_resident_gauge) std::exchange(momResident, cudaMom);
+  if (qudaGaugeParam->make_resident_mom && !qudaGaugeParam->use_resident_mom)
+    std::exchange(momResident, cudaMom);
   else momResident = GaugeField();
 
   if (qudaGaugeParam->make_resident_gauge) {
@@ -4145,9 +4145,7 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
   for (int d = 0; d < 4; d++) { input_path_v[d] = input_path_buf[d]; }
 
   // actually do the computation
-  profileGaugePath.TPSTART(QUDA_PROFILE_COMPUTE);
   gaugePath(*cudaOut, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
-  profileGaugePath.TPSTOP(QUDA_PROFILE_COMPUTE);
 
   cpuOut->copy(*cudaOut);
 
@@ -4402,7 +4400,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   // mom += delta * [U * force]TA
   applyU(cudaForce, *gaugePrecise);
   updateMomentum(cudaMom, dt * delta, cudaForce, "staggered");
-  qudaDeviceSynchronize();
 
   // copy the momentum field back to the host
   if (gauge_param->return_result_mom) cpuMom.copy(cudaMom);
@@ -4431,15 +4428,12 @@ void computeHISQForceQuda(void* const milc_momentum,
                           QudaGaugeParam* gParam)
 {
   pushProfile(profileHISQForce);
+  checkGaugeParam(gParam);
 
   using namespace quda;
   using namespace quda::fermion_force;
   if (gParam->gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported input field order %d", gParam->gauge_order);
 
-  checkGaugeParam(gParam);
-
-  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
-
   {
     // default settings for the unitarization
     const double unitarize_eps = 1e-14;
@@ -4483,16 +4477,14 @@ void computeHISQForceQuda(void* const milc_momentum,
   oParam.setPrecision(gParam->cpu_prec, true);
   oParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
 
-  GaugeField *stapleOprod = new GaugeField(oParam);
-  GaugeField *oneLinkOprod = new GaugeField(oParam);
-  GaugeField *naikOprod = new GaugeField(oParam);
+  GaugeField stapleOprod(oParam);
+  GaugeField oneLinkOprod(oParam);
+  GaugeField naikOprod(oParam);
 
   double act_path_coeff[6] = {0, 1, level2_coeff[2], level2_coeff[3], level2_coeff[4], level2_coeff[5]};
   // You have to look at the MILC routine to understand the following
   // Basically, I have already absorbed the one-link coefficient
 
-  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
-
   { // do outer-product computation
     ColorSpinorParam qParam;
     qParam.nColor = 3;
@@ -4517,10 +4509,10 @@ void computeHISQForceQuda(void* const milc_momentum,
     qParam.v = fermion[0];
 
     { // regular terms
-      GaugeField *oprod[2] = {stapleOprod, naikOprod};
+      GaugeField *oprod[2] = {&stapleOprod, &naikOprod};
 
       // loop over different quark fields
-      for(int i=0; i<num_terms; ++i){
+      for (int i = 0; i < num_terms; ++i) {
 
         // Wrap the MILC quark field
         profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
@@ -4532,20 +4524,17 @@ void computeHISQForceQuda(void* const milc_momentum,
         cudaQuark = cpuQuark;
         profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
 
-        profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
         computeStaggeredOprod(oprod, cudaQuark, coeff[i], 3);
-        qudaDeviceSynchronize();
-        profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
       }
     }
 
     { // naik terms
-      oneLinkOprod->copy(*stapleOprod);
-      ax(level2_coeff[0], *oneLinkOprod);
-      GaugeField *oprod[2] = {oneLinkOprod, naikOprod};
+      oneLinkOprod.copy(stapleOprod);
+      ax(level2_coeff[0], oneLinkOprod);
+      GaugeField *oprod[2] = {&oneLinkOprod, &naikOprod};
 
       // loop over different quark fields
-      for(int i=0; i<num_naik_terms; ++i){
+      for (int i = 0; i < num_naik_terms; ++i) {
 
         // Wrap the MILC quark field
         profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
@@ -4557,16 +4546,11 @@ void computeHISQForceQuda(void* const milc_momentum,
         cudaQuark = cpuQuark;
         profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
 
-        profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
         computeStaggeredOprod(oprod, cudaQuark, coeff[i + num_terms], 3);
-        qudaDeviceSynchronize();
-        profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
       }
     }
   }
 
-  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
-
   // Compute the pad size
   int pad_size = 0;
 #ifdef MULTI_GPU
@@ -4589,13 +4573,13 @@ void computeHISQForceQuda(void* const milc_momentum,
     oParam.r[dir] = R[dir];
   }
 
-  GaugeField *cudaInForce = new GaugeField(oParam);
-  copyExtendedGauge(*cudaInForce, *stapleOprod, QUDA_CUDA_FIELD_LOCATION);
-  delete stapleOprod;
+  GaugeField cudaInForce(oParam);
+  copyExtendedGauge(cudaInForce, stapleOprod, QUDA_CUDA_FIELD_LOCATION);
+  stapleOprod = GaugeField();
 
-  GaugeField *cudaOutForce = new GaugeField(oParam);
-  copyExtendedGauge(*cudaOutForce, *oneLinkOprod, QUDA_CUDA_FIELD_LOCATION);
-  delete oneLinkOprod;
+  GaugeField cudaOutForce(oParam);
+  copyExtendedGauge(cudaOutForce, oneLinkOprod, QUDA_CUDA_FIELD_LOCATION);
+  oneLinkOprod = GaugeField();
 
   // Create CPU momentum fields, prepare GPU momentum param
   GaugeFieldParam param(*gParam);
@@ -4606,7 +4590,7 @@ void computeHISQForceQuda(void* const milc_momentum,
   param.reconstruct = QUDA_RECONSTRUCT_10;
   param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   param.gauge = milc_momentum;
-  GaugeField *cpuMom = (!gParam->use_resident_mom) ? new GaugeField(param) : nullptr;
+  GaugeField cpuMom = (!gParam->use_resident_mom) ? GaugeField(param) : GaugeField();
 
   param.location = QUDA_CUDA_FIELD_LOCATION;
   param.create = QUDA_ZERO_FIELD_CREATE;
@@ -4650,41 +4634,32 @@ void computeHISQForceQuda(void* const milc_momentum,
   wParam.create = QUDA_NULL_FIELD_CREATE;
   wParam.setPrecision(gParam->cpu_prec, true);
 
-  GaugeField *cudaWLink = new GaugeField(wParam);
-  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
+  GaugeField cudaWLink(wParam);
 
-  cudaWLink->copy(cpuWLink);
+  cudaWLink.copy(cpuWLink);
 
-  cudaWLink->exchangeExtendedGhost(cudaWLink->R(), profileHISQForce);
+  cudaWLink.exchangeExtendedGhost(cudaWLink.R(), profileHISQForce);
 
-  cudaInForce->exchangeExtendedGhost(R, profileHISQForce);
-  cudaWLink->exchangeExtendedGhost(cudaWLink->R(), profileHISQForce);
-  cudaOutForce->exchangeExtendedGhost(R, profileHISQForce);
+  cudaInForce.exchangeExtendedGhost(R, profileHISQForce);
+  cudaWLink.exchangeExtendedGhost(cudaWLink.R(), profileHISQForce);
+  cudaOutForce.exchangeExtendedGhost(R, profileHISQForce);
 
   // Compute level two term
-  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
-  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaWLink, act_path_coeff);
-  qudaDeviceSynchronize();
-  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
+  hisqStaplesForce(cudaOutForce, cudaInForce, cudaWLink, act_path_coeff);
 
   // Load naik outer product
-  copyExtendedGauge(*cudaInForce, *naikOprod, QUDA_CUDA_FIELD_LOCATION);
-  cudaInForce->exchangeExtendedGhost(cudaWLink->R(), profileHISQForce);
-  delete naikOprod;
+  copyExtendedGauge(cudaInForce, naikOprod, QUDA_CUDA_FIELD_LOCATION);
+  cudaInForce.exchangeExtendedGhost(cudaWLink.R(), profileHISQForce);
+  naikOprod = GaugeField();
 
   // Compute Naik three-link term contribution
-  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
-  hisqLongLinkForce(*cudaOutForce, *cudaInForce, *cudaWLink, act_path_coeff[1]);
-  qudaDeviceSynchronize();
-  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
+  hisqLongLinkForce(cudaOutForce, cudaInForce, cudaWLink, act_path_coeff[1]);
 
-  cudaOutForce->exchangeExtendedGhost(R, profileHISQForce);
+  cudaOutForce.exchangeExtendedGhost(R, profileHISQForce);
 
   // Load the V field, which contains general matrices, to the device
-  profileHISQForce.TPSTART(QUDA_PROFILE_FREE);
-  delete cudaWLink;
-  profileHISQForce.TPSTOP(QUDA_PROFILE_FREE);
-  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
+  cudaWLink = GaugeField();
+
   for (int dir = 0; dir < 4; ++dir) {
     vParam.x[dir] += 2 * R[dir];
     vParam.r[dir] = R[dir];
@@ -4696,28 +4671,20 @@ void computeHISQForceQuda(void* const milc_momentum,
   vParam.setPrecision(gParam->cpu_prec, true);
   vParam.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
   vParam.pad = 3 * pad_size;
-  GaugeField *cudaVLink = new GaugeField(vParam);
-  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
+  GaugeField cudaVLink(vParam);
 
-  cudaVLink->copy(cpuVLink);
-  cudaVLink->exchangeExtendedGhost(cudaVLink->R(), profileHISQForce);
+  cudaVLink.copy(cpuVLink);
+  cudaVLink.exchangeExtendedGhost(cudaVLink.R(), profileHISQForce);
 
-  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
   *num_failures_h = 0;
-  unitarizeForce(*cudaInForce, *cudaOutForce, *cudaVLink, num_failures_d);
+  unitarizeForce(cudaInForce, cudaOutForce, cudaVLink, num_failures_d);
 
   if (*num_failures_h>0) errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h);
 
-  cudaOutForce->zero();
-  qudaDeviceSynchronize();
-  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
-
   // Load the U field, which contains U(3) matrices, to the device
   // TODO: in theory these should just be SU(3) matrices with MILC phases?
-  profileHISQForce.TPSTART(QUDA_PROFILE_FREE);
-  delete cudaVLink;
-  profileHISQForce.TPSTOP(QUDA_PROFILE_FREE);
-  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
+  cudaVLink = GaugeField();
+
   for (int dir = 0; dir < 4; ++dir) {
     uParam.x[dir] += 2 * R[dir];
     uParam.r[dir] = R[dir];
@@ -4729,47 +4696,31 @@ void computeHISQForceQuda(void* const milc_momentum,
   uParam.setPrecision(gParam->cpu_prec, true);
   uParam.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
   uParam.pad = 3 * pad_size;
-  GaugeField *cudaULink = new GaugeField(uParam);
-  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
+  GaugeField cudaULink(uParam);
 
-  cudaULink->copy(cpuULink);
-  cudaULink->exchangeExtendedGhost(cudaULink->R(), profileHISQForce);
+  cudaULink.copy(cpuULink);
+  cudaULink.exchangeExtendedGhost(cudaULink.R(), profileHISQForce);
 
   // Compute Fat7-staple term
-  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
-  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaULink, fat7_coeff);
-  qudaDeviceSynchronize();
-  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
-
-  profileHISQForce.TPSTART(QUDA_PROFILE_FREE);
-  delete cudaInForce;
-  profileHISQForce.TPSTOP(QUDA_PROFILE_FREE);
-  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
-  GaugeField* cudaMom = new GaugeField(momParam);
-  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
-
-  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
-  hisqCompleteForce(*cudaOutForce, *cudaULink);
-
-  if (gParam->use_resident_mom) {
-    if (momResident.Length()) errorQuda("No resident momentum field to use");
-    updateMomentum(momResident, dt, *cudaOutForce, "hisq");
-  } else {
-    updateMomentum(*cudaMom, dt, *cudaOutForce, "hisq");
-  }
-  qudaDeviceSynchronize();
-  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
+  cudaOutForce.zero();
+  hisqStaplesForce(cudaOutForce, cudaInForce, cudaULink, fat7_coeff);
 
-  if (gParam->return_result_mom) {
-    // Close the paths, make anti-hermitian, and store in compressed format
-    if (gParam->return_result_mom) cpuMom->copy(*cudaMom);
-  }
+  cudaInForce = GaugeField();
 
-  if (cpuMom) delete cpuMom;
+  hisqCompleteForce(cudaOutForce, cudaULink);
+
+  if (gParam->use_resident_mom && !momResident.Length()) errorQuda("No resident momentum field to use");
+  GaugeField mom = gParam->use_resident_mom ? momResident.create_alias() : GaugeField(momParam);
+  updateMomentum(mom, dt, cudaOutForce, "hisq");
+
+  // Close the paths, make anti-hermitian, and store in compressed format
+  if (gParam->return_result_mom) cpuMom.copy(mom);
   if (!gParam->make_resident_mom) momResident = GaugeField();
-  if (cudaMom) delete cudaMom;
-  delete cudaOutForce;
-  delete cudaULink;
+
+  if (gParam->make_resident_mom && !gParam->use_resident_mom)
+    std::exchange(momResident, mom);
+  else
+    momResident = GaugeField();
 
   popProfile();
 }
@@ -4995,6 +4946,7 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom,
 
   if (param->make_resident_gauge) {
     if (gaugePrecise && !param->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
     std::exchange(*gaugePrecise, u_out);
   }
 
@@ -5004,126 +4956,87 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom,
   popProfile();
 }
 
- void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) {
-   profileProject.TPSTART(QUDA_PROFILE_TOTAL);
-
-   profileProject.TPSTART(QUDA_PROFILE_INIT);
-   checkGaugeParam(param);
-
-   // create the gauge field
-   GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS);
-   gParam.location = QUDA_CPU_FIELD_LOCATION;
-   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
-   GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr;
-
-   // create the device fields
-   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-   gParam.create = QUDA_NULL_FIELD_CREATE;
-   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
-   gParam.reconstruct = param->reconstruct;
-   GaugeField *cudaGauge = !param->use_resident_gauge ? new GaugeField(gParam) : nullptr;
-   profileProject.TPSTOP(QUDA_PROFILE_INIT);
-
-   if (param->use_resident_gauge) {
-     if (!gaugePrecise) errorQuda("No resident gauge field to use");
-     cudaGauge = gaugePrecise;
-     gaugePrecise = nullptr;
-   } else {
-     profileProject.TPSTART(QUDA_PROFILE_H2D);
-     cudaGauge->copy(*cpuGauge);
-     profileProject.TPSTOP(QUDA_PROFILE_H2D);
-   }
+void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
+{
+  pushProfile(profileProject);
+  checkGaugeParam(param);
 
-   profileProject.TPSTART(QUDA_PROFILE_COMPUTE);
-   *num_failures_h = 0;
+  // create the gauge field
+  GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS);
+  gParam.location = QUDA_CPU_FIELD_LOCATION;
+  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
+  GaugeField cpuGauge = need_cpu ? GaugeField(gParam) : GaugeField();
 
-   // project onto SU(3)
-   if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase();
-   projectSU3(*cudaGauge, tol, num_failures_d);
-   if (!cudaGauge->StaggeredPhaseApplied() && param->staggered_phase_applied) cudaGauge->applyStaggeredPhase();
+  // create the device fields
+  if (param->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use");
+  gParam.location = QUDA_CUDA_FIELD_LOCATION;
+  gParam.create = QUDA_COPY_FIELD_CREATE;
+  gParam.field = &cpuGauge;
+  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
+  gParam.reconstruct = param->reconstruct;
+  GaugeField cudaGauge = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
 
-   profileProject.TPSTOP(QUDA_PROFILE_COMPUTE);
+  *num_failures_h = 0;
 
-   if(*num_failures_h>0)
-     errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
+  // project onto SU(3)
+  if (cudaGauge.StaggeredPhaseApplied()) cudaGauge.removeStaggeredPhase();
+  projectSU3(cudaGauge, tol, num_failures_d);
+  if (!cudaGauge.StaggeredPhaseApplied() && param->staggered_phase_applied) cudaGauge.applyStaggeredPhase();
 
-   if (param->return_result_gauge) {
-     profileProject.TPSTART(QUDA_PROFILE_D2H);
-     cpuGauge->copy(*cudaGauge);
-     profileProject.TPSTOP(QUDA_PROFILE_D2H);
-   }
+  if (*num_failures_h > 0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
 
-   if (param->make_resident_gauge) {
-     if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-     gaugePrecise = cudaGauge;
-   } else {
-     delete cudaGauge;
-   }
+  if (param->return_result_gauge) cpuGauge.copy(cudaGauge);
 
-   profileProject.TPSTART(QUDA_PROFILE_FREE);
-   if (cpuGauge) delete cpuGauge;
-   profileProject.TPSTOP(QUDA_PROFILE_FREE);
-
-   profileProject.TPSTOP(QUDA_PROFILE_TOTAL);
- }
-
- void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) {
-   profilePhase.TPSTART(QUDA_PROFILE_TOTAL);
-
-   profilePhase.TPSTART(QUDA_PROFILE_INIT);
-   checkGaugeParam(param);
-
-   // create the gauge field
-   GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS);
-   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
-   gParam.location = QUDA_CPU_FIELD_LOCATION;
-   GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr;
-
-   // create the device fields
-   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-   gParam.create = QUDA_NULL_FIELD_CREATE;
-   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
-   gParam.reconstruct = param->reconstruct;
-   GaugeField *cudaGauge = !param->use_resident_gauge ? new GaugeField(gParam) : nullptr;
-   profilePhase.TPSTOP(QUDA_PROFILE_INIT);
-
-   if (param->use_resident_gauge) {
-     if (!gaugePrecise) errorQuda("No resident gauge field to use");
-     cudaGauge = gaugePrecise;
-   } else {
-     profilePhase.TPSTART(QUDA_PROFILE_H2D);
-     cudaGauge->copy(*cpuGauge);
-     profilePhase.TPSTOP(QUDA_PROFILE_H2D);
-   }
+  if (param->make_resident_gauge) {
+    if (gaugePrecise != nullptr && !param->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
+    std::exchange(*gaugePrecise, cudaGauge);
+  }
 
-   profilePhase.TPSTART(QUDA_PROFILE_COMPUTE);
-   *num_failures_h = 0;
+  popProfile();
+}
 
-   // apply / remove phase as appropriate
-   if (!cudaGauge->StaggeredPhaseApplied()) cudaGauge->applyStaggeredPhase();
-   else cudaGauge->removeStaggeredPhase();
+void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
+{
+  pushProfile(profilePhase);
+  checkGaugeParam(param);
 
-   profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE);
+  // create the gauge field
+  GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS);
+  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
+  gParam.location = QUDA_CPU_FIELD_LOCATION;
+  GaugeField cpuGauge = need_cpu ? GaugeField(gParam) : GaugeField();
 
-   if (param->return_result_gauge) {
-     profilePhase.TPSTART(QUDA_PROFILE_D2H);
-     cpuGauge->copy(*cudaGauge);
-     profilePhase.TPSTOP(QUDA_PROFILE_D2H);
-   }
+  // create the device fields
+  if (param->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use");
+  gParam.location = QUDA_CUDA_FIELD_LOCATION;
+  gParam.create = QUDA_COPY_FIELD_CREATE;
+  gParam.field = &cpuGauge;
+  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
+  gParam.reconstruct = param->reconstruct;
+  GaugeField cudaGauge = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
 
-   if (param->make_resident_gauge) {
-     if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-     gaugePrecise = cudaGauge;
-   } else {
-     delete cudaGauge;
-   }
+  profilePhase.TPSTART(QUDA_PROFILE_COMPUTE);
+  *num_failures_h = 0;
+
+  // apply / remove phase as appropriate
+  if (!cudaGauge.StaggeredPhaseApplied())
+    cudaGauge.applyStaggeredPhase();
+  else
+    cudaGauge.removeStaggeredPhase();
 
-   profilePhase.TPSTART(QUDA_PROFILE_FREE);
-   if (cpuGauge) delete cpuGauge;
-   profilePhase.TPSTOP(QUDA_PROFILE_FREE);
+  profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-   profilePhase.TPSTOP(QUDA_PROFILE_TOTAL);
- }
+  if (param->return_result_gauge) cpuGauge.copy(cudaGauge);
+
+  if (param->make_resident_gauge) {
+    if (gaugePrecise != nullptr && !param->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
+    std::exchange(*gaugePrecise, cudaGauge);
+  }
+
+  popProfile();
+}
 
 // evaluate the momentum action
 double momActionQuda(void* momentum, QudaGaugeParam* param)
@@ -5149,7 +5062,8 @@ double momActionQuda(void* momentum, QudaGaugeParam* param)
   // perform the update
   double action = computeMomAction(cudaMom);
 
-  if (param->make_resident_mom && !param->use_resident_gauge) std::exchange(momResident, cudaMom);
+  if (param->make_resident_mom && !param->use_resident_mom)
+    std::exchange(momResident, cudaMom);
   else momResident = GaugeField();
 
   popProfile();
@@ -5389,7 +5303,6 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   ColorSpinorField out(cudaParam);
   ColorSpinorField temp1(cudaParam);
  
-
   // Create the smearing operator
   //------------------------------------------------------
   Dirac *d       = nullptr;
@@ -5470,8 +5383,8 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
 
 void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param)
 {
+  pushProfile(profileGaugeSmear);
   pushOutputPrefix("performGaugeSmearQuda: ");
-  profileGaugeSmear.TPSTART(QUDA_PROFILE_TOTAL);
   checkGaugeSmearParam(smear_param);
 
   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
@@ -5480,7 +5393,7 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
 
   GaugeFieldParam gParam(*gaugeSmeared);
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-  auto *cudaGaugeTemp = new GaugeField(gParam);
+  GaugeField tmp(gParam);
 
   int measurement_n = 0; // The nth measurement to take
   gaugeObservablesQuda(&obs_param[measurement_n]);
@@ -5489,18 +5402,15 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
   }
 
   for (unsigned int i = 0; i < smear_param->n_steps; i++) {
-    profileGaugeSmear.TPSTART(QUDA_PROFILE_COMPUTE);
-
     switch (smear_param->smear_type) {
-    case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, *cudaGaugeTemp, smear_param->alpha); break;
-    case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, *cudaGaugeTemp, smear_param->rho); break;
+    case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha); break;
+    case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho); break;
     case QUDA_GAUGE_SMEAR_OVRIMP_STOUT:
-      OvrImpSTOUTStep(*gaugeSmeared, *cudaGaugeTemp, smear_param->rho, smear_param->epsilon);
+      OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon);
       break;
     default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type);
     }
 
-    profileGaugeSmear.TPSTOP(QUDA_PROFILE_COMPUTE);
     if ((i + 1) % smear_param->meas_interval == 0) {
       measurement_n++;
       gaugeObservablesQuda(&obs_param[measurement_n]);
@@ -5510,15 +5420,14 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
     }
   }
 
-  delete cudaGaugeTemp;
-  profileGaugeSmear.TPSTOP(QUDA_PROFILE_TOTAL);
   popOutputPrefix();
+  popProfile();
 }
 
 void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param)
 {
+  pushProfile(profileWFlow);
   pushOutputPrefix("performWFlowQuda: ");
-  profileWFlow.TPSTART(QUDA_PROFILE_TOTAL);
   checkGaugeSmearParam(smear_param);
 
   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
@@ -5526,18 +5435,18 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam
   gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileWFlow);
 
   GaugeFieldParam gParamEx(*gaugeSmeared);
-  auto *gaugeAux = GaugeField::Create(gParamEx);
+  GaugeField gaugeAux(gParamEx);
 
   GaugeFieldParam gParam(*gaugePrecise);
   gParam.reconstruct = QUDA_RECONSTRUCT_NO; // temporary field is not on manifold so cannot use reconstruct
-  auto *gaugeTemp = GaugeField::Create(gParam);
+  GaugeField gaugeTemp(gParam);
 
-  GaugeField *in = gaugeSmeared;
-  GaugeField *out = gaugeAux;
+  GaugeField &in = *gaugeSmeared;
+  GaugeField &out = gaugeAux;
 
   int measurement_n = 0; // The nth measurement to take
 
-  gaugeObservables(*in, obs_param[measurement_n], profileWFlow);
+  gaugeObservables(in, obs_param[measurement_n]);
 
   if (getVerbosity() >= QUDA_SUMMARIZE) {
     printfQuda("flow t, plaquette, E_tot, E_spatial, E_temporal, Q charge\n");
@@ -5548,14 +5457,12 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam
   for (unsigned int i = 0; i < smear_param->n_steps; i++) {
     // Perform W1, W2, and Vt Wilson Flow steps as defined in
     // https://arxiv.org/abs/1006.4518v3
-    profileWFlow.TPSTART(QUDA_PROFILE_COMPUTE);
     if (i > 0) std::swap(in, out); // output from prior step becomes input for next step
-    WFlowStep(*out, *gaugeTemp, *in, smear_param->epsilon, smear_param->smear_type);
-    profileWFlow.TPSTOP(QUDA_PROFILE_COMPUTE);
+    WFlowStep(out, gaugeTemp, in, smear_param->epsilon, smear_param->smear_type);
 
     if ((i + 1) % smear_param->meas_interval == 0) {
       measurement_n++; // increment measurements.
-      gaugeObservables(*out, obs_param[measurement_n], profileWFlow);
+      gaugeObservables(out, obs_param[measurement_n]);
       if (getVerbosity() >= QUDA_SUMMARIZE) {
         printfQuda("%le %.16e %+.16e %+.16e %+.16e %+.16e\n", smear_param->epsilon * (i + 1),
                    obs_param[measurement_n].plaquette[0], obs_param[measurement_n].energy[0],
@@ -5565,153 +5472,98 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam
     }
   }
 
-  delete gaugeTemp;
-  delete gaugeAux;
-  profileWFlow.TPSTOP(QUDA_PROFILE_TOTAL);
   popOutputPrefix();
+  popProfile();
 }
 
 int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps,
                               const unsigned int verbose_interval, const double relax_boost, const double tolerance,
-                              const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param,
-                              double *timeinfo)
+                              const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param)
 {
-  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_TOTAL);
-
+  pushProfile(GaugeFixOVRQuda);
   checkGaugeParam(param);
 
-  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_INIT);
-
   GaugeFieldParam gParam(*param, gauge);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  auto *cpuGauge = new GaugeField(gParam);
+  GaugeField cpuGauge(gParam);
 
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.link_type = param->type;
   gParam.reconstruct = param->reconstruct;
   gParam.setPrecision(gParam.Precision(), true);
-  auto *cudaInGauge = new GaugeField(gParam);
-
-  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_INIT);
-  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_H2D);
-
-  cudaInGauge->copy(*cpuGauge);
+  GaugeField cudaInGauge(gParam);
 
-  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_H2D);
+  cudaInGauge.copy(cpuGauge);
 
-  GaugeField *cudaInGaugeEx = nullptr;
-
-  if (comm_size() == 1) {
-    // perform the update
-    GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_COMPUTE);
-    gaugeFixingOVR(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
-                   stopWtheta);
-    GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_COMPUTE);
-  } else {
-    cudaInGaugeEx = createExtendedGauge(*cudaInGauge, R, GaugeFixOVRQuda);
+  GaugeField *cudaInGaugeEx = createExtendedGauge(cudaInGauge, R, GaugeFixOVRQuda);
 
-    // perform the update
-    GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_COMPUTE);
-    gaugeFixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
-                   stopWtheta);
-    GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_COMPUTE);
+  // perform the update
+  gaugeFixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
+                 stopWtheta);
 
-    copyExtendedGauge(*cudaInGauge, *cudaInGaugeEx, QUDA_CUDA_FIELD_LOCATION);
-  }
+  copyExtendedGauge(cudaInGauge, *cudaInGaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
   // copy the gauge field back to the host
-  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_D2H);
-  cpuGauge->copy(*cudaInGauge);
-  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_D2H);
-
-  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_TOTAL);
+  cpuGauge.copy(cudaInGauge);
 
   if (param->make_resident_gauge) {
-    if (gaugePrecise != nullptr) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-    gaugePrecise = cudaInGauge;
+    freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
+    std::exchange(*gaugePrecise, cudaInGauge);
     if (extendedGaugeResident) delete extendedGaugeResident;
     extendedGaugeResident = cudaInGaugeEx;
   } else {
-    delete cudaInGauge;
-    if (cudaInGaugeEx) delete cudaInGaugeEx;
-  }
-
-  delete cpuGauge;
-
-  if(timeinfo){
-    timeinfo[0] = GaugeFixOVRQuda.Last(QUDA_PROFILE_H2D);
-    timeinfo[1] = GaugeFixOVRQuda.Last(QUDA_PROFILE_COMPUTE);
-    timeinfo[2] = GaugeFixOVRQuda.Last(QUDA_PROFILE_D2H);
+    delete cudaInGaugeEx;
   }
 
+  popProfile();
   return 0;
 }
 
-int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir,  const unsigned int Nsteps, \
-  const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, \
-  const unsigned int  stopWtheta, QudaGaugeParam* param , double* timeinfo)
+int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps,
+                              const unsigned int verbose_interval, const double alpha, const unsigned int autotune,
+                              const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param)
 {
-  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_TOTAL);
-
+  pushProfile(GaugeFixFFTQuda);
   checkGaugeParam(param);
 
-  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_INIT);
-
   GaugeFieldParam gParam(*param, gauge);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  auto *cpuGauge = new GaugeField(gParam);
+  GaugeField cpuGauge(gParam);
 
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.link_type = param->type;
   gParam.reconstruct = param->reconstruct;
   gParam.setPrecision(gParam.Precision(), true);
-  auto *cudaInGauge = new GaugeField(gParam);
+  GaugeField cudaInGauge(gParam);
 
-  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_INIT);
-
-  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_H2D);
-  cudaInGauge->copy(*cpuGauge);
-  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_H2D);
+  cudaInGauge.copy(cpuGauge);
 
   // perform the update
-  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_COMPUTE);
-
-  gaugeFixingFFT(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
-
-  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_COMPUTE);
+  gaugeFixingFFT(cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
 
   // copy the gauge field back to the host
-  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_D2H);
-  cpuGauge->copy(*cudaInGauge);
-  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_D2H);
-
-  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_TOTAL);
+  cpuGauge.copy(cudaInGauge);
 
   if (param->make_resident_gauge) {
-    if (gaugePrecise != nullptr) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-    gaugePrecise = cudaInGauge;
-  } else {
-    delete cudaInGauge;
-  }
-
-  if (timeinfo) {
-    timeinfo[0] = GaugeFixFFTQuda.Last(QUDA_PROFILE_H2D);
-    timeinfo[1] = GaugeFixFFTQuda.Last(QUDA_PROFILE_COMPUTE);
-    timeinfo[2] = GaugeFixFFTQuda.Last(QUDA_PROFILE_D2H);
+    freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
+    std::exchange(*gaugePrecise, cudaInGauge);
   }
 
+  popProfile();
   return 0;
 }
 
 void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const QudaContractType cType,
                   QudaInvertParam *param, const int *X)
 {
+  pushProfile(profileContract);
   // DMH: Easiest way to construct ColorSpinorField? Do we require the user
   //     to declare and fill and invert_param, or can it just be hacked?.
 
-  profileContract.TPSTART(QUDA_PROFILE_TOTAL);
   profileContract.TPSTART(QUDA_PROFILE_INIT);
 
   // wrap CPU host side pointers
@@ -5743,21 +5595,19 @@ void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const Quda
   y[0] = h_y;
   profileContract.TPSTOP(QUDA_PROFILE_H2D);
 
-  profileContract.TPSTART(QUDA_PROFILE_COMPUTE);
   contractQuda(x[0], y[0], d_result, cType);
-  profileContract.TPSTOP(QUDA_PROFILE_COMPUTE);
 
   profileContract.TPSTART(QUDA_PROFILE_D2H);
   qudaMemcpy(h_result, d_result, data_bytes, qudaMemcpyDeviceToHost);
   profileContract.TPSTOP(QUDA_PROFILE_D2H);
 
   pool_device_free(d_result);
-  profileContract.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
 }
 
 void gaugeObservablesQuda(QudaGaugeObservableParam *param)
 {
-  profileGaugeObs.TPSTART(QUDA_PROFILE_TOTAL);
+  pushProfile(profileGaugeObs);
   checkGaugeObservableParam(param);
 
   if (!gaugePrecise) errorQuda("Cannot compute Polyakov loop as there is no resident gauge field");
@@ -5778,6 +5628,6 @@ void gaugeObservablesQuda(QudaGaugeObservableParam *param)
       errorQuda("Removing staggered phases was requested, however staggered phases aren't already applied");
   }
 
-  gaugeObservables(*gauge, *param, profileGaugeObs);
-  profileGaugeObs.TPSTOP(QUDA_PROFILE_TOTAL);
+  gaugeObservables(*gauge, *param);
+  popProfile();
 }
diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp
index 8f33083574..1c9e25cb54 100644
--- a/lib/milc_interface.cpp
+++ b/lib/milc_interface.cpp
@@ -3007,14 +3007,8 @@ void qudaGaugeFixingOVR(int precision, unsigned int gauge_dir, int Nsteps, int v
   qudaGaugeParam.site_size = arg->size;
   qudaGaugeParam.gauge_order = arg->site ? QUDA_MILC_SITE_GAUGE_ORDER : QUDA_MILC_GAUGE_ORDER;
 
-  double timeinfo[3];
   computeGaugeFixingOVRQuda(gauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
-                            stopWtheta, &qudaGaugeParam, timeinfo);
-
-  printfQuda("Time H2D: %lf\n", timeinfo[0]);
-  printfQuda("Time to Compute: %lf\n", timeinfo[1]);
-  printfQuda("Time D2H: %lf\n", timeinfo[2]);
-  printfQuda("Time all: %lf\n", timeinfo[0]+timeinfo[1]+timeinfo[2]);
+                            stopWtheta, &qudaGaugeParam);
 
   qudamilc_called<false>(__func__, verbosity);
 }
@@ -3036,13 +3030,6 @@ void qudaGaugeFixingFFT( int precision,
   qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
   //qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
 
-
-  double timeinfo[3];
-  computeGaugeFixingFFTQuda(milc_sitelink, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta, \
-    &qudaGaugeParam, timeinfo);
-
-  printfQuda("Time H2D: %lf\n", timeinfo[0]);
-  printfQuda("Time to Compute: %lf\n", timeinfo[1]);
-  printfQuda("Time D2H: %lf\n", timeinfo[2]);
-  printfQuda("Time all: %lf\n", timeinfo[0]+timeinfo[1]+timeinfo[2]);
+  computeGaugeFixingFFTQuda(milc_sitelink, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta,
+                            &qudaGaugeParam);
 }
diff --git a/lib/staggered_oprod.cu b/lib/staggered_oprod.cu
index 1f9cbcccf7..bd085c899b 100644
--- a/lib/staggered_oprod.cu
+++ b/lib/staggered_oprod.cu
@@ -2,7 +2,6 @@
 #include <tunable_nd.h>
 #include <instantiate.h>
 #include <kernels/staggered_outer_product.cuh>
-#include "timer.h"
 
 namespace quda {
 
diff --git a/lib/unitarize_force_quda.cu b/lib/unitarize_force_quda.cu
index 29b315d2ef..84b94a0d54 100644
--- a/lib/unitarize_force_quda.cu
+++ b/lib/unitarize_force_quda.cu
@@ -56,6 +56,7 @@ namespace quda {
     void unitarizeForce(GaugeField &newForce, const GaugeField &oldForce, const GaugeField &u,
 			int* fails)
     {
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       checkReconstruct(u, oldForce, newForce);
       checkPrecision(u, oldForce, newForce);
 
@@ -63,6 +64,7 @@ namespace quda {
         errorQuda("Only native order supported");
 
       instantiate<ForceUnitarize, ReconstructNone>(newForce, oldForce, u, fails);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     }
 #else
     void unitarizeForce(GaugeField &, const GaugeField &, const GaugeField &, int*)
diff --git a/lib/unitarize_links_quda.cu b/lib/unitarize_links_quda.cu
index 2bdd24880a..83ea615c48 100644
--- a/lib/unitarize_links_quda.cu
+++ b/lib/unitarize_links_quda.cu
@@ -141,8 +141,10 @@ namespace quda {
 
   void unitarizeLinks(GaugeField& out, const GaugeField &in, int* fails)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     checkPrecision(out, in);
     instantiate<UnitarizeLinks, ReconstructNo12>(out, in, fails);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
   void unitarizeLinks(GaugeField &links, int* fails) { unitarizeLinks(links, links, fails); }
@@ -182,11 +184,13 @@ namespace quda {
 
   void projectSU3(GaugeField &u, double tol, int *fails)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     // check the the field doesn't have staggered phases applied
     if (u.StaggeredPhaseApplied())
       errorQuda("Cannot project gauge field with staggered phases applied");
 
     instantiate<ProjectSU3>(u, tol, fails);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda

From a75fad1244dbc01930f23d5d34af7109bc824b90 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 22 May 2023 14:35:08 -0700
Subject: [PATCH 13/60] More interface code related cleanup

---
 include/llfat_quda.h           |   4 +-
 lib/interface_quda.cpp         | 141 ++++++++-------------------------
 lib/llfat_quda.cu              |  33 ++++----
 lib/staggered_two_link_quda.cu |   2 +
 4 files changed, 55 insertions(+), 125 deletions(-)

diff --git a/include/llfat_quda.h b/include/llfat_quda.h
index 696c67d3f8..0bf9f5b249 100644
--- a/include/llfat_quda.h
+++ b/include/llfat_quda.h
@@ -11,7 +11,7 @@ namespace quda {
      @param u[in] The input gauge field
      @param coeff[in] Array of path coefficients
   */
-  void fatKSLink(GaugeField *fat, const GaugeField &u, const double *coeff);
+  void fatKSLink(GaugeField &fat, const GaugeField &u, const double *coeff);
 
   /**
      @brief Compute the long links for an improved staggered (Kogut-Susskind) fermions.
@@ -19,6 +19,6 @@ namespace quda {
      @param u[in] The input gauge field
      @param coeff[in] Array of path coefficients
   */
-  void longKSLink(GaugeField *lng, const GaugeField &u, const double *coeff);
+  void longKSLink(GaugeField &lng, const GaugeField &u, const double *coeff);
 
 } // namespace quda
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 23d06d3564..e39bf48b5c 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3826,9 +3826,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
 
 void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
 {
-  profileFatLink.TPSTART(QUDA_PROFILE_TOTAL);
-  profileFatLink.TPSTART(QUDA_PROFILE_INIT);
-
+  pushProfile(profileFatLink);
   checkGaugeParam(param);
 
   GaugeFieldParam gParam(*param, fatlink, QUDA_GENERAL_LINKS);
@@ -3848,16 +3846,11 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
   gParam.setPrecision(param->cuda_prec, true);
   gParam.create = QUDA_NULL_FIELD_CREATE;
   GaugeField *cudaInLink = new GaugeField(gParam);
-  profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
 
-  profileFatLink.TPSTART(QUDA_PROFILE_H2D);
   cudaInLink->copy(cpuInLink);
-  profileFatLink.TPSTOP(QUDA_PROFILE_H2D);
   GaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink);
 
-  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
   delete cudaInLink;
-  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
 
   gParam.create = QUDA_ZERO_FIELD_CREATE;
   gParam.link_type = QUDA_GENERAL_LINKS;
@@ -3866,40 +3859,14 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
 
   if (longlink) {
-    profileFatLink.TPSTART(QUDA_PROFILE_INIT);
-    GaugeField *cudaLongLink = new GaugeField(gParam);
-    profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
-
-    profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
-    longKSLink(cudaLongLink, *cudaInLinkEx, path_coeff);
-    profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
-
-    profileFatLink.TPSTART(QUDA_PROFILE_D2H);
-    cpuLongLink.copy(*cudaLongLink);
-    profileFatLink.TPSTOP(QUDA_PROFILE_D2H);
-
-    profileFatLink.TPSTART(QUDA_PROFILE_FREE);
-    delete cudaLongLink;
-    profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
+    GaugeField longLink(gParam);
+    longKSLink(longLink, *cudaInLinkEx, path_coeff);
+    cpuLongLink.copy(longLink);
   }
 
-  profileFatLink.TPSTART(QUDA_PROFILE_INIT);
-  GaugeField *cudaFatLink = new GaugeField(gParam);
-  profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
-
-  profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
-  fatKSLink(cudaFatLink, *cudaInLinkEx, path_coeff);
-  profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
-
-  if (fatlink) {
-    profileFatLink.TPSTART(QUDA_PROFILE_D2H);
-    cpuFatLink.copy(*cudaFatLink);
-    profileFatLink.TPSTOP(QUDA_PROFILE_D2H);
-  }
-
-  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
-  delete cudaInLinkEx;
-  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
+  GaugeField fatLink(gParam);
+  fatKSLink(fatLink, *cudaInLinkEx, path_coeff);
+  if (fatlink) cpuFatLink.copy(fatLink);
 
   if (ulink) {
     const double unitarize_eps = 1e-14;
@@ -3911,42 +3878,28 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
     quda::setUnitarizeLinksConstants(unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error,
                                      svd_abs_error);
 
-    GaugeField *cudaUnitarizedLink = new GaugeField(gParam);
+    GaugeField unitarizedLink(gParam);
 
-    profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
     *num_failures_h = 0;
-    quda::unitarizeLinks(*cudaUnitarizedLink, *cudaFatLink, num_failures_d); // unitarize on the gpu
+    quda::unitarizeLinks(unitarizedLink, fatLink, num_failures_d); // unitarize on the gpu
     if (*num_failures_h > 0)
       errorQuda("Error in unitarization component of the hisq fattening: %d failures", *num_failures_h);
-    profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
-
-    profileFatLink.TPSTART(QUDA_PROFILE_D2H);
-    cpuUnitarizedLink.copy(*cudaUnitarizedLink);
-    profileFatLink.TPSTOP(QUDA_PROFILE_D2H);
 
-    profileFatLink.TPSTART(QUDA_PROFILE_FREE);
-    delete cudaUnitarizedLink;
-    profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
+    cpuUnitarizedLink.copy(unitarizedLink);
   }
 
-  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
-  delete cudaFatLink;
-  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
-
-  profileFatLink.TPSTOP(QUDA_PROFILE_TOTAL);
+  delete cudaInLinkEx;
+  popProfile();
 }
 
 void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
 {
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_TOTAL);
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT);
-
+  pushProfile(profileGaussianSmear);
   checkGaugeParam(param);
 
   GaugeFieldParam gParam(*param, inlink, QUDA_ASQTAD_LONG_LINKS);
   gParam.gauge = twolink;
   GaugeField cpuTwoLink(gParam);  // create the host twolink
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
 
   GaugeField *cudaInLinkEx = nullptr;
 
@@ -3962,9 +3915,7 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
     GaugeField cudaInLink(gParam);
     profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
 
-    profileGaussianSmear.TPSTART(QUDA_PROFILE_H2D);
     cudaInLink.copy(cpuInLink);
-    profileGaussianSmear.TPSTOP(QUDA_PROFILE_H2D);
     cudaInLinkEx = createExtendedGauge(cudaInLink, R, profileGaussianSmear);
   } else {
     cudaInLinkEx = createExtendedGauge(*gaugePrecise, R, profileGaussianSmear);
@@ -3980,30 +3931,18 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
   gsParam.nFace         = 3;
   gsParam.pad           = gsParam.pad*gsParam.nFace;
 
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT);
-
   freeUniqueGaugeQuda(QUDA_SMEARED_LINKS);
   gaugeSmeared = new GaugeField(gsParam);
   
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
-
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_COMPUTE);
-
   computeTwoLink(*gaugeSmeared, *cudaInLinkEx);
   gaugeSmeared->exchangeGhost();
 
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_COMPUTE);
-
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_D2H);
   cpuTwoLink.copy(*gaugeSmeared);
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_D2H);
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE);
 
   freeUniqueGaugeQuda(QUDA_SMEARED_LINKS);
   delete cudaInLinkEx;
 
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_FREE);
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
 }
 
 int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int* path_length,
@@ -4017,11 +3956,11 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
   GaugeField cpuSiteLink = !qudaGaugeParam->use_resident_gauge ? GaugeField(gParam) : GaugeField();
 
   if (qudaGaugeParam->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use");
+  gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.create = QUDA_COPY_FIELD_CREATE;
   gParam.field = &cpuSiteLink;
   gParam.reconstruct = qudaGaugeParam->reconstruct;
   gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
-  gParam.location = QUDA_CUDA_FIELD_LOCATION;
   GaugeField cudaSiteLink = qudaGaugeParam->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
 
   GaugeFieldParam gParamMom(*qudaGaugeParam, mom, QUDA_ASQTAD_MOM_LINKS);
@@ -4099,36 +4038,27 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
 
   GaugeFieldParam gParam(*qudaGaugeParam, siteLink);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  GaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new GaugeField(gParam) : nullptr;
-
-  GaugeField *cudaSiteLink = nullptr;
-
-  if (qudaGaugeParam->use_resident_gauge) {
-    if (!gaugePrecise) errorQuda("No resident gauge field to use");
-    cudaSiteLink = gaugePrecise;
-  } else {
-    gParam.location = QUDA_CUDA_FIELD_LOCATION;
-    gParam.create = QUDA_NULL_FIELD_CREATE;
-    gParam.reconstruct = qudaGaugeParam->reconstruct;
-    gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
+  GaugeField cpuSiteLink = !qudaGaugeParam->use_resident_gauge ? GaugeField(gParam) : GaugeField();
 
-    cudaSiteLink = new GaugeField(gParam);
-    cudaSiteLink->copy(*cpuSiteLink);
-  }
+  if (qudaGaugeParam->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use");
+  gParam.location = QUDA_CUDA_FIELD_LOCATION;
+  gParam.create = QUDA_COPY_FIELD_CREATE;
+  gParam.field  = &cpuSiteLink;
+  gParam.reconstruct = qudaGaugeParam->reconstruct;
+  gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
+  GaugeField cudaSiteLink = qudaGaugeParam->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
 
   GaugeFieldParam gParamOut(*qudaGaugeParam, out);
   gParamOut.location = QUDA_CPU_FIELD_LOCATION;
-  GaugeField *cpuOut = new GaugeField(gParamOut);
+  GaugeField cpuOut = GaugeField(gParamOut);
   gParamOut.location = QUDA_CUDA_FIELD_LOCATION;
-  gParamOut.create = qudaGaugeParam->overwrite_gauge ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
+  gParamOut.create = qudaGaugeParam->overwrite_gauge ? QUDA_ZERO_FIELD_CREATE : QUDA_COPY_FIELD_CREATE;
+  gParamOut.field = &cpuOut;
   gParamOut.reconstruct = QUDA_RECONSTRUCT_NO;
   gParamOut.setPrecision(qudaGaugeParam->cuda_prec, true);
-  GaugeField *cudaOut = new GaugeField(gParamOut);
-  if (!qudaGaugeParam->overwrite_gauge) {
-    cudaOut->copy(*cpuOut);
-  }
+  GaugeField cudaOut(gParamOut);
 
-  GaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugePath);
+  GaugeField *cudaGauge = createExtendedGauge(cudaSiteLink, R, profileGaugePath);
   // apply / remove phase as appropriate
   if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase();
 
@@ -4145,25 +4075,20 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
   for (int d = 0; d < 4; d++) { input_path_v[d] = input_path_buf[d]; }
 
   // actually do the computation
-  gaugePath(*cudaOut, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
+  gaugePath(cudaOut, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
 
-  cpuOut->copy(*cudaOut);
+  cpuOut.copy(cudaOut);
 
   if (qudaGaugeParam->make_resident_gauge) {
-    if (gaugePrecise && gaugePrecise != cudaSiteLink) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-    gaugePrecise = cudaSiteLink;
+    if (gaugePrecise && !qudaGaugeParam->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
+    std::exchange(*gaugePrecise, cudaSiteLink);
     if (extendedGaugeResident) delete extendedGaugeResident;
     extendedGaugeResident = cudaGauge;
   } else {
-    delete cudaSiteLink;
     delete cudaGauge;
   }
 
-  delete cudaOut;
-
-  if (cpuSiteLink) delete cpuSiteLink;
-  if (cpuOut) delete cpuOut;
-
   popProfile();
   return 0;
 }
diff --git a/lib/llfat_quda.cu b/lib/llfat_quda.cu
index f39233aeea..8ac2e25d36 100644
--- a/lib/llfat_quda.cu
+++ b/lib/llfat_quda.cu
@@ -166,46 +166,50 @@ namespace quda {
   }
 
 #ifdef GPU_STAGGERED_DIRAC
-  void longKSLink(GaugeField *lng, const GaugeField &u, const double *coeff)
+  void longKSLink(GaugeField &lng, const GaugeField &u, const double *coeff)
   {
-    computeLongLink(*lng, u, coeff[1]);
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
+    computeLongLink(lng, u, coeff[1]);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
-  void fatKSLink(GaugeField *fat, const GaugeField& u, const double *coeff)
+  void fatKSLink(GaugeField &fat, const GaugeField& u, const double *coeff)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
+
     GaugeFieldParam gParam(u);
     gParam.reconstruct = QUDA_RECONSTRUCT_NO;
     gParam.setPrecision(gParam.Precision());
     gParam.create = QUDA_NULL_FIELD_CREATE;
-    auto staple = GaugeField::Create(gParam);
-    auto staple1 = GaugeField::Create(gParam);
+    GaugeField staple(gParam);
+    GaugeField staple1(gParam);
 
-    if ( ((fat->X()[0] % 2 != 0) || (fat->X()[1] % 2 != 0) || (fat->X()[2] % 2 != 0) || (fat->X()[3] % 2 != 0))
+    if ( ((fat.X()[0] % 2 != 0) || (fat.X()[1] % 2 != 0) || (fat.X()[2] % 2 != 0) || (fat.X()[3] % 2 != 0))
 	&& (u.Reconstruct()  != QUDA_RECONSTRUCT_NO)){
       errorQuda("Reconstruct %d and odd dimensionsize is not supported by link fattening code (yet)\n",
 		u.Reconstruct());
     }
 
-    computeOneLink(*fat, u, coeff[0]-6.0*coeff[5]);
+    computeOneLink(fat, u, coeff[0]-6.0*coeff[5]);
 
     // Check the coefficients. If all of the following are zero, return.
     if (fabs(coeff[2]) >= MIN_COEFF || fabs(coeff[3]) >= MIN_COEFF ||
 	fabs(coeff[4]) >= MIN_COEFF || fabs(coeff[5]) >= MIN_COEFF) {
 
       for (int nu = 0; nu < 4; nu++) {
-        computeStaple(*fat, *staple, u, u, nu, -1, -1, coeff[2], 1);
+        computeStaple(fat, staple, u, u, nu, -1, -1, coeff[2], 1);
 
-        if (coeff[5] != 0.0) computeStaple(*fat, *staple, *staple, u, nu, -1, -1, coeff[5], 0);
+        if (coeff[5] != 0.0) computeStaple(fat, staple, staple, u, nu, -1, -1, coeff[5], 0);
 
         for (int rho = 0; rho < 4; rho++) {
           if (rho != nu) {
 
-            computeStaple(*fat, *staple1, *staple, u, rho, nu, -1, coeff[3], 1);
+            computeStaple(fat, staple1, staple, u, rho, nu, -1, coeff[3], 1);
 
             if (fabs(coeff[4]) > MIN_COEFF) {
               for (int sig = 0; sig < 4; sig++) {
                 if (sig != nu && sig != rho) {
-                  computeStaple(*fat, *staple, *staple1, u, sig, nu, rho, coeff[4], 0);
+                  computeStaple(fat, staple, staple1, u, sig, nu, rho, coeff[4], 0);
                 }
               } //sig
             } // MIN_COEFF
@@ -214,16 +218,15 @@ namespace quda {
       } //nu
     }
 
-    delete staple;
-    delete staple1;
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else
-  void longKSLink(GaugeField *, const GaugeField&, const double *)
+  void longKSLink(GaugeField &, const GaugeField&, const double *)
   {
     errorQuda("Long-link computation not enabled");
   }
 
-  void fatKSLink(GaugeField *, const GaugeField&, const double *)
+  void fatKSLink(GaugeField &, const GaugeField&, const double *)
   {
     errorQuda("Fat-link computation not enabled");
   }
diff --git a/lib/staggered_two_link_quda.cu b/lib/staggered_two_link_quda.cu
index 8dce83c997..3afb950d82 100644
--- a/lib/staggered_two_link_quda.cu
+++ b/lib/staggered_two_link_quda.cu
@@ -53,10 +53,12 @@ namespace quda
 #if defined(GPU_STAGGERED_DIRAC) && defined(GPU_TWOLINK_GSMEAR)
   void computeTwoLink(GaugeField &newTwoLink, const GaugeField &link)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     checkNative(newTwoLink, link);
     checkLocation(newTwoLink, link);
     checkPrecision(newTwoLink, link);
     instantiate<ComputeTwoLink, ReconstructNone>(link, newTwoLink);//FIXME : enable link-12/8 reconstruction  
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else
   void computeTwoLink(GaugeField &, const GaugeField &)

From 52a1d1ca9ac79e47bb216a0cb32506f00fd7fc6d Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 22 May 2023 15:39:19 -0700
Subject: [PATCH 14/60] ColorSpinorField and CloverField now autoprofile any
 H2D and D2H transfers.  Further interface cleanup

---
 lib/clover_deriv_quda.cu           |   2 +
 lib/clover_field.cpp               |  12 +++
 lib/clover_invert.cu               |   2 +
 lib/clover_outer_product.cu        |   2 +
 lib/clover_quda.cu                 |   2 +
 lib/clover_sigma_outer_product.cu  |   2 +
 lib/color_spinor_field.cpp         |  13 +++
 lib/gauge_field.cpp                |  13 ++-
 lib/gauge_field_strength_tensor.cu |   2 +
 lib/gauge_phase.cu                 |   2 +
 lib/interface_quda.cpp             | 123 +++++++----------------------
 11 files changed, 72 insertions(+), 103 deletions(-)

diff --git a/lib/clover_deriv_quda.cu b/lib/clover_deriv_quda.cu
index 34f121de93..34ef0b993b 100644
--- a/lib/clover_deriv_quda.cu
+++ b/lib/clover_deriv_quda.cu
@@ -66,6 +66,7 @@ namespace quda {
 #if defined(GPU_CLOVER_DIRAC) && (QUDA_PRECISION & 8)
   void cloverDerivative(GaugeField &force, GaugeField &gauge, GaugeField &oprod, double coeff, QudaParity parity)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     assert(oprod.Geometry() == QUDA_TENSOR_GEOMETRY);
     assert(force.Geometry() == QUDA_VECTOR_GEOMETRY);
 
@@ -79,6 +80,7 @@ namespace quda {
     } else {
       errorQuda("Precision %d not supported", force.Precision());
     }
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else
   void cloverDerivative(GaugeField &, GaugeField &, GaugeField &, double, QudaParity)
diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp
index 46394c332b..cd8cc04ba2 100644
--- a/lib/clover_field.cpp
+++ b/lib/clover_field.cpp
@@ -184,6 +184,12 @@ namespace quda {
 
   void CloverField::copy(const CloverField &src, bool is_inverse)
   {
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
+      getProfile().TPSTART(QUDA_PROFILE_D2H);
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
+      getProfile().TPSTART(QUDA_PROFILE_H2D);
+    }
+
     // special case where we wish to make a copy of the inverse field when dynamic_inverse is enabled
     static bool dynamic_inverse_copy = false;
     if (is_inverse && clover::dynamic_inverse() && V(true) && !src.V(true) && !dynamic_inverse_copy) {
@@ -257,6 +263,12 @@ namespace quda {
         pool_device_free(packClover);
       }
     }
+
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
+      getProfile().TPSTOP(QUDA_PROFILE_D2H);
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
+      getProfile().TPSTOP(QUDA_PROFILE_H2D);
+    }
   }
 
   void CloverField::copy(const CloverField &src)
diff --git a/lib/clover_invert.cu b/lib/clover_invert.cu
index ac7f15fbfe..903ce4e76c 100644
--- a/lib/clover_invert.cu
+++ b/lib/clover_invert.cu
@@ -49,9 +49,11 @@ namespace quda {
 #ifdef GPU_CLOVER_DIRAC
   void cloverInvert(CloverField &clover, bool computeTraceLog)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (clover.Reconstruct()) errorQuda("Cannot store the inverse with a reconstruct field");
     if (clover.Precision() < QUDA_SINGLE_PRECISION) errorQuda("Cannot use fixed-point precision here");
     instantiate<CloverInvert>(clover, computeTraceLog);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else
   void cloverInvert(CloverField &, bool)
diff --git a/lib/clover_outer_product.cu b/lib/clover_outer_product.cu
index 93096d15e7..d579476714 100644
--- a/lib/clover_outer_product.cu
+++ b/lib/clover_outer_product.cu
@@ -136,6 +136,7 @@ namespace quda {
   void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector<ColorSpinorField *> &x,
                           std::vector<ColorSpinorField *> &p, std::vector<double> &coeff)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     checkNative(*x[0], *p[0], force, U);
     checkPrecision(*x[0], *p[0], force, U);
 
@@ -159,6 +160,7 @@ namespace quda {
         instantiate<CloverForce, ReconstructNo12>(U, force, inA, inB, inC, inD, parity, coeff[i]);
       }
     }
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else // GPU_CLOVER_DIRAC not defined
   void computeCloverForce(GaugeField &, const GaugeField &, std::vector<ColorSpinorField *> &,
diff --git a/lib/clover_quda.cu b/lib/clover_quda.cu
index 853fdbe156..c000310f6b 100644
--- a/lib/clover_quda.cu
+++ b/lib/clover_quda.cu
@@ -37,9 +37,11 @@ namespace quda {
 #ifdef GPU_CLOVER_DIRAC
   void computeClover(CloverField &clover, const GaugeField& f, double coeff)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (clover.Precision() < QUDA_SINGLE_PRECISION) errorQuda("Cannot use fixed-point precision here");
     clover.Diagonal(0.5); // 0.5 comes from scaling used on native fields
     instantiate<ComputeClover>(clover, f, coeff);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else
   void computeClover(CloverField &, const GaugeField &, double)
diff --git a/lib/clover_sigma_outer_product.cu b/lib/clover_sigma_outer_product.cu
index 1c34a7ff33..370ada813f 100644
--- a/lib/clover_sigma_outer_product.cu
+++ b/lib/clover_sigma_outer_product.cu
@@ -61,6 +61,7 @@ namespace quda {
   void computeCloverSigmaOprod(GaugeField& oprod, std::vector<ColorSpinorField*> &x,
 			       std::vector<ColorSpinorField*> &p, std::vector<std::vector<double> > &coeff)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (x.size() > MAX_NVECTOR) {
       // divide and conquer
       std::vector<ColorSpinorField*> x0(x.begin(), x.begin()+x.size()/2);
@@ -83,6 +84,7 @@ namespace quda {
     }
 
     instantiate<CloverSigmaOprod>(oprod, x, p, coeff);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else // GPU_CLOVER_DIRAC not defined
   void computeCloverSigmaOprod(GaugeField &, std::vector<ColorSpinorField*> &,
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 96df00ba55..9649ce9a7f 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -442,6 +442,13 @@ namespace quda
   void ColorSpinorField::copy(const ColorSpinorField &src)
   {
     test_compatible_weak(*this, src);
+
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
+      getProfile().TPSTART(QUDA_PROFILE_D2H);
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
+      getProfile().TPSTART(QUDA_PROFILE_H2D);
+    }
+
     if (Location() == src.Location()) { // H2H and D2D
 
       copyGenericColorSpinor(*this, src, Location());
@@ -525,6 +532,12 @@ namespace quda
 
       qudaDeviceSynchronize(); // need to sync before data can be used on CPU
     }
+
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
+      getProfile().TPSTOP(QUDA_PROFILE_D2H);
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
+      getProfile().TPSTOP(QUDA_PROFILE_H2D);
+    }
   }
 
   // Fills the param with the contents of this field
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 0003663e25..cb4319857e 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -938,15 +938,14 @@ namespace quda {
 
   void GaugeField::copy(const GaugeField &src)
   {
-    auto &profile = getProfile();
+    if (this == &src) return;
+
     if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
-      profile.TPSTART(QUDA_PROFILE_D2H);
+      getProfile().TPSTART(QUDA_PROFILE_D2H);
     } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
-      profile.TPSTART(QUDA_PROFILE_H2D);
+      getProfile().TPSTART(QUDA_PROFILE_H2D);
     }
 
-    if (this == &src) return;
-
     checkField(src);
 
     if (link_type == QUDA_ASQTAD_FAT_LINKS) {
@@ -1112,9 +1111,9 @@ namespace quda {
     staggeredPhaseType = src.StaggeredPhase();
 
     if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
-      profile.TPSTOP(QUDA_PROFILE_D2H);
+      getProfile().TPSTOP(QUDA_PROFILE_D2H);
     } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
-      profile.TPSTOP(QUDA_PROFILE_H2D);
+      getProfile().TPSTOP(QUDA_PROFILE_H2D);
     }
   }
 
diff --git a/lib/gauge_field_strength_tensor.cu b/lib/gauge_field_strength_tensor.cu
index d0ec026881..dc6b763b54 100644
--- a/lib/gauge_field_strength_tensor.cu
+++ b/lib/gauge_field_strength_tensor.cu
@@ -34,8 +34,10 @@ namespace quda
 
   void computeFmunu(GaugeField &f, const GaugeField &u)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     checkPrecision(f, u);
     instantiate2<Fmunu,ReconstructWilson>(u, f); // u must be first here for correct template instantiation
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/gauge_phase.cu b/lib/gauge_phase.cu
index ff959ef0b3..929c5eadb5 100644
--- a/lib/gauge_phase.cu
+++ b/lib/gauge_phase.cu
@@ -45,9 +45,11 @@ namespace quda {
 
   void applyGaugePhase(GaugeField &u)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<GaugePhase_, ReconstructNone>(u);
     // ensure that ghosts are updated if needed
     if (u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) u.exchangeGhost();
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index e39bf48b5c..aa25b06621 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -558,13 +558,12 @@ void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeFiel
 
 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 {
-  profileGauge.TPSTART(QUDA_PROFILE_TOTAL);
+  pushProfile(profileGauge);
+  checkGaugeParam(param);
 
   if (!initialized) errorQuda("QUDA not initialized");
   if (getVerbosity() == QUDA_DEBUG_VERBOSE) printQudaGaugeParam(param);
 
-  checkGaugeParam(param);
-
   profileGauge.TPSTART(QUDA_PROFILE_INIT);
   // Set the specific input parameters and create the cpu gauge field
   GaugeFieldParam gauge_param(*param, h_gauge);
@@ -631,9 +630,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
     profileGauge.TPSTOP(QUDA_PROFILE_INIT);
   } else {
     profileGauge.TPSTOP(QUDA_PROFILE_INIT);
-    profileGauge.TPSTART(QUDA_PROFILE_H2D);
     precise->copy(*in);
-    profileGauge.TPSTOP(QUDA_PROFILE_H2D);
   }
 
   // for gaugeSmeared we are interested only in the precise version
@@ -645,7 +642,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
     delete in;
     profileGauge.TPSTOP(QUDA_PROFILE_FREE);
 
-    profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
+    popProfile();
     return;
   }
 
@@ -766,12 +763,12 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
     extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGauge, false, recon);
   }
 
-  profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
 }
 
 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 {
-  profileGauge.TPSTART(QUDA_PROFILE_TOTAL);
+  pushProfile(profileGauge);
 
   if (param->location != QUDA_CPU_FIELD_LOCATION) errorQuda("Non-cpu output location not yet supported");
 
@@ -798,13 +795,11 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
   default: errorQuda("Invalid gauge type");
   }
 
-  profileGauge.TPSTART(QUDA_PROFILE_D2H);
   cpuGauge.copy(*cudaGauge);
-  profileGauge.TPSTOP(QUDA_PROFILE_D2H);
 
   if (param->type == QUDA_SMEARED_LINKS) { delete cudaGauge; }
 
-  profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
 }
 
 void loadSloppyCloverQuda(const QudaPrecision prec[]);
@@ -812,8 +807,8 @@ void freeSloppyCloverQuda();
 
 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
 {
+  pushProfile(profileClover);
   pushVerbosity(inv_param->verbosity);
-  profileClover.TPSTART(QUDA_PROFILE_TOTAL);
   profileClover.TPSTART(QUDA_PROFILE_INIT);
 
   checkCloverParam(inv_param);
@@ -890,11 +885,9 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
     profileClover.TPSTOP(QUDA_PROFILE_INIT);
 
     if (!device_calc) {
-      profileClover.TPSTART(QUDA_PROFILE_H2D);
       cloverPrecise->copy(*in, false);
       if ((h_clovinv && !inv_param->compute_clover_inverse) && !clover::dynamic_inverse())
         cloverPrecise->copy(*in, true);
-      profileClover.TPSTOP(QUDA_PROFILE_H2D);
     } else {
       profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
       createCloverQuda(inv_param);
@@ -902,13 +895,11 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
     }
 
     if ((!h_clovinv || inv_param->compute_clover_inverse) && !clover::dynamic_inverse()) {
-      profileClover.TPSTART(QUDA_PROFILE_COMPUTE);
       cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog);
       if (inv_param->compute_clover_trlog) {
         inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
         inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
       }
-      profileClover.TPSTOP(QUDA_PROFILE_COMPUTE);
     }
   } else {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached clover field\n");
@@ -918,16 +909,12 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
   if (inv_param->return_clover || inv_param->return_clover_inverse) {
     if (inv_param->return_clover) {
       if (!h_clover) errorQuda("Requested clover field return but no clover host pointer set");
-      profileClover.TPSTART(QUDA_PROFILE_D2H);
       in->copy(*cloverPrecise, false);
-      profileClover.TPSTOP(QUDA_PROFILE_D2H);
     }
 
     if (inv_param->return_clover_inverse) {
       if (!h_clovinv) errorQuda("Requested clover field inverse return but no clover host pointer set");
-      profileClover.TPSTART(QUDA_PROFILE_D2H);
       in->copy(*cloverPrecise, true);
-      profileClover.TPSTOP(QUDA_PROFILE_D2H);
     }
   }
 
@@ -950,8 +937,8 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
                           inv_param->clover_cuda_prec_refinement_sloppy, inv_param->clover_cuda_prec_eigensolver};
   loadSloppyCloverQuda(prec);
 
-  profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
   popVerbosity();
+  popProfile();
 }
 
 void freeSloppyCloverQuda();
@@ -1819,7 +1806,7 @@ namespace quda {
 
 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
 {
-  profileDslash.TPSTART(QUDA_PROFILE_TOTAL);
+  pushProfile(profileDslash);
   profileDslash.TPSTART(QUDA_PROFILE_INIT);
 
   const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
@@ -1850,9 +1837,7 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
 
   profileDslash.TPSTOP(QUDA_PROFILE_INIT);
 
-  profileDslash.TPSTART(QUDA_PROFILE_H2D);
   in = in_h;
-  profileDslash.TPSTOP(QUDA_PROFILE_H2D);
 
   profileDslash.TPSTART(QUDA_PROFILE_COMPUTE);
 
@@ -1886,19 +1871,16 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
   }
   profileDslash.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-  profileDslash.TPSTART(QUDA_PROFILE_D2H);
   out_h = out;
-  profileDslash.TPSTOP(QUDA_PROFILE_D2H);
 
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
 
   profileDslash.TPSTART(QUDA_PROFILE_FREE);
   delete dirac; // clean up
-
   profileDslash.TPSTOP(QUDA_PROFILE_FREE);
 
   popVerbosity();
-  profileDslash.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
 }
 
 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
@@ -2201,8 +2183,7 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
 void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param)
 {
   if (!initialized) errorQuda("QUDA not initialized");
-
-  profileEigensolve.TPSTART(QUDA_PROFILE_TOTAL);
+  pushProfile(profileEigensolve);
   profileEigensolve.TPSTART(QUDA_PROFILE_INIT);
 
   // Transfer the inv param structure contained in eig_param.
@@ -2357,9 +2338,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
   // host side gamma basis.
   for (int i = 0; i < eig_param->n_conv; i++) { memcpy(host_evals + i, &evals[i], sizeof(Complex)); }
   if (!(eig_param->arpack_check)) {
-    profileEigensolve.TPSTART(QUDA_PROFILE_D2H);
     for (int i = 0; i < n_eig; i++) host_evecs_[i] = kSpace[i];
-    profileEigensolve.TPSTOP(QUDA_PROFILE_D2H);
   }
 
   profileEigensolve.TPSTART(QUDA_PROFILE_FREE);
@@ -2373,7 +2352,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();
 
-  profileEigensolve.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
 }
 
 multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
@@ -2696,10 +2675,9 @@ void destroyDeflationQuda(void *df) {
 
 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
 {
+  pushProfile(profileInvert);
   profilerStart(__func__);
 
-  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
-
   if (!initialized) errorQuda("QUDA not initialized");
 
   pushVerbosity(param->verbosity);
@@ -2743,8 +2721,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   Dirac &diracPre = *dPre;
   Dirac &diracEig = *dEig;
 
-  profileInvert.TPSTART(QUDA_PROFILE_H2D);
-
   ColorSpinorField *in = nullptr;
   ColorSpinorField *out = nullptr;
 
@@ -2805,7 +2781,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
     diracPre.prefetch(QUDA_CUDA_FIELD_LOCATION);
   }
 
-  profileInvert.TPSTOP(QUDA_PROFILE_H2D);
   profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE);
 
   double nb = blas::norm2(b);
@@ -3028,9 +3003,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
   if (!param->make_resident_solution) {
-    profileInvert.TPSTART(QUDA_PROFILE_D2H);
     h_x = x;
-    profileInvert.TPSTOP(QUDA_PROFILE_D2H);
   }
 
   profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);
@@ -3064,9 +3037,8 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();
 
-  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
-
   profilerStop(__func__);
+  popProfile();
 }
 
 void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_param, void *milc_fatlinks,
@@ -3473,9 +3445,9 @@ void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param
  */
 void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
 {
+  pushProfile(profileMulti);
   profilerStart(__func__);
 
-  profileMulti.TPSTART(QUDA_PROFILE_TOTAL);
   profileMulti.TPSTART(QUDA_PROFILE_INIT);
 
   if (!initialized) errorQuda("QUDA not initialized");
@@ -3582,7 +3554,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
   }
 
   profileMulti.TPSTOP(QUDA_PROFILE_INIT);
-  profileMulti.TPSTART(QUDA_PROFILE_H2D);
   // Now I need a colorSpinorParam for the device
   ColorSpinorParam cudaParam(cpuParam, *param, QUDA_CUDA_FIELD_LOCATION);
   // This setting will download a host vector
@@ -3590,8 +3561,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
   cudaParam.field = &h_b;
   ColorSpinorField b(cudaParam); // Creates b and downloads h_b to it
 
-  profileMulti.TPSTOP(QUDA_PROFILE_H2D);
-
   profileMulti.TPSTART(QUDA_PROFILE_INIT);
   // Create the solution fields filled with zero
   cudaParam.create = QUDA_ZERO_FIELD_CREATE;
@@ -3781,8 +3750,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
   // restore shifts
   for (int i = 0; i < param->num_offset; i++) param->offset[i] = unscaled_shifts[i];
 
-  profileMulti.TPSTART(QUDA_PROFILE_D2H);
-
   if (param->compute_action) {
     Complex action(0);
     for (int i = 0; i < param->num_offset; i++) action += param->residue[i] * blas::cDotProduct(b, x[i]);
@@ -3799,7 +3766,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
 
     if (!param->make_resident_solution) *h_x[i] = x[i];
   }
-  profileMulti.TPSTOP(QUDA_PROFILE_D2H);
 
   profileMulti.TPSTART(QUDA_PROFILE_EPILOGUE);
 
@@ -3819,9 +3785,8 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();
 
-  profileMulti.TPSTOP(QUDA_PROFILE_TOTAL);
-
   profilerStop(__func__);
+  popProfile();
 }
 
 void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
@@ -4132,7 +4097,7 @@ void momResidentQuda(void *mom, QudaGaugeParam *param)
 
 void createCloverQuda(QudaInvertParam* invertParam)
 {
-  profileClover.TPSTART(QUDA_PROFILE_TOTAL);
+  pushProfile(profileClover);
   if (!cloverPrecise) errorQuda("Clover field not allocated");
 
   QudaReconstructType recon = (gaugePrecise->Reconstruct() == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_12 : gaugePrecise->Reconstruct();
@@ -4141,8 +4106,6 @@ void createCloverQuda(QudaInvertParam* invertParam)
   for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d));
   GaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon);
 
-  profileClover.TPSTART(QUDA_PROFILE_INIT);
-
   GaugeField *ex = gauge;
   if (gauge->Precision() < cloverPrecise->Precision()) {
     GaugeFieldParam param(*gauge);
@@ -4159,17 +4122,14 @@ void createCloverQuda(QudaInvertParam* invertParam)
   tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   GaugeField Fmunu(tensorParam);
-  profileClover.TPSTOP(QUDA_PROFILE_INIT);
-  profileClover.TPSTART(QUDA_PROFILE_COMPUTE);
   computeFmunu(Fmunu, *ex);
   computeClover(*cloverPrecise, Fmunu, invertParam->clover_coeff);
-  profileClover.TPSTOP(QUDA_PROFILE_COMPUTE);
-  profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
 
   if (ex != gauge) delete ex;
 
   // FIXME always preserve the extended gauge
   extendedGaugeResident = gauge;
+  popProfile();
 }
 
 void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
@@ -4445,10 +4405,7 @@ void computeHISQForceQuda(void* const milc_momentum,
         ColorSpinorField cpuQuark(qParam); // create host quark field
         profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
-        profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
         cudaQuark = cpuQuark;
-        profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
-
         computeStaggeredOprod(oprod, cudaQuark, coeff[i], 3);
       }
     }
@@ -4467,10 +4424,7 @@ void computeHISQForceQuda(void* const milc_momentum,
         ColorSpinorField cpuQuark(qParam); // create host quark field
         profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
-        profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
         cudaQuark = cpuQuark;
-        profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
-
         computeStaggeredOprod(oprod, cudaQuark, coeff[i + num_terms], 3);
       }
     }
@@ -4655,7 +4609,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
                             QudaInvertParam *inv_param)
 {
   using namespace quda;
-  profileCloverForce.TPSTART(QUDA_PROFILE_TOTAL);
+  pushProfile(profileCloverForce);
   profileCloverForce.TPSTART(QUDA_PROFILE_INIT);
 
   checkGaugeParam(gauge_param);
@@ -4731,7 +4685,6 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   GaugeField oprod(fParam);
 
   profileCloverForce.TPSTOP(QUDA_PROFILE_INIT);
-  profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);
 
   std::vector<double> force_coeff(nvector);
   // loop over different quark fields
@@ -4745,17 +4698,13 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
       qParam.x[0] /= 2;
 
       // Wrap the even-parity MILC quark field
-      profileCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE);
       profileCloverForce.TPSTART(QUDA_PROFILE_INIT);
       qParam.v = h_x[i];
       ColorSpinorField cpuQuarkX(qParam); // create host quark field
       profileCloverForce.TPSTOP(QUDA_PROFILE_INIT);
 
-      profileCloverForce.TPSTART(QUDA_PROFILE_H2D);
       x.Even() = cpuQuarkX;
-      profileCloverForce.TPSTOP(QUDA_PROFILE_H2D);
 
-      profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);
       gamma5(x.Even(), x.Even());
     } else {
       x.Even() = solutionResident[i];
@@ -4798,20 +4747,15 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
 
   GaugeField *oprodEx = createExtendedGauge(oprod, R, profileCloverForce);
 
-  profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);
-
   cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_ODD_PARITY);
   cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_EVEN_PARITY);
 
   if (u != &gaugeEx) delete u;
 
   updateMomentum(cudaMom, -1.0, cudaForce, "clover");
-  profileCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE);
 
   // copy the outer product field back to the host
-  profileCloverForce.TPSTART(QUDA_PROFILE_D2H);
   cpuMom.copy(cudaMom);
-  profileCloverForce.TPSTOP(QUDA_PROFILE_D2H);
 
   profileCloverForce.TPSTART(QUDA_PROFILE_FREE);
 
@@ -4824,9 +4768,8 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   if (inv_param->use_resident_solution) solutionResident.clear();
 #endif
   delete dirac;
-  profileCloverForce.TPSTOP(QUDA_PROFILE_FREE);
 
-  profileCloverForce.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
 }
 
 void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, int exact, QudaGaugeParam* param)
@@ -4941,7 +4884,6 @@ void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
   gParam.reconstruct = param->reconstruct;
   GaugeField cudaGauge = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
 
-  profilePhase.TPSTART(QUDA_PROFILE_COMPUTE);
   *num_failures_h = 0;
 
   // apply / remove phase as appropriate
@@ -4950,8 +4892,6 @@ void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
   else
     cudaGauge.removeStaggeredPhase();
 
-  profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE);
-
   if (param->return_result_gauge) cpuGauge.copy(cudaGauge);
 
   if (param->make_resident_gauge) {
@@ -5023,21 +4963,19 @@ void gaussMomQuda(unsigned long long seed, double sigma)
  */
 void plaqQuda(double plaq[3])
 {
-  profilePlaq.TPSTART(QUDA_PROFILE_TOTAL);
+  pushProfile(profilePlaq);
 
   if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field");
 
   GaugeField *data = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq);
   extendedGaugeResident = data;
 
-  profilePlaq.TPSTART(QUDA_PROFILE_COMPUTE);
   double3 plaq3 = quda::plaquette(*data);
   plaq[0] = plaq3.x;
   plaq[1] = plaq3.y;
   plaq[2] = plaq3.z;
-  profilePlaq.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-  profilePlaq.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
 }
 
 /*
@@ -5165,12 +5103,11 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param,
 
 void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_param)
 {
-  if(smear_param->n_steps == 0) return;
+  if (smear_param->n_steps == 0) return;
+  pushProfile(profileGaussianSmear);
+  profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT);
   
   QudaInvertParam *inv_param = smear_param->inv_param;
-  
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_TOTAL);
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT);
 
   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
     
@@ -5258,9 +5195,7 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
 
   // Copy host data to device
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_H2D);
   in = in_h;
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_H2D);
 
   const double ftmp    = -(smear_param->width*smear_param->width)/(4.0*smear_param->n_steps*4.0);  /* Extra 4 to compensate for stride 2 */
   // Scale up the source to prevent underflow
@@ -5286,23 +5221,21 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   profileGaussianSmear.TPSTOP(QUDA_PROFILE_COMPUTE);
 
   // Copy device data to host.
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_D2H);
   in_h = out;
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_D2H);
 
   profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE);
 
   if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Finished 2link Gaussian smearing.\n");
 
   delete d;
+  profileGaussianSmear.TPSTOP(QUDA_PROFILE_FREE);
 
   smear_param->gflops = dirac.Flops();
 
   if (smear_param->delete_2link != 0) { freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); }
 
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_FREE);
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_TOTAL);
   saveTuneCache();
+  popProfile();
 }
 
 
@@ -5515,10 +5448,8 @@ void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const Quda
   void *d_result = pool_device_malloc(data_bytes);
   profileContract.TPSTOP(QUDA_PROFILE_INIT);
 
-  profileContract.TPSTART(QUDA_PROFILE_H2D);
   x[0] = h_x;
   y[0] = h_y;
-  profileContract.TPSTOP(QUDA_PROFILE_H2D);
 
   contractQuda(x[0], y[0], d_result, cType);
 

From 442a4601d0816e2cc53b1832a41a42327e9112d6 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 25 May 2023 17:18:05 -0700
Subject: [PATCH 15/60] Add qudaMemsetAsync and qudaMemcpy overloads for
 quda_ptr

---
 include/quda_api.h            | 20 ++++++++++++++++++++
 lib/targets/cuda/quda_api.cpp | 18 ++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/include/quda_api.h b/include/quda_api.h
index ea475c43f6..d2abba24d1 100644
--- a/include/quda_api.h
+++ b/include/quda_api.h
@@ -43,6 +43,16 @@ namespace quda
   void qudaMemcpy_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const char *func, const char *file,
                    const char *line);
 
+  /**
+     @brief Wrapper around cudaMemcpy or driver API equivalent
+     @param[out] dst Destination pointer
+     @param[in] src Source pointer
+     @param[in] count Size of transfer
+     @param[in] kind Type of memory copy
+  */
+  void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func, const char *file,
+                   const char *line);
+
   /**
      @brief Wrapper around cudaMemcpyAsync or driver API equivalent
      @param[out] dst Destination pointer
@@ -101,6 +111,16 @@ namespace quda
   void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
                         const char *file, const char *line);
 
+  /**
+     @brief Wrapper around cudaMemsetAsync or driver API equivalent
+     @param[out] ptr Starting address pointer
+     @param[in] value Value to set for each byte of specified memory
+     @param[in] count Size in bytes to set
+     @param[in] stream Stream to issue memset
+   */
+  void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
+                        const char *file, const char *line);
+
   /**
      @brief Wrapper around cudaMemsetAsync or driver API equivalent
      @param[out] ptr Starting address pointer
diff --git a/lib/targets/cuda/quda_api.cpp b/lib/targets/cuda/quda_api.cpp
index 3e4ced01bc..8c57fe9079 100644
--- a/lib/targets/cuda/quda_api.cpp
+++ b/lib/targets/cuda/quda_api.cpp
@@ -325,6 +325,13 @@ namespace quda
     QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line);
   }
 
+  void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func, const char *file,
+                   const char *line)
+  {
+    if (count == 0) return;
+    QudaMem copy(dst.data(), src.data(), count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line);
+  }
+
   void qudaMemcpyAsync_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const qudaStream_t &stream,
                         const char *func, const char *file, const char *line)
   {
@@ -389,6 +396,17 @@ namespace quda
     QudaMem copy(ptr, value, count, stream, true, func, file, line);
   }
 
+  void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream,
+                        const char *func, const char *file, const char *line)
+  {
+    if (count == 0) return;
+    if (ptr.is_device()) {
+      QudaMem set(ptr.data(), value, count, stream, true, func, file, line);
+    } else {
+      memset(ptr.data(), value, count);
+    }
+  }
+
   void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func,
                      const char *file, const char *line)
   {

From 44586cde7cfef274376ff8808a71d1a56138d2e2 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 25 May 2023 23:11:10 -0700
Subject: [PATCH 16/60] Use quda_ptr for both color_spinor_field.cpp and
 clover_field.cpp allocations.  Some cleanup

---
 include/clover_field.h       |  18 +++---
 include/color_spinor_field.h |  29 ++-------
 include/lattice_field.h      |   4 +-
 lib/clover_field.cpp         | 119 +++++++++++------------------------
 lib/color_spinor_field.cpp   | 117 ++++++++++------------------------
 lib/dirac_clover.cpp         |   2 +-
 lib/gauge_field.cpp          |  44 ++++++-------
 lib/lattice_field.cpp        |  24 +------
 8 files changed, 108 insertions(+), 249 deletions(-)

diff --git a/include/clover_field.h b/include/clover_field.h
index 402ee2936f..579e7eeb1e 100644
--- a/include/clover_field.h
+++ b/include/clover_field.h
@@ -178,9 +178,10 @@ namespace quda {
     int nColor = 0;
     int nSpin = 0;
 
-    void *clover = nullptr;
-    void *cloverInv = nullptr;
+    quda_ptr clover = {};
+    quda_ptr cloverInv = {};
 
+    bool inverse = false;
     double diagonal = 0.0;
     array<double, 2> max = {};
 
@@ -213,12 +214,15 @@ namespace quda {
 
   public:
     CloverField(const CloverFieldParam &param);
-    virtual ~CloverField();
 
     static CloverField *Create(const CloverFieldParam &param);
 
-    void* V(bool inverse=false) { return inverse ? cloverInv : clover; }
-    const void* V(bool inverse=false) const { return inverse ? cloverInv : clover; }
+    void *V(bool inverse = false) const { return inverse ? cloverInv.data() : clover.data(); }
+
+    /**
+       @return whether the inverse has been explicitly allocated
+     */
+    bool Inverse() const { return inverse; }
 
     /**
        @return diagonal scaling factor applied to the identity
@@ -406,10 +410,6 @@ namespace quda {
     */
     void copy_from_buffer(void *buffer);
 
-    friend class DiracClover;
-    friend class DiracCloverPC;
-    friend class DiracTwistedClover;
-    friend class DiracTwistedCloverPC;
   };
 
   /**
diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index 1bfd1be413..9b88534e58 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -329,8 +329,7 @@ namespace quda
 
     size_t length = 0; // length including pads, but not norm zone
 
-    void *v = nullptr;      // the field elements
-    void *v_h = nullptr;    // the field elements
+    quda_ptr v = {};        // the field elements
     size_t norm_offset = 0; /** offset to the norm (if applicable) */
 
     // multi-GPU parameters
@@ -463,37 +462,19 @@ namespace quda
     /**
        @brief Return pointer to the field allocation
     */
-    void *V()
+    void *V() const
     {
       if (ghost_only) errorQuda("Not defined for ghost-only field");
-      return v;
-    }
-
-    /**
-       @brief Return pointer to the field allocation
-    */
-    const void *V() const
-    {
-      if (ghost_only) errorQuda("Not defined for ghost-only field");
-      return v;
-    }
-
-    /**
-       @brief Return pointer to the norm base pointer in the field allocation
-    */
-    void *Norm()
-    {
-      if (ghost_only) errorQuda("Not defined for ghost-only field");
-      return static_cast<char *>(v) + norm_offset;
+      return v.data();
     }
 
     /**
        @brief Return pointer to the norm base pointer in the field allocation
     */
-    const void *Norm() const
+    void *Norm() const
     {
       if (ghost_only) errorQuda("Not defined for ghost-only field");
-      return static_cast<char *>(v) + norm_offset;
+      return static_cast<char *>(v.data()) + norm_offset;
     }
 
     size_t NormOffset() const { return norm_offset; }
diff --git a/include/lattice_field.h b/include/lattice_field.h
index 38653350cc..e7c43b7d69 100644
--- a/include/lattice_field.h
+++ b/include/lattice_field.h
@@ -460,9 +460,7 @@ namespace quda {
       }
     }
 
-    mutable char *backup_h = nullptr;
-    mutable char *backup_norm_h = nullptr;
-    mutable bool backed_up = false;
+    mutable std::vector<quda_ptr> backup_h = {};
 
   public:
     /**
diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp
index cd8cc04ba2..78076c0c9a 100644
--- a/lib/clover_field.cpp
+++ b/lib/clover_field.cpp
@@ -15,9 +15,7 @@ namespace quda {
   CloverFieldParam::CloverFieldParam(const CloverField &a) :
     LatticeFieldParam(a),
     reconstruct(clover::reconstruct()),
-    inverse(a.V(true)),
-    clover(nullptr),
-    cloverInv(nullptr),
+    inverse(a.Inverse()),
     csw(a.Csw()),
     coeff(a.Coeff()),
     twist_flavor(a.TwistFlavor()),
@@ -36,21 +34,16 @@ namespace quda {
   CloverField::CloverField(const CloverFieldParam &param) :
     LatticeField(param),
     reconstruct(param.reconstruct),
-    bytes(0),
     nColor(3),
     nSpin(4),
-    clover(nullptr),
-    cloverInv(nullptr),
-    diagonal(0.0),
-    max {0, 0},
+    inverse(param.inverse),
     csw(param.csw),
     coeff(param.coeff),
     twist_flavor(param.twist_flavor),
     mu2(param.mu2),
     rho(param.rho),
     order(param.order),
-    create(param.create),
-    trlog {0, 0}
+    create(param.create)
   {
     if (siteSubset != QUDA_FULL_SITE_SUBSET) errorQuda("Unexpected siteSubset %d", siteSubset);
     if (nDim != 4) errorQuda("Number of dimensions must be 4, not %d", nDim);
@@ -79,53 +72,26 @@ namespace quda {
 
     if (bytes) {
       if (create != QUDA_REFERENCE_FIELD_CREATE) {
-        if (location == QUDA_CUDA_FIELD_LOCATION) {
-          clover = pool_device_malloc(bytes);
-        } else {
-          clover = safe_malloc(bytes);
-        }
-
+        clover = std::move(quda_ptr(mem_type, bytes));
       } else {
-        clover = param.clover;
+        clover = std::move(quda_ptr(param.clover, mem_type));
       }
 
       total_bytes += bytes;
 
-      if (param.inverse) {
+      if (inverse) {
         if (create != QUDA_REFERENCE_FIELD_CREATE) {
-          if (location == QUDA_CUDA_FIELD_LOCATION) {
-            cloverInv = pool_device_malloc(bytes);
-          } else {
-            cloverInv = safe_malloc(bytes);
-          }
+          cloverInv = std::move(quda_ptr(mem_type, bytes));
         } else {
-          cloverInv = param.cloverInv;
+          cloverInv = std::move(quda_ptr(param.cloverInv, mem_type));
         }
 
         total_bytes += bytes;
       }
 
       if (create == QUDA_ZERO_FIELD_CREATE) {
-        if (location == QUDA_CUDA_FIELD_LOCATION) {
-          qudaMemset(clover, '\0', bytes);
-          if (param.inverse) qudaMemset(cloverInv, '\0', bytes);
-        } else {
-          memset(clover, '\0', bytes);
-          if (param.inverse) memset(cloverInv, '\0', bytes);
-        }
-      }
-    }
-  }
-
-  CloverField::~CloverField()
-  {
-    if (create != QUDA_REFERENCE_FIELD_CREATE) {
-      if (location == QUDA_CUDA_FIELD_LOCATION) {
-        if (clover) pool_device_free(clover);
-        if (cloverInv) pool_device_free(cloverInv);
-      } else {
-        if (clover) host_free(clover);
-        if (cloverInv) host_free(cloverInv);
+        qudaMemset(clover, '\0', bytes);
+        if (inverse) qudaMemset(cloverInv, '\0', bytes);
       }
     }
   }
@@ -141,38 +107,31 @@ namespace quda {
 
   void CloverField::backup(bool which) const
   {
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemcpy(backup_h + which * bytes, V(which), bytes, qudaMemcpyDeviceToHost);
-    } else {
-      memcpy(backup_h + which * bytes, V(which), bytes);
-    }
+    qudaMemcpy(backup_h[which], which ? cloverInv : clover, bytes, qudaMemcpyDefault);
   }
 
   void CloverField::backup() const
   {
-    if (backup_h) errorQuda("Already allocated host backup");
-    backup_h = static_cast<char *>(safe_malloc(2 * bytes));
+    if (backup_h.size()) errorQuda("Already allocated host backup");
+    backup_h.resize(2);
+    for (auto &b : backup_h) b = std::move(quda_ptr(QUDA_MEMORY_HOST, bytes));
 
-    if (V(false)) backup(false);
-    if (V(true)) backup(true);
+    backup(false);
+    if (inverse) backup(true);
   }
 
   void CloverField::restore(bool which) const
   {
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemcpy((void *)V(which), backup_h + which * bytes, bytes, qudaMemcpyHostToDevice);
-    } else {
-      memcpy((void *)V(which), backup_h + which * bytes, bytes);
-    }
+    qudaMemcpy(which ? cloverInv : clover, backup_h[which], bytes, qudaMemcpyDefault);
   }
 
   void CloverField::restore() const
   {
-    if (V(false)) restore(false);
-    if (V(true)) restore(true);
+    if (!backup_h.size()) errorQuda("Cannot restore since not backed up");
+    restore(false);
+    if (inverse) restore(true);
 
-    host_free(backup_h);
-    backup_h = nullptr;
+    backup_h.resize(0);
   }
 
   CloverField *CloverField::Create(const CloverFieldParam &param) { return new CloverField(param); }
@@ -192,7 +151,7 @@ namespace quda {
 
     // special case where we wish to make a copy of the inverse field when dynamic_inverse is enabled
     static bool dynamic_inverse_copy = false;
-    if (is_inverse && clover::dynamic_inverse() && V(true) && !src.V(true) && !dynamic_inverse_copy) {
+    if (is_inverse && clover::dynamic_inverse() && inverse && !src.inverse && !dynamic_inverse_copy) {
       dynamic_inverse_copy = true;
       // create a copy of the clover field that we will invert in place and use as the source
       CloverFieldParam param(src);
@@ -207,8 +166,8 @@ namespace quda {
     }
 
     checkField(src);
-    if (!V(is_inverse)) errorQuda("Destination field's is_inverse=%d component does not exist", is_inverse);
-    if (!src.V(is_inverse) && !dynamic_inverse_copy)
+    if (is_inverse && !inverse) errorQuda("Destination field's is_inverse=%d component does not exist", is_inverse);
+    if (is_inverse && !src.Inverse() && !dynamic_inverse_copy)
       errorQuda("Source field's is_inverse=%d component does not exist", is_inverse);
 
     auto src_v = dynamic_inverse_copy ? src.V(false) : src.V(is_inverse);
@@ -280,26 +239,22 @@ namespace quda {
   void CloverField::copy_to_buffer(void *buffer) const
   {
     size_t buffer_offset = 0;
-    if (V(false)) { // direct
-      qudaMemcpy(buffer, clover, bytes, qudaMemcpyDefault);
-      buffer_offset += bytes;
-    }
+    qudaMemcpy(buffer, clover.data(), bytes, qudaMemcpyDefault);
+    buffer_offset += bytes;
 
-    if (V(true)) { // inverse
-      qudaMemcpy(static_cast<char *>(buffer) + buffer_offset, cloverInv, bytes, qudaMemcpyDefault);
+    if (inverse) { // inverse
+      qudaMemcpy(static_cast<char *>(buffer) + buffer_offset, cloverInv.data(), bytes, qudaMemcpyDefault);
     }
   }
 
   void CloverField::copy_from_buffer(void *buffer)
   {
     size_t buffer_offset = 0;
-    if (V(false)) { // direct
-      qudaMemcpy(clover, static_cast<char *>(buffer), bytes, qudaMemcpyDefault);
-      buffer_offset += bytes;
-    }
+    qudaMemcpy(clover.data(), static_cast<char *>(buffer), bytes, qudaMemcpyDefault);
+    buffer_offset += bytes;
 
-    if (V(true)) { // inverse
-      qudaMemcpy(cloverInv, static_cast<char *>(buffer) + buffer_offset, bytes, qudaMemcpyDefault);
+    if (inverse) { // inverse
+      qudaMemcpy(cloverInv.data(), static_cast<char *>(buffer) + buffer_offset, bytes, qudaMemcpyDefault);
     }
   }
 
@@ -313,12 +268,12 @@ namespace quda {
                              QudaParity parity) const
   {
     if (location == QUDA_CUDA_FIELD_LOCATION && is_prefetch_enabled()) {
-      auto clover_parity = clover;
-      auto cloverInv_parity = cloverInv;
       auto bytes_parity = parity == QUDA_INVALID_PARITY ? bytes : bytes / 2;
+      auto clover_parity = clover.data();
+      auto cloverInv_parity = inverse ? cloverInv.data() : nullptr;
       if (parity == QUDA_ODD_PARITY) {
-        clover_parity = clover ? static_cast<char *>(clover_parity) + bytes_parity : nullptr;
-        cloverInv_parity = cloverInv ? static_cast<char *>(cloverInv_parity) + bytes_parity : nullptr;
+        clover_parity = static_cast<char *>(clover_parity) + bytes_parity;
+        cloverInv_parity = inverse ? static_cast<char *>(cloverInv_parity) + bytes_parity : nullptr;
       }
 
       switch (type) {
@@ -376,7 +331,7 @@ namespace quda {
     spinor_param.fieldOrder = colorspinor::getNative(a.Precision(), a.Nspin());
     spinor_param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
     spinor_param.create = QUDA_REFERENCE_FIELD_CREATE;
-    spinor_param.v = (void*)a.V(inverse);
+    spinor_param.v = a.V(inverse);
     spinor_param.location = a.Location();
     return spinor_param;
   }
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 9649ce9a7f..26a373d29e 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -23,15 +23,6 @@ namespace quda
     composite_descr(param.is_composite, param.composite_dim, param.is_component, param.component_id),
     components(0)
   {
-    // this must come before create
-    if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
-      v = param.v;
-      norm_offset = param.norm_offset;
-      reference = true;
-    } else if (param.create == QUDA_GHOST_FIELD_CREATE) {
-      ghost_only = true;
-    }
-
     create(param);
 
     switch (param.create) {
@@ -157,21 +148,13 @@ namespace quda
       errorQuda("Subset not implemented");
 
     if (param.create != QUDA_REFERENCE_FIELD_CREATE && param.create != QUDA_GHOST_FIELD_CREATE) {
-      if (location == QUDA_CPU_FIELD_LOCATION) {
-        v = safe_malloc(bytes);
-      } else if (location == QUDA_CUDA_FIELD_LOCATION) {
-        switch (mem_type) {
-        case QUDA_MEMORY_DEVICE: v = pool_device_malloc(bytes); break;
-        case QUDA_MEMORY_MAPPED:
-          v_h = mapped_malloc(bytes);
-          v = get_mapped_device_pointer(v_h);
-          break;
-        default: errorQuda("Unsupported memory type %d", mem_type);
-        }
-      } else {
-        errorQuda("Unexpected field location %d", location);
-      }
+      v = std::move(quda_ptr(mem_type, bytes));
       alloc = true;
+    } else  if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
+      v = std::move(quda_ptr(param.v, mem_type));
+      reference = true;
+    } else if (param.create == QUDA_GHOST_FIELD_CREATE) {
+      ghost_only = true;
     }
 
     if (composite_descr.is_composite && param.create != QUDA_REFERENCE_FIELD_CREATE
@@ -186,7 +169,7 @@ namespace quda
       components.reserve(composite_descr.dim);
       for (int cid = 0; cid < composite_descr.dim; cid++) {
         param.component_id = cid;
-        param.v = static_cast<void *>(static_cast<char *>(v) + cid * bytes / composite_descr.dim);
+        param.v = static_cast<void *>(static_cast<char *>(v.data()) + cid * bytes / composite_descr.dim);
         components.push_back(new ColorSpinorField(param));
       }
     }
@@ -203,7 +186,7 @@ namespace quda
       param.is_component = composite_descr.is_component;
       param.component_id = composite_descr.id;
       even = new ColorSpinorField(param);
-      param.v = static_cast<char *>(v) + bytes / 2;
+      param.v = static_cast<char *>(v.data()) + bytes / 2;
       odd = new ColorSpinorField(param);
     }
 
@@ -231,10 +214,10 @@ namespace quda
       size_t subset_bytes_raw = bytes_raw / siteSubset;
       for (int subset = 0; subset < siteSubset; subset++) {
         if (location == QUDA_CUDA_FIELD_LOCATION)
-          qudaMemsetAsync(static_cast<char *>(v) + subset_bytes_raw + subset_bytes * subset, 0,
+          qudaMemsetAsync(static_cast<char *>(v.data()) + subset_bytes_raw + subset_bytes * subset, 0,
                           subset_bytes - subset_bytes_raw, device::get_default_stream());
         else
-          memset(static_cast<char *>(v) + subset_bytes_raw + subset_bytes * subset, 0, subset_bytes - subset_bytes_raw);
+          memset(static_cast<char *>(v.data()) + subset_bytes_raw + subset_bytes * subset, 0, subset_bytes - subset_bytes_raw);
       }
     }
   }
@@ -252,8 +235,7 @@ namespace quda
     pc_type = std::exchange(src.pc_type, QUDA_PC_INVALID);
     suggested_parity = std::exchange(src.suggested_parity, QUDA_INVALID_PARITY);
     length = std::exchange(src.length, 0);
-    v = std::exchange(src.v, nullptr);
-    v_h = std::exchange(src.v_h, nullptr);
+    v = std::exchange(src.v, {});
     norm_offset = std::exchange(src.norm_offset, 0);
     ghost = std::exchange(src.ghost, {});
     ghostFace = std::exchange(src.ghostFace, {});
@@ -274,18 +256,7 @@ namespace quda
   void ColorSpinorField::destroy()
   {
     if (alloc) {
-      if (location == QUDA_CPU_FIELD_LOCATION) {
-        host_free(v);
-      } else { // device field
-        switch (mem_type) {
-        case QUDA_MEMORY_DEVICE: pool_device_free(v); break;
-        case QUDA_MEMORY_MAPPED: host_free(v_h); break;
-        default: errorQuda("Unsupported memory type %d", mem_type);
-        }
-      }
       alloc = false;
-      v = nullptr;
-      v_h = nullptr;
 
       if (composite_descr.is_composite) {
         CompositeColorSpinorField::iterator vec;
@@ -432,11 +403,7 @@ namespace quda
 
   void ColorSpinorField::zero()
   {
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemsetAsync(v, 0, bytes, device::get_default_stream());
-    } else {
-      memset(v, '\0', bytes);
-    }
+    qudaMemsetAsync(v, 0, bytes, device::get_default_stream());
   }
 
   void ColorSpinorField::copy(const ColorSpinorField &src)
@@ -459,7 +426,7 @@ namespace quda
         void *buffer = pool_pinned_malloc(bytes);
         memset(buffer, 0, bytes); // FIXME (temporary?) bug fix for padding
         copyGenericColorSpinor(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, 0);
-        qudaMemcpy(v, buffer, bytes, qudaMemcpyDefault);
+        qudaMemcpy(v.data(), buffer, bytes, qudaMemcpyDefault);
         pool_pinned_free(buffer);
 
       } else { // reorder on device
@@ -467,7 +434,7 @@ namespace quda
         if (src.FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) {
           // special case where we use mapped memory to read/write directly from application's array
           void *src_d = get_mapped_device_pointer(src.V());
-          copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, v, src_d);
+          copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, v.data(), src_d);
         } else {
           void *Src = nullptr, *buffer = nullptr;
           if (!zeroCopy) {
@@ -494,7 +461,7 @@ namespace quda
 
       if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // reorder on the host
         void *buffer = pool_pinned_malloc(bytes);
-        qudaMemcpy(buffer, v, bytes, qudaMemcpyDefault);
+        qudaMemcpy(buffer, v.data(), bytes, qudaMemcpyDefault);
         copyGenericColorSpinor(*this, src, QUDA_CPU_FIELD_LOCATION, 0, buffer);
         pool_pinned_free(buffer);
 
@@ -502,7 +469,7 @@ namespace quda
 
         if (FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) {
           // special case where we use zero-copy memory to read/write directly from application's array
-          void *dest_d = get_mapped_device_pointer(v);
+          void *dest_d = get_mapped_device_pointer(v.data());
           copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, dest_d, src.V());
         } else {
           void *dst = nullptr, *buffer = nullptr;
@@ -517,10 +484,10 @@ namespace quda
           copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, dst, 0);
 
           if (!zeroCopy) {
-            qudaMemcpy(v, dst, Bytes(), qudaMemcpyDefault);
+            qudaMemcpy(v.data(), dst, Bytes(), qudaMemcpyDefault);
           } else {
             qudaDeviceSynchronize();
-            memcpy(v, buffer, bytes);
+            memcpy(v.data(), buffer, bytes);
           }
 
           if (zeroCopy)
@@ -545,7 +512,7 @@ namespace quda
   {
     LatticeField::fill(param);
     param.field = const_cast<ColorSpinorField *>(this);
-    param.v = v;
+    param.v = v.data();
     param.nColor = nColor;
     param.nSpin = nSpin;
     param.nVec = nVec;
@@ -1516,49 +1483,29 @@ namespace quda
 
   void ColorSpinorField::backup() const
   {
-    if (backed_up) errorQuda("ColorSpinorField already backed up");
-
-    backup_h = new char[bytes];
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemcpy(backup_h, v, bytes, qudaMemcpyDefault);
-    } else {
-      memcpy(backup_h, v, bytes);
-    }
-
-    backed_up = true;
+    if (backup_h.size()) errorQuda("ColorSpinorField already backed up");
+    backup_h.resize(1);
+    backup_h[0] = std::move(quda_ptr(QUDA_MEMORY_HOST, bytes));
+    qudaMemcpy(backup_h[0], v, bytes, qudaMemcpyDefault);
   }
 
   void ColorSpinorField::restore() const
   {
-    if (!backed_up) errorQuda("Cannot restore since not backed up");
-
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemcpy(v, backup_h, bytes, qudaMemcpyDefault);
-      delete[] backup_h;
-    } else {
-      memcpy(v, backup_h, bytes);
-      delete[] backup_h;
-    }
-
-    backed_up = false;
+    if (!backup_h.size()) errorQuda("Cannot restore since not backed up");
+    qudaMemcpy(v, backup_h[0], bytes, qudaMemcpyDefault);
+    backup_h.resize(0);
   }
 
   void ColorSpinorField::copy_to_buffer(void *buffer) const
   {
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemcpy(buffer, v, bytes, qudaMemcpyDeviceToHost);
-    } else {
-      std::memcpy(buffer, v, bytes);
-    }
+    quda_ptr buf(buffer, QUDA_MEMORY_HOST);
+    qudaMemcpy(buf, v, bytes, qudaMemcpyDefault);
   }
 
   void ColorSpinorField::copy_from_buffer(void *buffer)
   {
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemcpy(v, buffer, bytes, qudaMemcpyHostToDevice);
-    } else {
-      std::memcpy(v, buffer, bytes);
-    }
+    quda_ptr buf(buffer, QUDA_MEMORY_HOST);
+    qudaMemcpy(v, buf, bytes, qudaMemcpyDefault);
   }
 
   void ColorSpinorField::prefetch(QudaFieldLocation mem_space, qudaStream_t stream) const
@@ -1566,7 +1513,7 @@ namespace quda
     if (Location() == QUDA_CUDA_FIELD_LOCATION) {
       // conditionals based on destructor
       if (is_prefetch_enabled() && alloc && mem_type == QUDA_MEMORY_DEVICE)
-        qudaMemPrefetchAsync(v, bytes, mem_space, stream);
+        qudaMemPrefetchAsync(v.data(), bytes, mem_space, stream);
     }
   }
 
@@ -1607,7 +1554,7 @@ namespace quda
   std::ostream &operator<<(std::ostream &out, const ColorSpinorField &a)
   {
     out << "location = " << a.Location() << std::endl;
-    out << "v = " << a.v << std::endl;
+    out << "v = " << a.v.data() << std::endl;
     out << "alloc = " << a.alloc << std::endl;
     out << "reference = " << a.reference << std::endl;
     out << "init = " << a.init << std::endl;
diff --git a/lib/dirac_clover.cpp b/lib/dirac_clover.cpp
index cf57b39352..6bb8e56df5 100644
--- a/lib/dirac_clover.cpp
+++ b/lib/dirac_clover.cpp
@@ -105,7 +105,7 @@ namespace quda {
     DiracClover(param)
   {
     // For the preconditioned operator, we need to check that the inverse of the clover term is present
-    if (!clover->cloverInv && !clover::dynamic_inverse()) errorQuda("Clover inverse required for DiracCloverPC");
+    if (!clover->Inverse() && !clover::dynamic_inverse()) errorQuda("Clover inverse required for DiracCloverPC");
   }
 
   DiracCloverPC::DiracCloverPC(const DiracCloverPC &dirac) : DiracClover(dirac) { }
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index cb4319857e..e129de86bf 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -897,7 +897,8 @@ namespace quda {
     }
   }
 
-  void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
+  void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
+  {
     if (order == QUDA_QDP_GAUGE_ORDER) {
       void **buffer = new void*[geometry];
       for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes/geometry);
@@ -905,11 +906,10 @@ namespace quda {
     } else {
       return pool_device_malloc(bytes);
     }
-
   }
 
-  void **create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
-
+  void **create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
+  {
     if (order > 4) {
       void **buffer = new void*[geometry];
       for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes[d]);
@@ -917,10 +917,10 @@ namespace quda {
     } else {
       return 0;
     }
-
   }
 
-  void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
+  void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
+  {
     if (order == QUDA_QDP_GAUGE_ORDER) {
       for (int d=0; d<geometry; d++) pool_device_free(((void**)buffer)[d]);
       delete []((void**)buffer);
@@ -929,7 +929,8 @@ namespace quda {
     }
   }
 
-  void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
+  void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
+  {
     if (order > 4) {
       for (int d=0; d<geometry; d++) pool_device_free(buffer[d]);
       delete []buffer;
@@ -1281,39 +1282,34 @@ namespace quda {
 
   void GaugeField::backup() const
   {
-    if (backed_up) errorQuda("Gauge field already backed up");
+    if (backup_h.size()) errorQuda("Gauge field already backed up");
 
     if (order == QUDA_QDP_GAUGE_ORDER) {
-      char **buffer = new char *[geometry];
+      backup_h.resize(geometry);
       for (int d = 0; d < geometry; d++) {
-        buffer[d] = new char[bytes / geometry];
-        qudaMemcpy(buffer[d], gauge_array[d].data(), bytes / geometry, qudaMemcpyDefault);
+        backup_h[d] = std::move(quda_ptr(QUDA_MEMORY_HOST, bytes / geometry));
+        qudaMemcpy(backup_h[d], gauge_array[d], bytes / geometry, qudaMemcpyDefault);
       }
-      backup_h = reinterpret_cast<char *>(buffer);
     } else {
-      backup_h = new char[bytes];
-      qudaMemcpy(backup_h, gauge.data(), bytes, qudaMemcpyDefault);
+      backup_h.resize(1);
+      backup_h[0] = std::move(quda_ptr(QUDA_MEMORY_HOST, bytes));
+      qudaMemcpy(backup_h[0], gauge, bytes, qudaMemcpyDefault);
     }
-
-    backed_up = true;
   }
 
   void GaugeField::restore() const
   {
-    if (!backed_up) errorQuda("Cannot restore since not backed up");
+    if (!backup_h.size()) errorQuda("Cannot restore since not backed up");
 
     if (order == QUDA_QDP_GAUGE_ORDER) {
-      char **buffer = reinterpret_cast<char **>(backup_h);
       for (int d = 0; d < geometry; d++) {
-        qudaMemcpy(gauge_array[d].data(), buffer[d], bytes / geometry, qudaMemcpyDefault);
-        delete[] buffer[d];
+        qudaMemcpy(gauge_array[d], backup_h[d], bytes / geometry, qudaMemcpyDefault);
       }
-      delete[] buffer;
     } else {
-      qudaMemcpy(gauge.data(), backup_h, bytes, qudaMemcpyDefault);
-      delete[] backup_h;
+      qudaMemcpy(gauge, backup_h[0], bytes, qudaMemcpyDefault);
     }
-    backed_up = false;
+
+    backup_h.resize(0);
   }
 
   void GaugeField::copy_to_buffer(void *buffer) const
diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp
index 8b4b123776..b75b1dcff8 100644
--- a/lib/lattice_field.cpp
+++ b/lib/lattice_field.cpp
@@ -29,17 +29,13 @@ namespace quda {
     volume(1),
     localVolume(1),
     pad(param.pad),
-    total_bytes(0),
     nDim(param.nDim),
     location(param.location),
     precision(param.Precision()),
     ghost_precision(param.GhostPrecision()),
-    ghost_precision_reset(false),
     scale(param.scale),
     siteSubset(param.siteSubset),
     ghostExchange(param.ghostExchange),
-    ghost_bytes(0),
-    ghost_bytes_old(0),
     ghost_face_bytes {},
     ghost_face_bytes_aligned {},
     ghost_offset(),
@@ -59,11 +55,7 @@ namespace quda {
     mh_send {},
     mh_recv_rdma {},
     mh_send_rdma {},
-    initComms(false),
-    mem_type(param.mem_type),
-    backup_h(nullptr),
-    backup_norm_h(nullptr),
-    backed_up(false)
+    mem_type(param.mem_type)
   {
     create(param);
   }
@@ -75,18 +67,14 @@ namespace quda {
     localVolumeCB(field.localVolumeCB),
     stride(field.stride),
     pad(field.pad),
-    total_bytes(0),
     nDim(field.nDim),
     location(field.location),
     precision(field.precision),
     ghost_precision(field.ghost_precision),
-    ghost_precision_reset(false),
     scale(field.scale),
     siteSubset(field.siteSubset),
     ghostExchange(field.ghostExchange),
     nDimComms(field.nDimComms),
-    ghost_bytes(0),
-    ghost_bytes_old(0),
     ghost_face_bytes {},
     ghost_face_bytes_aligned {},
     ghost_offset(),
@@ -106,11 +94,7 @@ namespace quda {
     mh_send {},
     mh_recv_rdma {},
     mh_send_rdma {},
-    initComms(false),
-    mem_type(field.mem_type),
-    backup_h(nullptr),
-    backup_norm_h(nullptr),
-    backed_up(false)
+    mem_type(field.mem_type)
   {
     LatticeFieldParam param;
     field.fill(param);
@@ -247,9 +231,7 @@ namespace quda {
     vol_string = std::exchange(src.vol_string, {});
     aux_string = std::exchange(src.aux_string, {});
     mem_type = std::exchange(src.mem_type, QUDA_MEMORY_INVALID);
-    backup_h = std::exchange(src.backup_h, nullptr);
-    backup_norm_h = std::exchange(src.backup_norm_h, nullptr);
-    backed_up = std::exchange(src.backed_up, false);
+    backup_h = std::exchange(src.backup_h, {});
   }
 
   void LatticeField::fill(LatticeFieldParam &param) const

From ece19db8847cde90a40bb44c3ef9ae2189386ee7 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 26 May 2023 13:15:12 -0700
Subject: [PATCH 17/60] Fix clang warnings

---
 include/color_spinor_field.h | 2 +-
 include/gauge_field.h        | 2 +-
 lib/dslash_coarse.hpp        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index 9b88534e58..8186425d1c 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -905,7 +905,7 @@ namespace quda
     static void test_compatible_weak(const ColorSpinorField &a, const ColorSpinorField &b);
 
     friend std::ostream &operator<<(std::ostream &out, const ColorSpinorField &);
-    friend class ColorSpinorParam;
+    friend struct ColorSpinorParam;
   };
 
   /**
diff --git a/include/gauge_field.h b/include/gauge_field.h
index 52a4a40b06..bf75bc6bfa 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -599,7 +599,7 @@ namespace quda {
     */
     void copy_from_buffer(void *buffer);
 
-    friend class GaugeFieldParam;
+    friend struct GaugeFieldParam;
   };
 
   /**
diff --git a/lib/dslash_coarse.hpp b/lib/dslash_coarse.hpp
index 486217ddd5..a98290d129 100644
--- a/lib/dslash_coarse.hpp
+++ b/lib/dslash_coarse.hpp
@@ -740,7 +740,7 @@ namespace quda {
       strcat(aux, dslash.inA[0].AuxString().c_str());
       strcat(aux, ",gauge_prec=");
 
-      char prec_str[8];
+      char prec_str[16];
       i32toa(prec_str, dslash.Y.Precision());
       strcat(aux, prec_str);
       strcat(aux, ",halo_prec=");

From f5685855cc26d2a9faab7d3407b44df44bac41f4 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 30 May 2023 12:16:26 -0700
Subject: [PATCH 18/60] Clean up and fix some bugs that crept in

---
 include/quda_api.h            | 25 ++++++-----------------
 lib/clover_field.cpp          | 18 ++++++++---------
 lib/coarse_op.in.cu           |  2 +-
 lib/color_spinor_field.cpp    | 13 ++++--------
 lib/gauge_field.cpp           | 37 ++++++++++++-----------------------
 lib/targets/cuda/quda_api.cpp | 20 +++----------------
 6 files changed, 35 insertions(+), 80 deletions(-)

diff --git a/include/quda_api.h b/include/quda_api.h
index d2abba24d1..b3b9f35b69 100644
--- a/include/quda_api.h
+++ b/include/quda_api.h
@@ -90,17 +90,6 @@ namespace quda
    */
   void qudaMemset_(void *ptr, int value, size_t count, const char *func, const char *file, const char *line);
 
-  /**
-     @brief Heterogenous memset2d function
-     @param[out] ptr Heterogeneous pointer
-     @param[in] offset Offset shift in bytes from the base pointer
-     @param[in] Pitch in bytes
-     @param[in] value Value to set for each byte of specified memory
-     @param[in] width Width in bytes
-     @param[in] height Height in bytes
-   */
-  void qudaMemset2D_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height, const char *func, const char *file, const char *line);
-
   /**
      @brief Wrapper around cudaMemsetAsync or driver API equivalent
      @param[out] ptr Starting address pointer
@@ -122,16 +111,17 @@ namespace quda
                         const char *file, const char *line);
 
   /**
-     @brief Wrapper around cudaMemsetAsync or driver API equivalent
+     @brief Asynchronous heterogenous memset2d function
      @param[out] ptr Starting address pointer
+     @param[in] Initial offset from pointer
      @param[in] Pitch in bytes
      @param[in] value Value to set for each byte of specified memory
      @param[in] width Width in bytes
      @param[in] height Height in bytes
      @param[in] stream Stream to issue memset
    */
-  void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream,
-                          const char *func, const char *file, const char *line);
+  void qudaMemset2DAsync_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height,
+                          const qudaStream_t &stream, const char *func, const char *file, const char *line);
 
   /**
      @brief Wrapper around cudaMemPrefetchAsync or driver API equivalent
@@ -253,14 +243,11 @@ namespace quda
 #define qudaMemset(ptr, value, count)                                                                                  \
   ::quda::qudaMemset_(ptr, value, count, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
 
-#define qudaMemset2D(ptr, offset, pitch, value, width, height)          \
-  ::quda::qudaMemset2D_(ptr, offset, pitch, value, width, height, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
-
 #define qudaMemsetAsync(ptr, value, count, stream)                                                                     \
   ::quda::qudaMemsetAsync_(ptr, value, count, stream, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
 
-#define qudaMemset2DAsync(ptr, pitch, value, width, height, stream)                                                    \
-  ::quda::qudaMemset2DAsync_(ptr, pitch, value, width, height, stream, __func__, quda::file_name(__FILE__),            \
+#define qudaMemset2DAsync(ptr, offset, pitch, value, width, height, stream) \
+  ::quda::qudaMemset2DAsync_(ptr, offset, pitch, value, width, height, stream, __func__, quda::file_name(__FILE__),    \
                              __STRINGIFY__(__LINE__))
 
 #define qudaMemPrefetchAsync(ptr, count, mem_space, stream)                                                            \
diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp
index 78076c0c9a..2727069224 100644
--- a/lib/clover_field.cpp
+++ b/lib/clover_field.cpp
@@ -337,19 +337,17 @@ namespace quda {
   }
 
   // Return the L2 norm squared of the clover field
-  double norm2(const CloverField &a, bool inverse) {
-    ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(a, inverse));
-    double nrm2 = blas::norm2(*b);
-    delete b;
-    return nrm2;
+  double norm2(const CloverField &a, bool inverse)
+  {
+    ColorSpinorField b(colorSpinorParam(a, inverse));
+    return blas::norm2(b);
   }
 
   // Return the L1 norm of the clover field
-  double norm1(const CloverField &a, bool inverse) {
-    ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(a, inverse));
-    double nrm1 = blas::norm1(*b);
-    delete b;
-    return nrm1;
+  double norm1(const CloverField &a, bool inverse)
+  {
+    ColorSpinorField b(colorSpinorParam(a, inverse));
+    return blas::norm1(b);
   }
 
 } // namespace quda
diff --git a/lib/coarse_op.in.cu b/lib/coarse_op.in.cu
index 0684e0e97a..358c3ba0b9 100644
--- a/lib/coarse_op.in.cu
+++ b/lib/coarse_op.in.cu
@@ -97,7 +97,7 @@ namespace quda {
       gCoarseAtomic yAccessorAtomic(const_cast<GaugeField&>(Yatomic));
       gCoarseAtomic xAccessorAtomic(const_cast<GaugeField&>(Xatomic));
       cFine cAccessor(const_cast<CloverField&>(c), false);
-      cFine cInvAccessor(const_cast<CloverField&>(c), true);
+      cFine cInvAccessor(const_cast<CloverField&>(c), c.Inverse());
 
       calculateY<use_mma, QUDA_CUDA_FIELD_LOCATION, false,Float,fineSpin,fineColor,coarseSpin,coarseColor>
         (yAccessor, xAccessor, yAccessorAtomic, xAccessorAtomic, uvAccessor,
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 26a373d29e..a40f191712 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -186,7 +186,7 @@ namespace quda
       param.is_component = composite_descr.is_component;
       param.component_id = composite_descr.id;
       even = new ColorSpinorField(param);
-      param.v = static_cast<char *>(v.data()) + bytes / 2;
+      param.v = !ghost_only ? static_cast<char *>(v.data()) + bytes / 2 : nullptr;
       odd = new ColorSpinorField(param);
     }
 
@@ -208,17 +208,12 @@ namespace quda
 
   void ColorSpinorField::zeroPad()
   {
+    if (!isNative()) return;
     // zero the region added for alignment reasons
     if (bytes != bytes_raw) {
       size_t subset_bytes = bytes / siteSubset;
       size_t subset_bytes_raw = bytes_raw / siteSubset;
-      for (int subset = 0; subset < siteSubset; subset++) {
-        if (location == QUDA_CUDA_FIELD_LOCATION)
-          qudaMemsetAsync(static_cast<char *>(v.data()) + subset_bytes_raw + subset_bytes * subset, 0,
-                          subset_bytes - subset_bytes_raw, device::get_default_stream());
-        else
-          memset(static_cast<char *>(v.data()) + subset_bytes_raw + subset_bytes * subset, 0, subset_bytes - subset_bytes_raw);
-      }
+      qudaMemset2DAsync(v, subset_bytes_raw, subset_bytes, 0, subset_bytes - subset_bytes_raw, siteSubset, device::get_default_stream());
     }
   }
 
@@ -512,7 +507,7 @@ namespace quda
   {
     LatticeField::fill(param);
     param.field = const_cast<ColorSpinorField *>(this);
-    param.v = v.data();
+    param.v = !ghost_only ? v.data() : nullptr;
     param.nColor = nColor;
     param.nSpin = nSpin;
     param.nVec = nVec;
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index e129de86bf..51d5b59a47 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -309,7 +309,7 @@ namespace quda {
     size_t pitch = stride * order * precision;
     if (pad_bytes) {
       for (int parity = 0; parity < 2; parity++) {
-        qudaMemset2D(gauge, parity * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad);
+        qudaMemset2DAsync(gauge, parity * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad, device::get_default_stream());
       }
     }
   }
@@ -863,15 +863,6 @@ namespace quda {
       comm_wait(mh_recv[i]);
     }
 
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      for (int i=0; i<nDimComms; i++) {
-	if (!comm_dim_partitioned(i)) continue;
-        qudaMemcpy(ghost_link[i], receive[i], bytes[i], qudaMemcpyHostToDevice);
-        pool_pinned_free(send[i]);
-	pool_pinned_free(receive[i]);
-      }
-    }
-
     for (int i=0; i<nDimComms; i++) {
       if (!comm_dim_partitioned(i)) continue;
       comm_free(mh_send[i]);
@@ -1178,26 +1169,24 @@ namespace quda {
   }
 
   // Return the L2 norm squared of the gauge field
-  double norm2(const GaugeField &a) {
-    ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(a));
-    double nrm2 = blas::norm2(*b);
-    delete b;
-    return nrm2;
+  double norm2(const GaugeField &a)
+  {
+    ColorSpinorField b(colorSpinorParam(a));
+    return blas::norm2(b);
   }
 
   // Return the L1 norm of the gauge field
-  double norm1(const GaugeField &a) {
-    ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(a));
-    double nrm1 = blas::norm1(*b);
-    delete b;
-    return nrm1;
+  double norm1(const GaugeField &a)
+  {
+    ColorSpinorField b(colorSpinorParam(a));
+    return blas::norm1(b);
   }
 
   // Scale the gauge field by the constant a
-  void ax(const double &a, GaugeField &u) {
-    ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(u));
-    blas::ax(a, *b);
-    delete b;
+  void ax(const double &a, GaugeField &u)
+  {
+    ColorSpinorField b(colorSpinorParam(u));
+    blas::ax(a, b);
   }
 
   uint64_t GaugeField::checksum(bool mini) const {
diff --git a/lib/targets/cuda/quda_api.cpp b/lib/targets/cuda/quda_api.cpp
index 8c57fe9079..00f829b7a4 100644
--- a/lib/targets/cuda/quda_api.cpp
+++ b/lib/targets/cuda/quda_api.cpp
@@ -407,31 +407,17 @@ namespace quda
     }
   }
 
-  void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func,
-                     const char *file, const char *line)
-  {
-    cudaError_t error = cudaMemset2D(ptr, pitch, value, width, height);
-    set_runtime_error(error, __func__, func, file, line);
-  }
-
-  void qudaMemset2D_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height, const char *func,
-                     const char *file, const char *line)
+  void qudaMemset2DAsync_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height,
+                          const qudaStream_t &stream, const char *func, const char *file, const char *line)
   {
     if (ptr.is_device()) {
-      cudaError_t error = cudaMemset2D(static_cast<char*>(ptr.data()) + offset, pitch, value, width, height);
+      cudaError_t error = cudaMemset2DAsync(static_cast<char*>(ptr.data()) + offset, pitch, value, width, height, get_stream(stream));
       set_runtime_error(error, __func__, func, file, line);
     } else {
       for (auto i = 0u; i < height; i++) memset(static_cast<char*>(ptr.data()) + offset + i * pitch, value, width);
     }
   }
 
-  void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream,
-                          const char *func, const char *file, const char *line)
-  {
-    cudaError_t error = cudaMemset2DAsync(ptr, pitch, value, width, height, get_stream(stream));
-    set_runtime_error(error, __func__, func, file, line);
-  }
-
   void qudaMemPrefetchAsync_(void *ptr, size_t count, QudaFieldLocation mem_space, const qudaStream_t &stream,
                              const char *func, const char *file, const char *line)
   {

From b15f94d9dabb258eb5792671af1164fc7611498a Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 30 May 2023 22:44:39 -0700
Subject: [PATCH 19/60] Update MRE solver to use getProfile

---
 include/invert_quda.h  |  4 +---
 lib/interface_quda.cpp |  6 +++---
 lib/inv_mre.cpp        | 20 ++++++++++----------
 3 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/include/invert_quda.h b/include/invert_quda.h
index b2699ad3b7..8043fb7f0d 100644
--- a/include/invert_quda.h
+++ b/include/invert_quda.h
@@ -1542,8 +1542,6 @@ namespace quda {
     bool apply_mat; //! Whether to compute q = Ap or assume it is provided
     bool hermitian; //! Whether A is hermitian or not
 
-    TimeProfile &profile;
-
     /**
        @brief Solve the equation A p_k psi_k = q_k psi_k = b by minimizing the
        residual and using Eigen's SVD algorithm for numerical stability
@@ -1562,7 +1560,7 @@ namespace quda {
        @param apply_mat Whether to apply the operator in place or assume q already contains this
        @profile Timing profile to use
     */
-    MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian, TimeProfile &profile = dummy);
+    MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian);
 
     /**
        @param x The optimum for the solution vector.
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index aa25b06621..3c32443205 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -2898,7 +2898,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
       bool orthogonal = true;
       bool apply_mat = false;
       bool hermitian = false;
-      MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);
+      MinResExt mre(m, orthogonal, apply_mat, hermitian);
       mre(*out, *in, basis, Ap);
 
       profileInvert.TPSTOP(QUDA_PROFILE_CHRONO);
@@ -2933,7 +2933,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
       bool orthogonal = true;
       bool apply_mat = false;
       bool hermitian = true;
-      MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);
+      MinResExt mre(m, orthogonal, apply_mat, hermitian);
       mre(*out, *in, basis, Ap);
 
       profileInvert.TPSTOP(QUDA_PROFILE_CHRONO);
@@ -3712,7 +3712,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
           bool orthogonal = false;
           bool apply_mat = true;
           bool hermitian = true;
-	  MinResExt mre(*m, orthogonal, apply_mat, hermitian, profileMulti);
+	  MinResExt mre(*m, orthogonal, apply_mat, hermitian);
           mre(x[i], b, z, q);
         }
 
diff --git a/lib/inv_mre.cpp b/lib/inv_mre.cpp
index 91a79bab55..10733a6aaa 100644
--- a/lib/inv_mre.cpp
+++ b/lib/inv_mre.cpp
@@ -5,8 +5,8 @@
 namespace quda
 {
 
-  MinResExt::MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian, TimeProfile &profile) :
-    mat(mat), orthogonal(orthogonal), apply_mat(apply_mat), hermitian(hermitian), profile(profile)
+  MinResExt::MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian) :
+    mat(mat), orthogonal(orthogonal), apply_mat(apply_mat), hermitian(hermitian)
   {
   }
 
@@ -44,14 +44,14 @@ namespace quda
       for (int j = 0; j < N; j++) { A(i, j) = A_[i * (N + 1) + j]; }
     }
 
-    profile.TPSTOP(QUDA_PROFILE_CHRONO);
-    profile.TPSTART(QUDA_PROFILE_EIGEN);
+    getProfile().TPSTOP(QUDA_PROFILE_CHRONO);
+    getProfile().TPSTART(QUDA_PROFILE_EIGEN);
 
     LDLT<matrix> cholesky(A);
     psi = cholesky.solve(phi);
 
-    profile.TPSTOP(QUDA_PROFILE_EIGEN);
-    profile.TPSTART(QUDA_PROFILE_CHRONO);
+    getProfile().TPSTOP(QUDA_PROFILE_EIGEN);
+    getProfile().TPSTART(QUDA_PROFILE_CHRONO);
 
     for (int i = 0; i < N; i++) psi_[i] = psi(i);
   }
@@ -70,8 +70,8 @@ namespace quda
   void MinResExt::operator()(ColorSpinorField &x, const ColorSpinorField &b, std::vector<ColorSpinorField> &p,
                              std::vector<ColorSpinorField> &q)
   {
-    bool running = profile.isRunning(QUDA_PROFILE_CHRONO);
-    if (!running) profile.TPSTART(QUDA_PROFILE_CHRONO);
+    bool running = getProfile().isRunning(QUDA_PROFILE_CHRONO);
+    if (!running) getProfile().TPSTART(QUDA_PROFILE_CHRONO);
 
     const int N = p.size();
     logQuda(QUDA_VERBOSE, "Constructing minimum residual extrapolation with basis size %d\n", N);
@@ -81,7 +81,7 @@ namespace quda
         blas::zero(x);
       else
         blas::copy(x, p[0]);
-      if (!running) profile.TPSTOP(QUDA_PROFILE_CHRONO);
+      if (!running) getProfile().TPSTOP(QUDA_PROFILE_CHRONO);
       return;
     }
 
@@ -133,7 +133,7 @@ namespace quda
       printfQuda("MinResExt: N = %d, |res| / |src| = %e\n", N, sqrt(blas::norm2(r) / blas::norm2(b)));
     }
 
-    if (!running) profile.TPSTOP(QUDA_PROFILE_CHRONO);
+    if (!running) getProfile().TPSTOP(QUDA_PROFILE_CHRONO);
   }
 
 } // namespace quda

From 2f4c41d6da30aa273b66fedbe8bea54515f82a68 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 30 May 2023 22:46:48 -0700
Subject: [PATCH 20/60] Include some missing headers that broke jitify

---
 include/multi_blas_helper.cuh      | 1 +
 include/reference_wrapper_helper.h | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/include/multi_blas_helper.cuh b/include/multi_blas_helper.cuh
index 6a470fe576..78aaa1ac4b 100644
--- a/include/multi_blas_helper.cuh
+++ b/include/multi_blas_helper.cuh
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <algorithm>
+#include <utility>
 #include <register_traits.h>
 #include <blas_helper.cuh>
 #include <reduce_helper.h>
diff --git a/include/reference_wrapper_helper.h b/include/reference_wrapper_helper.h
index 2b85c497fd..3f73709ca6 100644
--- a/include/reference_wrapper_helper.h
+++ b/include/reference_wrapper_helper.h
@@ -1,6 +1,8 @@
 #pragma once
 
+#include <type_traits>
 #include <functional>
+#include <iterator>
 #include <initializer_list>
 #include <enum_quda.h>
 #include <util_quda.h>

From 0178ab5f92c98a15ec25d660ffb057dee2c409ae Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 30 May 2023 22:49:39 -0700
Subject: [PATCH 21/60] Move contents of TimeProfile to timer.cpp to avoid
 breaking jitify.

---
 include/timer.h     | 94 ++++----------------------------------------
 include/tune_quda.h | 59 ++++------------------------
 lib/timer.cpp       | 96 ++++++++++++++++++++++++++++++++++++++++++---
 lib/tune.cpp        | 54 +++++++++++++++++++++++++
 4 files changed, 161 insertions(+), 142 deletions(-)

diff --git a/include/timer.h b/include/timer.h
index 2475fee154..0d529867cb 100644
--- a/include/timer.h
+++ b/include/timer.h
@@ -2,10 +2,6 @@
 
 #include <sys/time.h>
 
-#ifdef INTERFACE_NVTX
-#include "nvtx3/nvToolsExt.h"
-#endif
-
 #include <quda_internal.h>
 #include <util_quda.h>
 #include <device.h>
@@ -191,70 +187,25 @@ namespace quda {
     QUDA_PROFILE_COUNT  /**< The total number of timers we have.  Must be last enum type. */
   };
 
-#ifdef INTERFACE_NVTX
-
-#define PUSH_RANGE(name,cid) { \
-    int color_id = cid; \
-    color_id = color_id%nvtx_num_colors;\
-    nvtxEventAttributes_t eventAttrib = {}; \
-    eventAttrib.version = NVTX_VERSION; \
-    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
-    eventAttrib.colorType = NVTX_COLOR_ARGB; \
-    eventAttrib.color = nvtx_colors[color_id]; \
-    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
-    eventAttrib.message.ascii = name; \
-    eventAttrib.category = cid;\
-    nvtxRangePushEx(&eventAttrib); \
-}
-#define POP_RANGE nvtxRangePop();
-#else
-#define PUSH_RANGE(name,cid)
-#define POP_RANGE
-#endif
-
   class TimeProfile {
     std::string fname;  /**< Which function are we profiling */
 #ifdef INTERFACE_NVTX
     static const uint32_t nvtx_colors[];// = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
     static const int nvtx_num_colors;// = sizeof(nvtx_colors)/sizeof(uint32_t);
 #endif
-    host_timer_t profile[QUDA_PROFILE_COUNT];
+    array<host_timer_t, QUDA_PROFILE_COUNT> profile;
     static std::string pname[];
 
     bool switchOff;
     bool use_global;
 
-    // global timer
-    static host_timer_t global_profile[QUDA_PROFILE_COUNT];
-    static bool global_switchOff[QUDA_PROFILE_COUNT];
-    static int global_total_level[QUDA_PROFILE_COUNT]; // zero initialize
-
-    static void StopGlobal(const char *func, const char *file, int line, QudaProfileType idx) {
-
-      global_total_level[idx]--;
-      if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line);
-
-      // switch off total timer if we need to
-      if (global_switchOff[idx]) {
-        global_total_level[idx]--;
-        if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line);
-        global_switchOff[idx] = false;
-      }
-    }
-
-    static void StartGlobal(const char *func, const char *file, int line, QudaProfileType idx) {
-      // if total timer isn't running, then start it running
-      if (!global_profile[idx].running) {
-        global_profile[idx].start(func, file, line);
-        global_total_level[idx]++;
-        global_switchOff[idx] = true;
-      }
-
-      if (global_total_level[idx] == 0) global_profile[idx].start(func, file, line);
-      global_total_level[idx]++;
-    }
+    static void StopGlobal(const char *func, const char *file, int line, QudaProfileType idx);
+    static void StartGlobal(const char *func, const char *file, int line, QudaProfileType idx);
 
   public:
+    TimeProfile() = default;
+    TimeProfile(const TimeProfile &) = default;
+
     TimeProfile(std::string fname) : fname(fname), switchOff(false), use_global(true) { ; }
 
     TimeProfile(std::string fname, bool use_global) : fname(fname), switchOff(false), use_global(use_global) { ; }
@@ -262,32 +213,8 @@ namespace quda {
     /**< Print out the profile information */
     void Print();
 
-    void Start_(const char *func, const char *file, int line, QudaProfileType idx)
-    {
-      // if total timer isn't running, then start it running
-      if (!profile[QUDA_PROFILE_TOTAL].running && idx != QUDA_PROFILE_TOTAL) {
-        profile[QUDA_PROFILE_TOTAL].start(func, file, line);
-        switchOff = true;
-      }
-
-      profile[idx].start(func, file, line);
-      PUSH_RANGE(fname.c_str(),idx)
-	if (use_global) StartGlobal(func,file,line,idx);
-    }
-
-    void Stop_(const char *func, const char *file, int line, QudaProfileType idx) {
-      if (idx == QUDA_PROFILE_COMPUTE || idx == QUDA_PROFILE_H2D || idx == QUDA_PROFILE_D2H)
-        qudaDeviceSynchronize(); // ensure accurate profiling
-      profile[idx].stop(func, file, line);
-      POP_RANGE
-
-      // switch off total timer if we need to
-      if (switchOff && idx != QUDA_PROFILE_TOTAL) {
-        profile[QUDA_PROFILE_TOTAL].stop(func, file, line);
-        switchOff = false;
-      }
-      if (use_global) StopGlobal(func,file,line,idx);
-    }
+    void Start_(const char *func, const char *file, int line, QudaProfileType idx);
+    void Stop_(const char *func, const char *file, int line, QudaProfileType idx);
 
     void Reset_(const char *func, const char *file, int line) {
       for (int idx = 0; idx < QUDA_PROFILE_COUNT; idx++) profile[idx].reset(func, file, line);
@@ -301,8 +228,6 @@ namespace quda {
 
   };
 
-  static TimeProfile dummy("dummy");
-
   void pushProfile(TimeProfile &profile);
 
   void popProfile();
@@ -311,9 +236,6 @@ namespace quda {
 
 } // namespace quda
 
-#undef PUSH_RANGE
-#undef POP_RANGE
-
 #define TPSTART(idx) Start_(__func__, __FILE__, __LINE__, idx)
 #define TPSTOP(idx) Stop_(__func__, __FILE__, __LINE__, idx)
 #define TPRESET() Reset_(__func__, __FILE__, __LINE__)
diff --git a/include/tune_quda.h b/include/tune_quda.h
index 1511f6f881..2aacde55f7 100644
--- a/include/tune_quda.h
+++ b/include/tune_quda.h
@@ -17,7 +17,7 @@
 
 namespace quda {
 
-  class TuneParam {
+  struct TuneParam {
 
   public:
     dim3 block;
@@ -35,16 +35,10 @@ namespace quda {
     TuneParam(TuneParam &&) = default;
     TuneParam &operator=(const TuneParam &) = default;
     TuneParam &operator=(TuneParam &&) = default;
-
-    friend std::ostream& operator<<(std::ostream& output, const TuneParam& param) {
-      output << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
-      output << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
-      output << "shared_bytes=" << param.shared_bytes;
-      output << ", aux=(" << param.aux.x << "," << param.aux.y << "," << param.aux.z << "," << param.aux.w << ")";
-      return output;
-    }
   };
 
+  std::ostream &operator<<(std::ostream &, const TuneParam &);
+
   /**
    * @brief Returns a reference to the tunecache map
    * @return tunecache reference
@@ -68,20 +62,7 @@ namespace quda {
     virtual bool tuneGridDim() const { return true; }
     virtual bool tuneAuxDim() const { return false; }
 
-    virtual bool tuneSharedBytes() const
-    {
-      static bool tune_shared = true;
-      static bool init = false;
-
-      if (!init) {
-        char *enable_shared_env = getenv("QUDA_ENABLE_TUNING_SHARED");
-        if (enable_shared_env) {
-          if (strcmp(enable_shared_env, "0") == 0) { tune_shared = false; }
-        }
-        init = true;
-      }
-      return tune_shared;
-    }
+    virtual bool tuneSharedBytes() const;
 
     virtual bool advanceGridDim(TuneParam &param) const
     {
@@ -239,16 +220,7 @@ namespace quda {
        @brief Whether the present instance has already been tuned or not
        @return True if tuned, false if not
     */
-    bool tuned()
-    {
-      // not tuning is equivalent to already tuned
-      if (!getTuning()) return true;
-
-      TuneKey key = tuneKey();
-      if (use_managed_memory()) strcat(key.aux, ",managed");
-      // if key is present in cache then already tuned
-      return getTuneCache().find(key) != getTuneCache().end();
-    }
+    bool tuned() const;
 
   public:
     Tunable() : launch_error(QUDA_SUCCESS) { aux[0] = '\0'; }
@@ -287,24 +259,9 @@ namespace quda {
      */
     virtual float min_tune_time() const { return 1e-3; }
 
-    virtual std::string paramString(const TuneParam &param) const
-    {
-      std::stringstream ps;
-      ps << param;
-      return ps.str();
-    }
-
-    virtual std::string perfString(float time) const
-    {
-      float gflops = flops() / (1e9 * time);
-      float gbytes = bytes() / (1e9 * time);
-      std::stringstream ss;
-      ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, ";
-      ss << gbytes << " GB/s";
-      return ss.str();
-    }
-
-    virtual std::string miscString(const TuneParam &) const { return std::string(); }
+    virtual std::string paramString(const TuneParam &param) const;
+    virtual std::string perfString(float time) const;
+    virtual std::string miscString(const TuneParam &) const;
 
     virtual void initTuneParam(TuneParam &param) const
     {
diff --git a/lib/timer.cpp b/lib/timer.cpp
index c4e924ee6e..2214ebd0ec 100644
--- a/lib/timer.cpp
+++ b/lib/timer.cpp
@@ -2,10 +2,15 @@
 #include <quda_internal.h>
 #include <timer.h>
 
+#ifdef INTERFACE_NVTX
+#include "nvtx3/nvToolsExt.h"
+#endif
+
 namespace quda {
 
   /**< Print out the profile information */
-  void TimeProfile::Print() {
+  void TimeProfile::Print()
+  {
     if (profile[QUDA_PROFILE_TOTAL].time > 0.0) {
       printfQuda("\n   %20s Total time = %9.3f secs\n", fname.c_str(), profile[QUDA_PROFILE_TOTAL].time);
     }
@@ -31,7 +36,6 @@ namespace quda {
       warningQuda("Accounted time %9.3f secs in %s is greater than total time %9.3f secs", accounted,
                   (const char *)&fname[0], profile[QUDA_PROFILE_TOTAL].time);
     }
-
   }
 
   std::string TimeProfile::pname[] = {"download",
@@ -79,9 +83,89 @@ namespace quda {
   const int TimeProfile::nvtx_num_colors = sizeof(nvtx_colors)/sizeof(uint32_t);
 #endif
 
-  Timer<> TimeProfile::global_profile[QUDA_PROFILE_COUNT];
-  bool TimeProfile::global_switchOff[QUDA_PROFILE_COUNT] = {};
-  int TimeProfile::global_total_level[QUDA_PROFILE_COUNT] = {};
+  // global timer
+  host_timer_t global_profile[QUDA_PROFILE_COUNT] = {};
+  static bool global_switchOff[QUDA_PROFILE_COUNT] = {};
+  static int global_total_level[QUDA_PROFILE_COUNT] = {};
+
+  void TimeProfile::StopGlobal(const char *func, const char *file, int line, QudaProfileType idx)
+  {
+    global_total_level[idx]--;
+    if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line);
+
+    // switch off total timer if we need to
+    if (global_switchOff[idx]) {
+      global_total_level[idx]--;
+      if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line);
+      global_switchOff[idx] = false;
+    }
+  }
+
+  void TimeProfile::StartGlobal(const char *func, const char *file, int line, QudaProfileType idx)
+  {
+    // if total timer isn't running, then start it running
+    if (!global_profile[idx].running) {
+      global_profile[idx].start(func, file, line);
+      global_total_level[idx]++;
+      global_switchOff[idx] = true;
+    }
+
+    if (global_total_level[idx] == 0) global_profile[idx].start(func, file, line);
+    global_total_level[idx]++;
+  }
+
+#ifdef INTERFACE_NVTX
+
+#define PUSH_RANGE(name, cid)                                                                                          \
+  {                                                                                                                    \
+    int color_id = cid;                                                                                                \
+    color_id = color_id % nvtx_num_colors;                                                                             \
+    nvtxEventAttributes_t eventAttrib = {};                                                                            \
+    eventAttrib.version = NVTX_VERSION;                                                                                \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;                                                                  \
+    eventAttrib.colorType = NVTX_COLOR_ARGB;                                                                           \
+    eventAttrib.color = nvtx_colors[color_id];                                                                         \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;                                                                 \
+    eventAttrib.message.ascii = name;                                                                                  \
+    eventAttrib.category = cid;                                                                                        \
+    nvtxRangePushEx(&eventAttrib);                                                                                     \
+  }
+#define POP_RANGE nvtxRangePop();
+#else
+#define PUSH_RANGE(name, cid)
+#define POP_RANGE
+#endif
+
+  void TimeProfile::Start_(const char *func, const char *file, int line, QudaProfileType idx)
+  {
+    // if total timer isn't running, then start it running
+    if (!profile[QUDA_PROFILE_TOTAL].running && idx != QUDA_PROFILE_TOTAL) {
+      profile[QUDA_PROFILE_TOTAL].start(func, file, line);
+      switchOff = true;
+    }
+
+    profile[idx].start(func, file, line);
+    PUSH_RANGE(fname.c_str(), idx)
+    if (use_global) StartGlobal(func, file, line, idx);
+  }
+
+  void TimeProfile::Stop_(const char *func, const char *file, int line, QudaProfileType idx)
+  {
+    if (idx == QUDA_PROFILE_COMPUTE || idx == QUDA_PROFILE_H2D || idx == QUDA_PROFILE_D2H)
+      qudaDeviceSynchronize(); // ensure accurate profiling
+    profile[idx].stop(func, file, line);
+    POP_RANGE
+
+    // switch off total timer if we need to
+    if (switchOff && idx != QUDA_PROFILE_TOTAL) {
+      profile[QUDA_PROFILE_TOTAL].stop(func, file, line);
+      switchOff = false;
+    }
+    if (use_global) StopGlobal(func, file, line, idx);
+  }
+
+#undef PUSH_RANGE
+#undef POP_RANGE
 
   void TimeProfile::PrintGlobal() {
     if (global_profile[QUDA_PROFILE_TOTAL].time > 0.0) {
@@ -114,6 +198,8 @@ namespace quda {
     }
   }
 
+  TimeProfile dummy("dummy");
+
   static std::stack<TimeProfile*> tpstack;
 
   void pushProfile(TimeProfile &profile)
diff --git a/lib/tune.cpp b/lib/tune.cpp
index 57134ec3d4..1d6971db3c 100644
--- a/lib/tune.cpp
+++ b/lib/tune.cpp
@@ -656,9 +656,63 @@ namespace quda
     aux = make_int4(1, 1, 1, 1);
   }
 
+  std::ostream &operator<<(std::ostream &output, const TuneParam &param)
+  {
+    output << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
+    output << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
+    output << "shared_bytes=" << param.shared_bytes;
+    output << ", aux=(" << param.aux.x << "," << param.aux.y << "," << param.aux.z << "," << param.aux.w << ")";
+    return output;
+  }
+
+  bool Tunable::tuneSharedBytes() const
+  {
+    static bool tune_shared = true;
+    static bool init = false;
+
+    if (!init) {
+      char *enable_shared_env = getenv("QUDA_ENABLE_TUNING_SHARED");
+      if (enable_shared_env) {
+        if (strcmp(enable_shared_env, "0") == 0) { tune_shared = false; }
+      }
+      init = true;
+    }
+    return tune_shared;
+  }
+
   int Tunable::blockStep() const { return device::warp_size(); }
   int Tunable::blockMin() const { return device::warp_size(); }
 
+  bool Tunable::tuned() const
+  {
+    // not tuning is equivalent to already tuned
+    if (!getTuning()) return true;
+
+    TuneKey key = tuneKey();
+    if (use_managed_memory()) strcat(key.aux, ",managed");
+    // if key is present in cache then already tuned
+    return getTuneCache().find(key) != getTuneCache().end();
+  }
+
+  std::string Tunable::paramString(const TuneParam &param) const
+  {
+    std::stringstream ps;
+    ps << param;
+    return ps.str();
+  }
+
+  std::string Tunable::perfString(float time) const
+  {
+    float gflops = flops() / (1e9 * time);
+    float gbytes = bytes() / (1e9 * time);
+    std::stringstream ss;
+    ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, ";
+    ss << gbytes << " GB/s";
+    return ss.str();
+  }
+
+  std::string Tunable::miscString(const TuneParam &) const { return std::string(); }
+
   int32_t Tunable::getTuneRank() const
   {
     static bool init = false;

From bf14e687ec317693b322b4cea491235d7b8fe8e2 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 30 May 2023 22:51:17 -0700
Subject: [PATCH 22/60] Fixed for covdev_test

---
 tests/covdev_test.cpp                     |  4 +--
 tests/host_reference/covdev_reference.cpp | 40 +++++++++++------------
 tests/host_reference/covdev_reference.h   |  5 ++-
 3 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/tests/covdev_test.cpp b/tests/covdev_test.cpp
index 0a5d5d38c7..d296a553af 100644
--- a/tests/covdev_test.cpp
+++ b/tests/covdev_test.cpp
@@ -161,9 +161,9 @@ void covdevRef(int mu)
   // compare to dslash reference implementation
   printfQuda("Calculating reference implementation...");
 #ifdef MULTI_GPU
-  mat_mg4dir(*spinorRef, *cpuLink, *spinor, dagger, mu, inv_param.cpu_prec, gauge_param.cpu_prec);
+  mat_mg4dir(*spinorRef, *cpuLink, *spinor, dagger, mu);
 #else
-  mat(spinorRef->V(), *cpuLink, spinor->V(), dagger, mu, inv_param.cpu_prec, gauge_param.cpu_prec);
+  mat(*spinorRef, *cpuLink, *spinor, dagger, mu);
 #endif
   printfQuda("done.\n");
 }
diff --git a/tests/host_reference/covdev_reference.cpp b/tests/host_reference/covdev_reference.cpp
index a8c178af00..97dae09402 100644
--- a/tests/host_reference/covdev_reference.cpp
+++ b/tests/host_reference/covdev_reference.cpp
@@ -82,32 +82,31 @@ void covdev_dslash(void *res, void **link, void *spinorField, int oddBit, int da
   }
 }
 
-template <typename sFloat, typename gFloat> void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu)
+template <typename sFloat, typename gFloat>
+void Mat(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu)
 {
-  sFloat *inEven = in;
-  sFloat *inOdd = in + Vh * spinor_site_size;
-  sFloat *outEven = out;
-  sFloat *outOdd = out + Vh * spinor_site_size;
-
   // full dslash operator
-  covdevReference(outOdd, link, inEven, 1, daggerBit, mu);
-  covdevReference(outEven, link, inOdd, 0, daggerBit, mu);
+  void *data[4] = {link.data(0), link.data(1), link.data(2), link.data(3)};
+  covdevReference(reinterpret_cast<sFloat *>(out.Odd().V()), reinterpret_cast<gFloat **>(data),
+                  reinterpret_cast<sFloat *>(in.Even().V()), 1, daggerBit, mu);
+  covdevReference(reinterpret_cast<sFloat *>(out.Even().V()), reinterpret_cast<gFloat **>(data),
+                  reinterpret_cast<sFloat *>(in.Odd().V()), 0, daggerBit, mu);
 }
 
-void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
+void mat(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu)
 {
 
-  if (sPrecision == QUDA_DOUBLE_PRECISION) {
-    if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      Mat((double *)out, (double **)link, (double *)in, dagger_bit, mu);
+  if (checkPrecision(in, out) == QUDA_DOUBLE_PRECISION) {
+    if (link.Precision() == QUDA_DOUBLE_PRECISION) {
+      Mat<double, double>(out, link, in, dagger_bit, mu);
     } else {
-      Mat((double *)out, (float **)link, (double *)in, dagger_bit, mu);
+      Mat<double, float>(out, link, in, dagger_bit, mu);
     }
   } else {
-    if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      Mat((float *)out, (double **)link, (float *)in, dagger_bit, mu);
+    if (link.Precision() == QUDA_DOUBLE_PRECISION) {
+      Mat<float, double>(out, link, in, dagger_bit, mu);
     } else {
-      Mat((float *)out, (float **)link, (float *)in, dagger_bit, mu);
+      Mat<float, float>(out, link, in, dagger_bit, mu);
     }
   }
 }
@@ -252,17 +251,16 @@ void Mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinor
   }
 }
 
-void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit,
-                int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
+void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu)
 {
-  if (sPrecision == QUDA_DOUBLE_PRECISION) {
-    if (gPrecision == QUDA_DOUBLE_PRECISION) {
+  if (checkPrecision(in, out) == QUDA_DOUBLE_PRECISION) {
+    if (link.Precision() == QUDA_DOUBLE_PRECISION) {
       Mat_mg4dir<double, double>(out, link, in, dagger_bit, mu);
     } else {
       Mat_mg4dir<double, float>(out, link, in, dagger_bit, mu);
     }
   } else {
-    if (gPrecision == QUDA_DOUBLE_PRECISION) {
+    if (link.Precision() == QUDA_DOUBLE_PRECISION) {
       Mat_mg4dir<float, double>(out, link, in, dagger_bit, mu);
     } else {
       Mat_mg4dir<float, float>(out, link, in, dagger_bit, mu);
diff --git a/tests/host_reference/covdev_reference.h b/tests/host_reference/covdev_reference.h
index c2045773ed..679736109a 100644
--- a/tests/host_reference/covdev_reference.h
+++ b/tests/host_reference/covdev_reference.h
@@ -11,13 +11,12 @@ void covdev_dslash(void *res, const GaugeField &link, void *spinorField, int odd
 void covdev_dslash_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int oddBit,
                           int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision);
 
-void mat(void *out, const GaugeField &link, void *in, int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision);
+void mat(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu);
 
 void matdagmat(void *out, const GaugeField &link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision,
                QudaPrecision gPrecision, void *tmp, QudaParity parity);
 
-void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu,
-                QudaPrecision sPrecision, QudaPrecision gPrecision);
+void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu);
 void matdagmat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit,
                       int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp,
                       QudaParity parity);

From 7bf774c1dc311b0b60bb8215755dbe4c967225a5 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 30 May 2023 22:59:05 -0700
Subject: [PATCH 23/60] Update jitify to latest with some custom additions yet
 to be back ported

---
 include/targets/cuda/externals/jitify.hpp | 258 ++++++++++++++++++----
 1 file changed, 212 insertions(+), 46 deletions(-)

diff --git a/include/targets/cuda/externals/jitify.hpp b/include/targets/cuda/externals/jitify.hpp
index 46a51a97cd..110be5d22e 100644
--- a/include/targets/cuda/externals/jitify.hpp
+++ b/include/targets/cuda/externals/jitify.hpp
@@ -365,7 +365,7 @@ inline std::string path_base(std::string p) {
   // "foo/bar"  -> "foo"
   // "foo/bar/" -> "foo/bar"
 #if defined _WIN32 || defined _WIN64
-  char sep = '\\';
+  const char* sep = "\\/";
 #else
   char sep = '/';
 #endif
@@ -496,10 +496,13 @@ inline std::string comment_out_code_line(int line_num, std::string source) {
 inline void print_with_line_numbers(std::string const& source) {
   int linenum = 1;
   std::stringstream source_ss(source);
+  std::stringstream output_ss;
+  output_ss.imbue(std::locale::classic());
   for (std::string line; std::getline(source_ss, line); ++linenum) {
-    std::cout << std::setfill(' ') << std::setw(3) << linenum << " " << line
+    output_ss << std::setfill(' ') << std::setw(3) << linenum << " " << line
               << std::endl;
   }
+  std::cout << output_ss.str();
 }
 
 inline void print_compile_log(std::string program_name,
@@ -554,7 +557,7 @@ inline bool load_source(
     std::string filename, std::map<std::string, std::string>& sources,
     std::string current_dir = "",
     std::vector<std::string> include_paths = std::vector<std::string>(),
-    file_callback_type file_callback = 0,
+    file_callback_type file_callback = 0, std::string* program_name = nullptr,
     std::map<std::string, std::string>* fullpaths = nullptr,
     bool search_current_dir = true) {
   std::istream* source_stream = 0;
@@ -568,6 +571,9 @@ inline bool load_source(
     string_stream << source;
     source_stream = &string_stream;
   }
+  if (program_name) {
+    *program_name = filename;
+  }
   if (sources.count(filename)) {
     // Already got this one
     return true;
@@ -672,6 +678,8 @@ inline bool load_source(
       // TODO: Handle block comments (currently they cause a compilation error).
       size_t comment_start = line_after_pragma.find("//");
       std::string pragma_args = line_after_pragma.substr(0, comment_start);
+      // handle quote character used in #pragma expression
+      pragma_args = replace_token(pragma_args, "\"", "\\\"");
       std::string comment = comment_start != std::string::npos
                                 ? line_after_pragma.substr(comment_start)
                                 : "";
@@ -682,7 +690,7 @@ inline bool load_source(
     source += line + "\n";
   }
   // HACK TESTING (WAR for cub)
-  // source = "#define cudaDeviceSynchronize() cudaSuccess\n" + source;
+  source = "#define cudaDeviceSynchronize() cudaSuccess\n" + source;
   ////source = "cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }\n" +
   /// source;
 
@@ -690,6 +698,7 @@ inline bool load_source(
   //   of the same header from different paths.
   if (pragma_once) {
     std::stringstream ss;
+    ss.imbue(std::locale::classic());
     ss << std::uppercase << std::hex << std::setw(8) << std::setfill('0')
        << hash;
     std::string include_guard_name = "_JITIFY_INCLUDE_GUARD_" + ss.str() + "\n";
@@ -1385,7 +1394,16 @@ static const char* jitsafe_header_preinclude_h = R"(
 // WAR to allow exceptions to be parsed
 #define try
 #define catch(...)
-)";
+)"
+#if defined(_WIN32) || defined(_WIN64)
+// WAR for NVRTC <= 11.0 not defining _WIN64.
+R"(
+#ifndef _WIN64
+#define _WIN64 1
+#endif
+)"
+#endif
+;
 
 static const char* jitsafe_header_float_h = R"(
 #pragma once
@@ -1403,12 +1421,12 @@ static const char* jitsafe_header_float_h = R"(
 #define DBL_MAX_EXP     1024
 #define FLT_MAX_10_EXP  38
 #define DBL_MAX_10_EXP  308
-#define FLT_MAX         3.4028234e38f 
-#define DBL_MAX         1.7976931348623157e308 
-#define FLT_EPSILON     1.19209289e-7f 
-#define DBL_EPSILON     2.220440492503130e-16 
-#define FLT_MIN         1.1754943e-38f; 
-#define DBL_MIN         2.2250738585072013e-308 
+#define FLT_MAX         3.4028234e38f
+#define DBL_MAX         1.7976931348623157e308
+#define FLT_EPSILON     1.19209289e-7f
+#define DBL_EPSILON     2.220440492503130e-16
+#define FLT_MIN         1.1754943e-38f
+#define DBL_MIN         2.2250738585072013e-308
 #define FLT_ROUNDS      1
 #if defined __cplusplus && __cplusplus >= 201103L
 #define FLT_EVAL_METHOD 0
@@ -1596,14 +1614,28 @@ struct IntegerLimits {
 #endif  // __cplusplus >= 201103L
 	enum {
        is_specialized = true,
-       digits = (Digits == -1) ? (int)(sizeof(T)*8 - (Min != 0)) : Digits,
-       digits10   = (digits * 30103) / 100000,
-       is_signed  = ((T)(-1)<0),
-       is_integer = true,
-       is_exact   = true,
-       radix      = 2,
-       is_bounded = true,
-       is_modulo  = false
+       digits            = (Digits == -1) ? (int)(sizeof(T)*8 - (Min != 0)) : Digits,
+       digits10          = (digits * 30103) / 100000,
+       is_signed         = ((T)(-1)<0),
+       is_integer        = true,
+       is_exact          = true,
+       has_infinity      = false,
+       has_quiet_NaN     = false,
+       has_signaling_NaN = false,
+       has_denorm        = 0,
+       has_denorm_loss   = false,
+       round_style       = 0,
+       is_iec559         = false,
+       is_bounded        = true,
+       is_modulo         = !(is_signed || Max == 1 /*is bool*/),
+       max_digits10      = 0,
+       radix             = 2,
+       min_exponent      = 0,
+       min_exponent10    = 0,
+       max_exponent      = 0,
+       max_exponent10    = 0,
+       tinyness_before   = false,
+       traps             = false
 	};
 };
 } // namespace __jitify_detail
@@ -1910,6 +1942,46 @@ static const char* jitsafe_header_type_traits = R"(
     template<size_t len, size_t alignment> struct aligned_storage { struct type { alignas(alignment) char data[len]; }; };
     template <class T> struct alignment_of : std::integral_constant<size_t,alignof(T)> {};
 
+    template <typename T> struct make_unsigned;
+    template <> struct make_unsigned<signed char>        { typedef unsigned char type; };
+    template <> struct make_unsigned<signed short>       { typedef unsigned short type; };
+    template <> struct make_unsigned<signed int>         { typedef unsigned int type; };
+    template <> struct make_unsigned<signed long>        { typedef unsigned long type; };
+    template <> struct make_unsigned<signed long long>   { typedef unsigned long long type; };
+    template <> struct make_unsigned<unsigned char>      { typedef unsigned char type; };
+    template <> struct make_unsigned<unsigned short>     { typedef unsigned short type; };
+    template <> struct make_unsigned<unsigned int>       { typedef unsigned int type; };
+    template <> struct make_unsigned<unsigned long>      { typedef unsigned long type; };
+    template <> struct make_unsigned<unsigned long long> { typedef unsigned long long type; };
+    template <> struct make_unsigned<char>               { typedef unsigned char type; };
+    #if defined _WIN32 || defined _WIN64
+    template <> struct make_unsigned<wchar_t>            { typedef unsigned short type; };
+    #else
+    template <> struct make_unsigned<wchar_t>            { typedef unsigned int type; };
+    #endif
+
+    template <typename T> struct make_signed;
+    template <> struct make_signed<signed char>        { typedef signed char type; };
+    template <> struct make_signed<signed short>       { typedef signed short type; };
+    template <> struct make_signed<signed int>         { typedef signed int type; };
+    template <> struct make_signed<signed long>        { typedef signed long type; };
+    template <> struct make_signed<signed long long>   { typedef signed long long type; };
+    template <> struct make_signed<unsigned char>      { typedef signed char type; };
+    template <> struct make_signed<unsigned short>     { typedef signed short type; };
+    template <> struct make_signed<unsigned int>       { typedef signed int type; };
+    template <> struct make_signed<unsigned long>      { typedef signed long type; };
+    template <> struct make_signed<unsigned long long> { typedef signed long long type; };
+    template <> struct make_signed<char>               { typedef signed char type; };
+    #if defined _WIN32 || defined _WIN64
+    template <> struct make_signed<wchar_t>            { typedef signed short type; };
+    #else
+    template <> struct make_signed<wchar_t>            { typedef signed int type; };
+    #endif
+
+    #if __cplusplus >= 201703L
+    template< typename... Ts > struct make_void { typedef void type; };
+    template< typename... Ts > using void_t = typename make_void<Ts...>::type;
+    #endif  // __cplusplus >= 201703L
     }  // namespace std
     #endif // c++11
 )";
@@ -1949,8 +2021,8 @@ static const char* jitsafe_header_stdint_h =
     "#define INT8_MIN    SCHAR_MIN\n"
     "#define INT16_MIN   SHRT_MIN\n"
     "#if defined _WIN32 || defined _WIN64\n"
-    "#define WCHAR_MIN   SHRT_MIN\n"
-    "#define WCHAR_MAX   SHRT_MAX\n"
+    "#define WCHAR_MIN   0\n"
+    "#define WCHAR_MAX   USHRT_MAX\n"
     "typedef unsigned long long uintptr_t; //optional\n"
     "#else\n"
     "#define WCHAR_MIN   INT_MIN\n"
@@ -2083,24 +2155,33 @@ static const char* jitsafe_header_sstream =
     "#include <ostream>\n"
     "#include <istream>\n";
 
-static const char* jitsafe_header_utility =
-    "#pragma once\n"
-    "namespace std {\n"
-    "template<class T1, class T2>\n"
-    "struct pair {\n"
-    "	T1 first;\n"
-    "	T2 second;\n"
-    "	inline pair() {}\n"
-    "	inline pair(T1 const& first_, T2 const& second_)\n"
-    "		: first(first_), second(second_) {}\n"
-    "	// TODO: Standard includes many more constructors...\n"
-    "	// TODO: Comparison operators\n"
-    "};\n"
-    "template<class T1, class T2>\n"
-    "pair<T1,T2> make_pair(T1 const& first, T2 const& second) {\n"
-    "	return pair<T1,T2>(first, second);\n"
-    "}\n"
-    "}  // namespace std\n";
+static const char* jitsafe_header_utility = R"(
+    #pragma once
+    namespace std {
+    template<class T1, class T2>
+    struct pair {
+        T1 first;
+        T2 second;
+        inline pair() {}
+        inline pair(T1 const& first_, T2 const& second_): first(first_), second(second_) {}
+        // TODO: Standard includes many more constructors...
+        // TODO: Comparison operators
+    };
+    template<class T1, class T2>
+    pair<T1,T2> make_pair(T1 const& first, T2 const& second) {
+        return pair<T1,T2>(first, second);
+    }
+
+    template<typename T>
+    constexpr bool always_false = false;
+
+    template<typename T>
+    typename std::add_rvalue_reference<T>::type declval() noexcept
+    {
+    static_assert(always_false<T>, "declval not allowed in an evaluated context");
+    }
+    }  // namespace std
+)";
 
 // TODO: incomplete
 static const char* jitsafe_header_vector =
@@ -2340,14 +2421,81 @@ static const char* jitsafe_header_tuple = R"(
     #if __cplusplus >= 201103L
     namespace std {
     template<class... Types > class tuple;
+
+    template< size_t I, class T >
+    struct tuple_element;
+    // recursive case
+    template< size_t I, class Head, class... Tail >
+    struct tuple_element<I, tuple<Head, Tail...>>
+        : tuple_element<I-1, tuple<Tail...>> { };
+    // base case
+    template< class Head, class... Tail >
+    struct tuple_element<0, tuple<Head, Tail...>> {
+      using type = Head;
+    };
     } // namespace std
     #endif
  )";
 
+static const char* jitsafe_header_functional = R"(
+    #pragma once
+    #if __cplusplus >= 201103L
+    namespace std {
+    template<class T>
+    class reference_wrapper
+    {
+    public:
+    // types
+    using type = T;
+    reference_wrapper(const reference_wrapper&) noexcept = default;
+    // assignment
+    reference_wrapper& operator=(const reference_wrapper& x) noexcept = default;
+    // access
+    constexpr operator T& () const noexcept { return *_ptr; }
+    constexpr T& get() const noexcept { return *_ptr; }
+    private:
+    T* _ptr;
+    };
+    } // namespace std
+    #endif
+)";
+
+static const char* jitsafe_header_map = R"(
+    #pragma once
+    namespace std {
+    template<class Key, class T, class Compare = void, class Allocator = void> class map {};
+    } // namespace std
+)";
+
+static const char* jitsafe_header_stack = R"(
+    #pragma once
+    namespace std {
+    template<class T, class = void> class stack {};
+    } // namespace std
+)";
+
+static const char* jitsafe_header_initializer_list = R"(
+    #pragma once
+)";
+
 static const char* jitsafe_header_assert = R"(
     #pragma once
  )";
 
+// Jit-safe replacement for <sys/time.h>: type and function declarations only.
+static const char* jitsafe_header_sys_time = R"(
+    #pragma once
+    struct timeval { unsigned long long tv_sec, tv_usec; };
+    // POSIX keeps it_interval/it_value inside itimerval, not at global scope
+    struct itimerval {
+    struct timeval it_interval, it_value;
+    };
+    int getitimer(int, struct itimerval *);
+    int gettimeofday(struct timeval *, void *);
+    int setitimer(int, const struct itimerval *, struct itimerval *);
+    int utimes(const char *, const struct timeval [2]);
+ )";
+
 // WAR: These need to be pre-included as a workaround for NVRTC implicitly using
 // /usr/include as an include path. The other built-in headers will be included
 // lazily as needed.
@@ -2406,8 +2554,13 @@ static const std::map<std::string, std::string>& get_jitsafe_headers_map() {
       {"time.h", jitsafe_header_time_h},
       {"ctime", jitsafe_header_time_h},
       {"tuple", jitsafe_header_tuple},
+      {"functional", jitsafe_header_functional},
+      {"map", jitsafe_header_map},
+      {"stack", jitsafe_header_stack},
+      {"initializer_list", jitsafe_header_initializer_list},
       {"assert.h", jitsafe_header_assert},
-      {"cassert", jitsafe_header_assert}};
+      {"cassert", jitsafe_header_assert},
+      {"sys/time.h", jitsafe_header_sys_time}};
   return jitsafe_headers_map;
 }
 
@@ -2673,6 +2826,17 @@ inline nvrtcResult compile_kernel(std::string program_name,
       &nvrtc_program, program_source.c_str(), program_name.c_str(), num_headers,
       header_sources_c.data(), header_names_c.data()));
 
+  // Ensure nvrtc_program gets destroyed.
+  struct ScopedNvrtcProgramDestroyer {
+    nvrtcProgram& nvrtc_program_;
+    ScopedNvrtcProgramDestroyer(nvrtcProgram& nvrtc_program)
+        : nvrtc_program_(nvrtc_program) {}
+    ~ScopedNvrtcProgramDestroyer() { nvrtcDestroyProgram(&nvrtc_program_); }
+    ScopedNvrtcProgramDestroyer(const ScopedNvrtcProgramDestroyer&) = delete;
+    ScopedNvrtcProgramDestroyer& operator=(const ScopedNvrtcProgramDestroyer&) =
+        delete;
+  } nvrtc_program_scope_guard{nvrtc_program};
+
 #if CUDA_VERSION >= 8000
   if (!instantiation.empty()) {
     CHECK_NVRTC(nvrtcAddNameExpression(nvrtc_program, instantiation.c_str()));
@@ -2720,7 +2884,6 @@ inline nvrtcResult compile_kernel(std::string program_name,
 #endif
   }
 
-  CHECK_NVRTC(nvrtcDestroyProgram(&nvrtc_program));
 #undef CHECK_NVRTC
   return NVRTC_SUCCESS;
 }
@@ -2746,10 +2909,9 @@ inline void load_program(std::string const& cuda_source,
 
   // Load program source
   if (!detail::load_source(cuda_source, *program_sources, "", *include_paths,
-                           file_callback)) {
+                           file_callback, program_name)) {
     throw std::runtime_error("Source not found: " + cuda_source);
   }
-  *program_name = program_sources->begin()->first;
 
   // Maps header include names to their full file paths.
   std::map<std::string, std::string> header_fullpaths;
@@ -2757,7 +2919,7 @@ inline void load_program(std::string const& cuda_source,
   // Load header sources
   for (std::string const& header : headers) {
     if (!detail::load_source(header, *program_sources, "", *include_paths,
-                             file_callback, &header_fullpaths)) {
+                             file_callback, nullptr, &header_fullpaths)) {
       // **TODO: Deal with source not found
       throw std::runtime_error("Source not found: " + header);
     }
@@ -2816,8 +2978,8 @@ inline void load_program(std::string const& cuda_source,
     std::string include_parent_fullpath = header_fullpaths[include_parent];
     std::string include_path = detail::path_base(include_parent_fullpath);
     if (detail::load_source(include_name, *program_sources, include_path,
-                            *include_paths, file_callback, &header_fullpaths,
-                            is_included_with_quotes)) {
+                            *include_paths, file_callback, nullptr,
+                            &header_fullpaths, is_included_with_quotes)) {
 #if JITIFY_PRINT_HEADER_PATHS
       std::cout << "Found #include " << include_name << " from "
                 << include_parent << ":" << line_num << " ["
@@ -3067,6 +3229,7 @@ class KernelLauncher {
   std::unique_ptr<KernelLauncher_impl const> _impl;
 
  public:
+  KernelLauncher() = default;
   inline KernelLauncher(KernelInstantiation const& kernel_inst, dim3 grid,
                         dim3 block, unsigned int smem = 0,
                         cudaStream_t stream = 0);
@@ -3135,6 +3298,7 @@ class KernelInstantiation {
   std::unique_ptr<KernelInstantiation_impl const> _impl;
 
  public:
+  KernelInstantiation() = default;
   inline KernelInstantiation(Kernel const& kernel,
                              std::vector<std::string> const& template_args);
 
@@ -3282,6 +3446,7 @@ class Kernel {
   std::unique_ptr<Kernel_impl const> _impl;
 
  public:
+  Kernel() = default;
   Kernel(Program const& program, std::string name,
          jitify::detail::vector<std::string> options = 0);
 
@@ -3346,6 +3511,7 @@ class Program {
   std::unique_ptr<Program_impl const> _impl;
 
  public:
+  Program() = default;
   Program(JitCache& cache, std::string source,
           jitify::detail::vector<std::string> headers = 0,
           jitify::detail::vector<std::string> options = 0,

From c42794fbe74b4e80ad08b5c2b3f1c755f7252a55 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 31 May 2023 22:25:07 -0700
Subject: [PATCH 24/60] Rename ColorSpinorField/CloverField::V methods to data,
 with an optional template cast type

---
 include/blas_helper.cuh                       |   6 +-
 include/blas_quda.h                           |   2 +-
 include/clover_field.h                        |   5 +-
 include/clover_field_order.h                  |  16 +-
 include/color_spinor_field.h                  |   4 +-
 include/color_spinor_field_order.h            |  17 +-
 include/dslash_helper.cuh                     |  12 +-
 include/kernels/covDev.cuh                    |   8 +-
 include/kernels/dslash_staggered.cuh          |   8 +-
 include/kernels/dslash_wilson.cuh             |   8 +-
 include/kernels/laplace.cuh                   |   9 +-
 .../staggered_kd_apply_xinv_kernel.cuh        |   2 +-
 include/kernels/staggered_quark_smearing.cuh  |   9 +-
 lib/block_orthogonalize.in.cu                 |   2 +-
 lib/clover_field.cpp                          |   8 +-
 lib/coarse_op.in.cu                           |   2 +-
 lib/color_spinor_field.cpp                    |  12 +-
 lib/color_spinor_util.in.cu                   |   2 +-
 lib/copy_clover_offset.cu                     |   4 +-
 lib/copy_color_spinor_mg.in.hpp               |   4 +-
 lib/dirac.cpp                                 |   2 +-
 lib/dslash_clover_helper.cu                   |   8 +-
 lib/dslash_coarse.hpp                         |   2 +-
 lib/dslash_gamma_helper.cu                    |   4 +-
 ..._clover_hasenbusch_twist_preconditioned.cu |  10 -
 lib/interface_quda.cpp                        |   4 +-
 lib/inv_gmresdr_quda.cpp                      |   2 +-
 lib/inv_mr_quda.cpp                           |   2 +-
 lib/max_clover.cu                             |   2 +-
 lib/multi_reduce_quda.cu                      |   2 +-
 lib/staggered_kd_apply_xinv.cu                |   2 +-
 lib/vector_io.cpp                             |   4 +-
 tests/dslash_test_utils.h                     | 204 +++++++++---------
 tests/eigensolve_test.cpp                     |   6 +-
 tests/host_reference/covdev_reference.cpp     |  22 +-
 tests/host_reference/dslash_reference.cpp     |  12 +-
 .../staggered_dslash_reference.cpp            |  20 +-
 tests/invert_test.cpp                         |  12 +-
 tests/staggered_dslash_test_utils.h           |   8 +-
 tests/staggered_gsmear_test_utils.h           |  10 +-
 tests/staggered_invert_test.cpp               |  10 +-
 tests/utils/staggered_host_utils.cpp          |   4 +-
 42 files changed, 228 insertions(+), 264 deletions(-)

diff --git a/include/blas_helper.cuh b/include/blas_helper.cuh
index e2617e879a..80e974c0c9 100644
--- a/include/blas_helper.cuh
+++ b/include/blas_helper.cuh
@@ -111,7 +111,7 @@ namespace quda
       {}
 
       data_t(const ColorSpinorField &x) :
-        spinor(static_cast<store_t *>(const_cast<ColorSpinorField &>(x).V())),
+        spinor(x.data<store_t *>()),
         stride(x.VolumeCB()),
         cb_offset(x.Bytes() / (2 * sizeof(store_t) * N))
       {}
@@ -141,8 +141,8 @@ namespace quda
       {}
 
       data_t(const ColorSpinorField &x) :
-        spinor(static_cast<store_t *>(const_cast<ColorSpinorField &>(x).V())),
-        norm(static_cast<norm_t *>(const_cast<ColorSpinorField &>(x).Norm())),
+        spinor(x.data<store_t *>()),
+        norm(static_cast<norm_t *>(x.Norm())),
         stride(x.VolumeCB()),
         cb_offset(x.Bytes() / (2 * sizeof(store_t) * N)),
         cb_norm_offset(x.Bytes() / (2 * sizeof(norm_t)))
diff --git a/include/blas_quda.h b/include/blas_quda.h
index 3fc051d3ff..8df40df452 100644
--- a/include/blas_quda.h
+++ b/include/blas_quda.h
@@ -33,7 +33,7 @@ namespace quda {
 
     inline void copy(ColorSpinorField &dst, const ColorSpinorField &src)
     {
-      if (dst.V() == src.V()) {
+      if (dst.data() == src.data()) {
         // check the fields are equivalent else error
         if (ColorSpinorField::are_compatible(dst, src))
           return;
diff --git a/include/clover_field.h b/include/clover_field.h
index 579e7eeb1e..380a399492 100644
--- a/include/clover_field.h
+++ b/include/clover_field.h
@@ -217,7 +217,10 @@ namespace quda {
 
     static CloverField *Create(const CloverFieldParam &param);
 
-    void *V(bool inverse = false) const { return inverse ? cloverInv.data() : clover.data(); }
+    template <typename T = void *> auto data(bool inverse = false) const
+    {
+      return inverse ? reinterpret_cast<T>(cloverInv.data()) : reinterpret_cast<T>(clover.data());
+    }
 
     /**
        @return whether the inverse is explicitly been allocated
diff --git a/include/clover_field_order.h b/include/clover_field_order.h
index 1464a02629..05b77eee63 100644
--- a/include/clover_field_order.h
+++ b/include/clover_field_order.h
@@ -312,7 +312,7 @@ namespace quda {
       static constexpr int N = nColor * nSpin / 2;
       reconstruct_t<Float, N * N, clover::reconstruct()> recon;
       FloatNAccessor(const CloverField &A, bool inverse = false) :
-        a(static_cast<Float *>(const_cast<void *>(A.V(inverse)))),
+        a(A.data<Float *>(inverse)),
         stride(A.VolumeCB()),
         offset_cb(A.Bytes() / (2 * sizeof(Float))),
         compressed_block_size(A.compressed_block_size()),
@@ -403,9 +403,7 @@ namespace quda {
       const int N = nSpin * nColor / 2;
       const complex<Float> zero;
       Accessor(const CloverField &A, bool inverse = false) :
-        a(static_cast<Float *>(const_cast<void *>(A.V(inverse)))),
-        offset_cb(A.Bytes() / (2 * sizeof(Float))),
-        zero(complex<Float>(0.0, 0.0))
+        a(A.data<Float *>(inverse)), offset_cb(A.Bytes() / (2 * sizeof(Float))), zero(complex<Float>(0.0, 0.0))
       {
       }
 
@@ -639,7 +637,7 @@ namespace quda {
           if (clover.max_element(is_inverse) == 0.0 && isFixed<Float>::value)
             errorQuda("%p max_element(%d) appears unset", &clover, is_inverse);
           if (clover.Diagonal() == 0.0 && clover.Reconstruct()) errorQuda("%p diagonal appears unset", &clover);
-          this->clover = clover_ ? clover_ : (Float *)(clover.V(is_inverse));
+          this->clover = clover_ ? clover_ : clover.data<Float *>(is_inverse);
         }
 
         QudaTwistFlavorType TwistFlavor() const { return twist_flavor; }
@@ -844,7 +842,7 @@ namespace quda {
           if (clover.Order() != QUDA_PACKED_CLOVER_ORDER) {
             errorQuda("Invalid clover order %d for this accessor", clover.Order());
           }
-          this->clover = clover_ ? clover_ : (Float *)(clover.V(inverse));
+          this->clover = clover_ ? clover_ : clover.data<Float *>(inverse);
         }
 
         QudaTwistFlavorType TwistFlavor() const { return twist_flavor; }
@@ -892,8 +890,8 @@ namespace quda {
           if (clover.Order() != QUDA_QDPJIT_CLOVER_ORDER) {
             errorQuda("Invalid clover order %d for this accessor", clover.Order());
           }
-          offdiag = clover_ ? ((Float **)clover_)[0] : ((Float **)clover.V(inverse))[0];
-          diag = clover_ ? ((Float **)clover_)[1] : ((Float **)clover.V(inverse))[1];
+          offdiag = clover_ ? ((Float **)clover_)[0] : clover.data<Float **>(inverse)[0];
+          diag = clover_ ? ((Float **)clover_)[1] : clover.data<Float **>(inverse)[1];
         }
 
         QudaTwistFlavorType TwistFlavor() const { return twist_flavor; }
@@ -970,7 +968,7 @@ namespace quda {
           if (clover.Order() != QUDA_BQCD_CLOVER_ORDER) {
             errorQuda("Invalid clover order %d for this accessor", clover.Order());
           }
-          this->clover[0] = clover_ ? clover_ : (Float *)(clover.V(inverse));
+          this->clover[0] = clover_ ? clover_ : clover.data<Float *>(inverse);
           this->clover[1] = (Float *)((char *)this->clover[0] + clover.Bytes() / 2);
         }
 
diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index 8186425d1c..1bb81a450d 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -462,10 +462,10 @@ namespace quda
     /**
        @brief Return pointer to the field allocation
     */
-    void *V() const
+    template <typename T = void *> auto data() const
     {
       if (ghost_only) errorQuda("Not defined for ghost-only field");
-      return v.data();
+      return reinterpret_cast<T>(v.data());
     }
 
     /**
diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index 48b8d20a62..dab488931f 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -861,14 +861,13 @@ namespace quda
       FieldOrderCB(const ColorSpinorField &field, int nFace = 1, void *const v_ = 0, void *const *ghost_ = 0) :
         GhostOrder(field, nFace, ghost_), volumeCB(field.VolumeCB()), accessor(field)
       {
-        v.v = v_ ? static_cast<complex<storeFloat> *>(const_cast<void *>(v_)) :
-                   static_cast<complex<storeFloat> *>(const_cast<void *>(field.V()));
+        v.v = v_ ? static_cast<complex<storeFloat> *>(const_cast<void *>(v_)) : field.data<complex<storeFloat> *>();
         resetScale(field.Scale());
 
         if constexpr (fixed && block_float) {
           if constexpr (nColor == 3 && nSpin == 1 && nVec == 1 && order == 2)
             // special case where the norm is packed into the per site struct
-            v.norm = reinterpret_cast<norm_t *>(const_cast<void *>(field.V()));
+            v.norm = field.data<norm_t *>();
           else
             v.norm = static_cast<norm_t *>(const_cast<void *>(field.Norm()));
           v.norm_offset = field.Bytes() / (2 * sizeof(norm_t));
@@ -1072,7 +1071,7 @@ namespace quda
       size_t bytes;
 
       FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) :
-        field(buffer ? buffer : (Float *)a.V()),
+        field(buffer ? buffer : a.data<Float *>()),
         norm(buffer ? reinterpret_cast<norm_type *>(reinterpret_cast<char *>(buffer) + a.NormOffset()) :
                       const_cast<norm_type *>(reinterpret_cast<const norm_type *>(a.Norm()))),
         offset(a.Bytes() / (2 * sizeof(Float) * N)),
@@ -1300,7 +1299,7 @@ namespace quda
       size_t bytes;
 
       FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) :
-        field(buffer ? buffer : (Float *)a.V()),
+        field(buffer ? buffer : a.data<Float *>()),
         offset(a.Bytes() / (2 * sizeof(Vector))),
         volumeCB(a.VolumeCB()),
         nParity(a.SiteSubset()),
@@ -1489,7 +1488,7 @@ namespace quda
       int faceVolumeCB[4];
       int nParity;
       SpaceColorSpinorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) :
-        field(field_ ? field_ : (Float *)a.V()),
+        field(field_ ? field_ : a.data<Float *>()),
         offset(a.Bytes() / (2 * sizeof(Float))),
         volumeCB(a.VolumeCB()),
         nParity(a.SiteSubset())
@@ -1573,7 +1572,7 @@ namespace quda
       int faceVolumeCB[4];
       int nParity;
       SpaceSpinorColorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) :
-        field(field_ ? field_ : (Float *)a.V()),
+        field(field_ ? field_ : a.data<Float *>()),
         offset(a.Bytes() / (2 * sizeof(Float))),
         volumeCB(a.VolumeCB()),
         nParity(a.SiteSubset())
@@ -1652,7 +1651,7 @@ namespace quda
       int exDim[4]; // full field dimensions
       PaddedSpaceSpinorColorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0,
                                   Float **ghost_ = 0) :
-        field(field_ ? field_ : (Float *)a.V()),
+        field(field_ ? field_ : a.data<Float *>()),
         volumeCB(a.VolumeCB()),
         exVolumeCB(1),
         nParity(a.SiteSubset()),
@@ -1747,7 +1746,7 @@ namespace quda
       int volumeCB;
       int nParity;
       QDPJITDiracOrder(const ColorSpinorField &a, int = 1, Float *field_ = 0, float * = 0) :
-        field(field_ ? field_ : (Float *)a.V()), volumeCB(a.VolumeCB()), nParity(a.SiteSubset())
+        field(field_ ? field_ : a.data<Float *>()), volumeCB(a.VolumeCB()), nParity(a.SiteSubset())
       {
       }
 
diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index 836b474cf0..e67582b682 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -305,8 +305,8 @@ namespace quda
 #endif
 
     // constructor needed for staggered to set xpay from derived class
-    DslashArg(const ColorSpinorField &in, const GaugeField &U, int parity, bool dagger, bool xpay, int nFace,
-              int spin_project, const int *comm_override,
+    DslashArg(const ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, const ColorSpinorField &x,
+              int parity, bool dagger, bool xpay, int nFace, int spin_project, const int *comm_override,
 #ifdef NVSHMEM_COMMS
               int shmem_ = 0) :
 #else
@@ -348,8 +348,14 @@ namespace quda
       retcount_intra(dslash::get_shmem_retcount_intra()),
       retcount_inter(dslash::get_shmem_retcount_inter())
 #endif
-
     {
+      if (in.data() == out.data()) errorQuda("Aliasing pointers");
+      checkOrder(out, in, x);        // check all orders match
+      checkPrecision(out, in, x, U); // check all precisions match
+      checkLocation(out, in, x, U);  // check all locations match
+      if (!in.isNative() || !U.isNative())
+        errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
+
       for (int d = 0; d < 4; d++) {
         commDim[d] = (comm_override[d] == 0) ? 0 : comm_dim_partitioned(d);
       }
diff --git a/include/kernels/covDev.cuh b/include/kernels/covDev.cuh
index b86e989bf7..28c52e9b38 100644
--- a/include/kernels/covDev.cuh
+++ b/include/kernels/covDev.cuh
@@ -37,19 +37,13 @@ namespace quda
 
     CovDevArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int mu, int parity, bool dagger,
               const int *comm_override) :
-      DslashArg<Float, nDim>(in, U, parity, dagger, false, 1, spin_project, comm_override),
+      DslashArg<Float, nDim>(out, in, U, in, parity, dagger, false, 1, spin_project, comm_override),
       out(out),
       in(in),
       in_pack(in),
       U(U),
       mu(mu)
     {
-      if (in.V() == out.V()) errorQuda("Aliasing pointers");
-      checkOrder(out, in);        // check all orders match
-      checkPrecision(out, in, U); // check all precisions match
-      checkLocation(out, in, U);  // check all locations match
-      if (!out.isNative() || !in.isNative() || !U.isNative())
-        errorQuda("Unsupported field order colorspinor(in)=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
     }
   };
 
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 8f772165bf..deb38455f8 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -51,7 +51,7 @@ namespace quda
 
     StaggeredArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, const GaugeField &L, double a,
                  const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) :
-      DslashArg<Float, nDim>(in, U, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, spin_project,
+      DslashArg<Float, nDim>(out, in, U, x, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, spin_project,
                              comm_override),
       out(out),
       in(in, improved_ ? 3 : 1),
@@ -65,12 +65,6 @@ namespace quda
       is_last_time_slice(comm_coord(3) == comm_dim(3) - 1 ? true : false),
       dagger_scale(dagger ? static_cast<real>(-1.0) : static_cast<real>(1.0))
     {
-      if (in.V() == out.V()) errorQuda("Aliasing pointers");
-      checkOrder(out, in, x);        // check all orders match
-      checkPrecision(out, in, x, U); // check all precisions match
-      checkLocation(out, in, x, U);  // check all locations match
-      if (!in.isNative() || !U.isNative())
-        errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
     }
   };
 
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index cd7575974a..f87e8f9865 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -38,7 +38,7 @@ namespace quda
 
     WilsonArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a,
               const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) :
-      DslashArg<Float, nDim>(in, U, parity, dagger, a != 0.0 ? true : false, 1, spin_project, comm_override),
+      DslashArg<Float, nDim>(out, in, U, x, parity, dagger, a != 0.0 ? true : false, 1, spin_project, comm_override),
       out(out),
       in(in),
       in_pack(in),
@@ -46,12 +46,6 @@ namespace quda
       U(U),
       a(a)
     {
-      if (in.V() == out.V()) errorQuda("Aliasing pointers");
-      checkOrder(out, in, x);        // check all orders match
-      checkPrecision(out, in, x, U); // check all precisions match
-      checkLocation(out, in, x, U);  // check all locations match
-      if (!in.isNative() || !U.isNative())
-        errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
     }
   };
 
diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh
index ac09ddc5ed..a029242210 100644
--- a/include/kernels/laplace.cuh
+++ b/include/kernels/laplace.cuh
@@ -40,8 +40,7 @@ namespace quda
 
     LaplaceArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, double a, double b,
                const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) :
-
-      DslashArg<Float, nDim>(in, U, parity, dagger, a != 0.0 ? true : false, 1, false, comm_override),
+      DslashArg<Float, nDim>(out, in, U, x, parity, dagger, a != 0.0 ? true : false, 1, false, comm_override),
       out(out),
       in(in),
       in_pack(in),
@@ -51,12 +50,6 @@ namespace quda
       b(b),
       dir(dir)
     {
-      if (in.V() == out.V()) errorQuda("Aliasing pointers");
-      checkOrder(out, in, x);        // check all orders match
-      checkPrecision(out, in, x, U); // check all precisions match
-      checkLocation(out, in, x, U);  // check all locations match
-      if (!in.isNative() || !U.isNative())
-        errorQuda("Unsupported field order colorspinor(in)=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
       if (dir < 3 || dir > 4) errorQuda("Unsupported laplace direction %d (must be 3 or 4)", dir);
     }
   };
diff --git a/include/kernels/staggered_kd_apply_xinv_kernel.cuh b/include/kernels/staggered_kd_apply_xinv_kernel.cuh
index bbe8b70166..f5b137486f 100644
--- a/include/kernels/staggered_kd_apply_xinv_kernel.cuh
+++ b/include/kernels/staggered_kd_apply_xinv_kernel.cuh
@@ -39,7 +39,7 @@ namespace quda {
       X0h(out.X()[0]/2),
       volumeCB(in.VolumeCB())
     {
-      if (in.V() == out.V()) errorQuda("Aliasing pointers");
+      if (in.data() == out.data()) errorQuda("Aliasing pointers");
       checkOrder(out, in); // check all orders match
       checkPrecision(out, in, xInv); // check all precisions match
       checkLocation(out, in, xInv);
diff --git a/include/kernels/staggered_quark_smearing.cuh b/include/kernels/staggered_quark_smearing.cuh
index 2fdb42f17a..9f4db096e8 100644
--- a/include/kernels/staggered_quark_smearing.cuh
+++ b/include/kernels/staggered_quark_smearing.cuh
@@ -45,8 +45,7 @@ namespace quda
 
     StaggeredQSmearArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int t0,
                        bool is_t0_kernel, int parity, int dir, bool dagger, const int *comm_override) :
-
-      DslashArg<Float, nDim>(in, U, parity, dagger, false, 3, false, comm_override),
+      DslashArg<Float, nDim>(out, in, U, in, parity, dagger, false, 3, false, comm_override),
       out(out, 3),
       in(in, 3),
       in_pack(in, 3),
@@ -56,12 +55,6 @@ namespace quda
       is_t0_kernel(is_t0_kernel),
       t0_offset(is_t0_kernel ? in.VolumeCB() / in.X(3) : 0)
     {
-      if (in.V() == out.V()) errorQuda("Aliasing pointers");
-      checkOrder(out, in);        // check all orders match
-      checkPrecision(out, in, U); // check all precisions match
-      checkLocation(out, in, U);  // check all locations match
-      if (!in.isNative() || !U.isNative())
-        errorQuda("Unsupported field order colorspinor(in)=%d gauge=%d combination", in.FieldOrder(), U.FieldOrder());
       if (dir < 3 || dir > 4) errorQuda("Unsupported laplace direction %d (must be 3 or 4)", dir);
 
       for (int i = 0; i < 4; i++) {
diff --git a/lib/block_orthogonalize.in.cu b/lib/block_orthogonalize.in.cu
index 27b7d68f22..64651fb55f 100644
--- a/lib/block_orthogonalize.in.cu
+++ b/lib/block_orthogonalize.in.cu
@@ -278,7 +278,7 @@ namespace quda {
                 QUDA_PRECISION, V.Precision(), B[0]->Precision());
 
     if constexpr (is_enabled_multigrid()) {
-      if (B[0]->V() == nullptr) {
+      if (B[0]->data() == nullptr) {
         warningQuda("Trying to BlockOrthogonalize staggered transform, skipping...");
         return;
       }
diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp
index 2727069224..e91600b6c3 100644
--- a/lib/clover_field.cpp
+++ b/lib/clover_field.cpp
@@ -170,7 +170,7 @@ namespace quda {
     if (is_inverse && !src.Inverse() && !dynamic_inverse_copy)
       errorQuda("Source field's is_inverse=%d component does not exist", is_inverse);
 
-    auto src_v = dynamic_inverse_copy ? src.V(false) : src.V(is_inverse);
+    auto src_v = dynamic_inverse_copy ? src.data(false) : src.data(is_inverse);
 
     // if we copying to a reconstruction field, we must find the overall scale factor to allow us to reconstruct
     if (Reconstruct()) {
@@ -192,7 +192,7 @@ namespace quda {
         void *packClover = pool_pinned_malloc(bytes);
 
         copyGenericClover(*this, src, is_inverse, QUDA_CPU_FIELD_LOCATION, packClover, src_v);
-        qudaMemcpy(V(is_inverse), packClover, bytes, qudaMemcpyHostToDevice);
+        qudaMemcpy(data(is_inverse), packClover, bytes, qudaMemcpyHostToDevice);
 
         pool_pinned_free(packClover);
       } else if (reorder_location() == QUDA_CUDA_FIELD_LOCATION && src.Location() == QUDA_CPU_FIELD_LOCATION) {
@@ -217,7 +217,7 @@ namespace quda {
         void *packClover = pool_device_malloc(bytes);
 
         copyGenericClover(*this, src, is_inverse, QUDA_CUDA_FIELD_LOCATION, packClover, src_v);
-        qudaMemcpy(V(is_inverse), packClover, bytes, qudaMemcpyDeviceToHost);
+        qudaMemcpy(data(is_inverse), packClover, bytes, qudaMemcpyDeviceToHost);
 
         pool_device_free(packClover);
       }
@@ -331,7 +331,7 @@ namespace quda {
     spinor_param.fieldOrder = colorspinor::getNative(a.Precision(), a.Nspin());
     spinor_param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
     spinor_param.create = QUDA_REFERENCE_FIELD_CREATE;
-    spinor_param.v = a.V(inverse);
+    spinor_param.v = a.data(inverse);
     spinor_param.location = a.Location();
     return spinor_param;
   }
diff --git a/lib/coarse_op.in.cu b/lib/coarse_op.in.cu
index 358c3ba0b9..259da32c98 100644
--- a/lib/coarse_op.in.cu
+++ b/lib/coarse_op.in.cu
@@ -197,7 +197,7 @@ namespace quda {
     for (int i = 0; i < cf_param.nDim; i++) cf_param.x[i] = clover ? clover->X()[i] : 0;
 
     // only create inverse if not doing dynamic clover and one already exists
-    cf_param.inverse = !clover::dynamic_inverse() && clover && clover->V(true);
+    cf_param.inverse = !clover::dynamic_inverse() && clover && clover->Inverse();
     cf_param.clover = nullptr;
     cf_param.cloverInv = nullptr;
     cf_param.create = QUDA_NULL_FIELD_CREATE;
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index a40f191712..b26897948b 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -428,17 +428,17 @@ namespace quda
 
         if (src.FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) {
           // special case where we use mapped memory to read/write directly from application's array
-          void *src_d = get_mapped_device_pointer(src.V());
+          void *src_d = get_mapped_device_pointer(src.data());
           copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, v.data(), src_d);
         } else {
           void *Src = nullptr, *buffer = nullptr;
           if (!zeroCopy) {
             buffer = pool_device_malloc(src.Bytes());
             Src = buffer;
-            qudaMemcpy(Src, src.V(), src.Bytes(), qudaMemcpyDefault);
+            qudaMemcpy(Src, src.data(), src.Bytes(), qudaMemcpyDefault);
           } else {
             buffer = pool_pinned_malloc(src.Bytes());
-            memcpy(buffer, src.V(), src.Bytes());
+            memcpy(buffer, src.data(), src.Bytes());
             Src = get_mapped_device_pointer(buffer);
           }
 
@@ -465,7 +465,7 @@ namespace quda
         if (FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) {
           // special case where we use zero-copy memory to read/write directly from application's array
           void *dest_d = get_mapped_device_pointer(v.data());
-          copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, dest_d, src.V());
+          copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, dest_d, src.data());
         } else {
           void *dst = nullptr, *buffer = nullptr;
           if (!zeroCopy) {
@@ -837,7 +837,7 @@ namespace quda
       errorQuda("Cannot create an alias to source with lower precision than the alias");
     ColorSpinorParam param = param_.init ? param_ : ColorSpinorParam(*this);
     param.create = QUDA_REFERENCE_FIELD_CREATE;
-    param.v = V();
+    param.v = data();
 
     return ColorSpinorField(param);
   }
@@ -848,7 +848,7 @@ namespace quda
       errorQuda("Cannot create an alias to source with lower precision than the alias");
     ColorSpinorParam param(param_);
     param.create = QUDA_REFERENCE_FIELD_CREATE;
-    param.v = V();
+    param.v = data();
 
     return new ColorSpinorField(param);
   }
diff --git a/lib/color_spinor_util.in.cu b/lib/color_spinor_util.in.cu
index 3681438c9f..b018bc0e3f 100644
--- a/lib/color_spinor_util.in.cu
+++ b/lib/color_spinor_util.in.cu
@@ -417,7 +417,7 @@ namespace quda {
 
     param.create = create;
     if (create == QUDA_COPY_FIELD_CREATE) param.field = &const_cast<ColorSpinorField&>(src);
-    else if (create == QUDA_REFERENCE_FIELD_CREATE) param.v = const_cast<ColorSpinorField&>(src).V();
+    else if (create == QUDA_REFERENCE_FIELD_CREATE) param.v = src.data();
 
     resize(v, new_size, param);
   }
diff --git a/lib/copy_clover_offset.cu b/lib/copy_clover_offset.cu
index 1300082c24..f29e663c14 100644
--- a/lib/copy_clover_offset.cu
+++ b/lib/copy_clover_offset.cu
@@ -70,8 +70,8 @@ namespace quda
 
     if (pc_type != QUDA_4D_PC) { errorQuda("Gauge field copy must use 4d even-odd preconditioning."); }
 
-    if (in.V(true)) { instantiate<CopyCloverOffset>(out, in, offset, true); }
-    if (in.V(false)) { instantiate<CopyCloverOffset>(out, in, offset, false); }
+    if (in.Inverse()) instantiate<CopyCloverOffset>(out, in, offset, true);
+    instantiate<CopyCloverOffset>(out, in, offset, false);
   }
 #else
   void copyFieldOffset(CloverField &, const CloverField &, CommKey, QudaPCType)
diff --git a/lib/copy_color_spinor_mg.in.hpp b/lib/copy_color_spinor_mg.in.hpp
index a6678143b4..d28ffa4e80 100644
--- a/lib/copy_color_spinor_mg.in.hpp
+++ b/lib/copy_color_spinor_mg.in.hpp
@@ -117,14 +117,14 @@ namespace quda {
       }
 
       // set for the source subset ordering
-      srcFloat *srcEven = Src ? Src : (srcFloat*)src.V();
+      srcFloat *srcEven = Src ? Src : src.data<srcFloat*>();
       srcFloat *srcOdd = (srcFloat*)((char*)srcEven + src.Bytes()/2);
       if (src.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER) {
 	std::swap<srcFloat*>(srcEven, srcOdd);
       }
 
       // set for the destination subset ordering
-      dstFloat *dstEven = Dst ? Dst : (dstFloat*)dst.V();
+      dstFloat *dstEven = Dst ? Dst : dst.data<dstFloat*>();
       dstFloat *dstOdd = (dstFloat*)((char*)dstEven + dst.Bytes()/2);
       if (dst.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER) {
 	std::swap<dstFloat*>(dstEven, dstOdd);
diff --git a/lib/dirac.cpp b/lib/dirac.cpp
index 6e0a5912d3..e7be7cdc6d 100644
--- a/lib/dirac.cpp
+++ b/lib/dirac.cpp
@@ -115,7 +115,7 @@ namespace quda {
   }
 
   void Dirac::checkSpinorAlias(const ColorSpinorField &a, const ColorSpinorField &b) const {
-    if (a.V() == b.V()) errorQuda("Aliasing pointers");
+    if (a.data() == b.data()) errorQuda("Aliasing pointers");
   }
 
   // Dirac operator factory
diff --git a/lib/dslash_clover_helper.cu b/lib/dslash_clover_helper.cu
index 7389394ba1..accc50d31a 100644
--- a/lib/dslash_clover_helper.cu
+++ b/lib/dslash_clover_helper.cu
@@ -35,8 +35,8 @@ namespace quda {
       launch<CloverApply>(tp, stream, CloverArg<Float, nColor>(out, in, clover, parity));
     }
 
-    void preTune() { if (out.V() == in.V()) out.backup(); }  // Backup if in and out fields alias
-    void postTune() { if (out.V() == in.V()) out.restore(); } // Restore if the in and out fields alias
+    void preTune() { if (out.data() == in.data()) out.backup(); }  // Backup if in and out fields alias
+    void postTune() { if (out.data() == in.data()) out.restore(); } // Restore if the in and out fields alias
     long long flops() const { return in.Volume()*504ll; }
     long long bytes() const { return out.Bytes() + in.Bytes() + clover.Bytes() / (3 - in.SiteSubset()); }
   };
@@ -115,8 +115,8 @@ namespace quda {
       }
     }
 
-    void preTune() { if (out.V() == in.V()) out.backup(); } // Restore if the in and out fields alias
-    void postTune() { if (out.V() == in.V()) out.restore(); } // Restore if the in and out fields alias
+    void preTune() { if (out.data() == in.data()) out.backup(); } // Backup if the in and out fields alias
+    void postTune() { if (out.data() == in.data()) out.restore(); } // Restore if the in and out fields alias
     long long flops() const { return (inverse ? 1056ll : 552ll) * in.Volume(); }
     long long bytes() const {
       long long rtn = out.Bytes() + in.Bytes() + clover.Bytes() / (3 - in.SiteSubset());
diff --git a/lib/dslash_coarse.hpp b/lib/dslash_coarse.hpp
index a98290d129..9816a9af0c 100644
--- a/lib/dslash_coarse.hpp
+++ b/lib/dslash_coarse.hpp
@@ -413,7 +413,7 @@ namespace quda {
      */
     inline void operator()(DslashCoarsePolicy policy)
     {
-      if (inA[0].V() == out[0].V()) errorQuda("Aliasing pointers");
+      if (inA[0].data() == out[0].data()) errorQuda("Aliasing pointers");
 
       // check all precisions match
       QudaPrecision precision = checkPrecision(out[0], inA[0], inB[0]);
diff --git a/lib/dslash_gamma_helper.cu b/lib/dslash_gamma_helper.cu
index 4b7ef2458c..2e76504afd 100644
--- a/lib/dslash_gamma_helper.cu
+++ b/lib/dslash_gamma_helper.cu
@@ -74,8 +74,8 @@ namespace quda {
       launch<TwistGamma>(tp, stream, GammaArg<Float, nColor>(out, in, d, kappa, mu, epsilon, dagger, type));
     }
 
-    void preTune() { if (out.V() == in.V()) out.backup(); }
-    void postTune() { if (out.V() == in.V()) out.restore(); }
+    void preTune() { if (out.data() == in.data()) out.backup(); }
+    void postTune() { if (out.data() == in.data()) out.restore(); }
     long long flops() const { return 0; }
     long long bytes() const { return out.Bytes() + in.Bytes(); }
   };
diff --git a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu
index d169f4f0e1..ca8ce572d9 100644
--- a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu
+++ b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu
@@ -140,16 +140,6 @@ namespace quda
                                                    const ColorSpinorField &x, int parity, bool dagger,
                                                    const int *comm_override, TimeProfile &profile)
   {
-    if (in.V() == out.V()) errorQuda("Aliasing pointers");
-    if (in.FieldOrder() != out.FieldOrder())
-      errorQuda("Field order mismatch in = %d, out = %d", in.FieldOrder(), out.FieldOrder());
-
-    // check all precisions match
-    checkPrecision(out, in, U, A);
-
-    // check all locations match
-    checkLocation(out, in, U, A);
-
     instantiate<WilsonCloverHasenbuschTwistPCNoClovInvApply>(out, in, U, A, a, b, x, parity, dagger, comm_override,
                                                              profile);
   }
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 3c32443205..17b6bd4391 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3322,7 +3322,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
         || param->dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) {
       if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading clover field...\n"); }
       if (collected_clover) {
-        loadCloverQuda(collected_clover->V(false), collected_clover->V(true), param);
+        loadCloverQuda(collected_clover->data(false), collected_clover->data(true), param);
       } else {
         loadCloverQuda(nullptr, nullptr, param);
       }
@@ -3330,7 +3330,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     }
 
     for (int n = 0; n < param->num_src_per_sub_partition; n++) {
-      op(_collect_x[n]->V(), _collect_b[n]->V(), param, args...);
+      op(_collect_x[n]->data(), _collect_b[n]->data(), param, args...);
     }
 
     profileInvertMultiSrc.TPSTART(QUDA_PROFILE_TOTAL);
diff --git a/lib/inv_gmresdr_quda.cpp b/lib/inv_gmresdr_quda.cpp
index 62d7685eda..389206853e 100644
--- a/lib/inv_gmresdr_quda.cpp
+++ b/lib/inv_gmresdr_quda.cpp
@@ -282,7 +282,7 @@ namespace quda {
         blas::zero(Vm->Component(i));
     }
 
-    if (Zm->V() != Vm->V()) {
+    if (Zm->data() != Vm->data()) {
       std::vector<ColorSpinorField *> z(Zm->Components());
       std::vector<ColorSpinorField *> vk(args.Vkp1->Components().begin(), args.Vkp1->Components().begin() + args.k);
 
diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp
index 44078ce783..4f636bf279 100644
--- a/lib/inv_mr_quda.cpp
+++ b/lib/inv_mr_quda.cpp
@@ -38,7 +38,7 @@ namespace quda
       bool mixed = param.precision != param.precision_sloppy;
 
       if (!mixed) csParam.create = QUDA_REFERENCE_FIELD_CREATE;
-      csParam.v = r.V();
+      csParam.v = r.data();
       r_sloppy = ColorSpinorField(csParam);
 
       init = true;
diff --git a/lib/max_clover.cu b/lib/max_clover.cu
index 18c84ca7a3..48e9630421 100644
--- a/lib/max_clover.cu
+++ b/lib/max_clover.cu
@@ -50,7 +50,7 @@ namespace quda {
 #ifdef GPU_CLOVER_DIRAC
   double _norm(const CloverField &u, bool inverse, norm_type_ type)
   {
-    if (!u.V(inverse)) errorQuda("reqeusted clover is_inverse=%d, but not allocated", inverse);
+    if (!u.data(inverse)) errorQuda("reqeusted clover is_inverse=%d, but not allocated", inverse);
     double nrm = 0.0;
     switch(u.Precision()) {
     case QUDA_DOUBLE_PRECISION: nrm = _norm<double>(u, inverse, type); break;
diff --git a/lib/multi_reduce_quda.cu b/lib/multi_reduce_quda.cu
index 6af44e8107..f93ab431e4 100644
--- a/lib/multi_reduce_quda.cu
+++ b/lib/multi_reduce_quda.cu
@@ -88,7 +88,7 @@ namespace quda {
         if (NXZ == NYW) {
           is_norm = true;
           for (int i = 0; i < NXZ; i++) {
-            if (x[i].V() != y[i].V() || x[i].V() != z[i].V() || x[i].V() != w[i].V()) {
+            if (x[i].data() != y[i].data() || x[i].data() != z[i].data() || x[i].data() != w[i].data()) {
               is_norm = false;
               break;
             }
diff --git a/lib/staggered_kd_apply_xinv.cu b/lib/staggered_kd_apply_xinv.cu
index 60e9034663..247668cb1c 100644
--- a/lib/staggered_kd_apply_xinv.cu
+++ b/lib/staggered_kd_apply_xinv.cu
@@ -22,7 +22,7 @@ namespace quda {
       Xinv(Xinv),
       dagger(dagger)
     {
-      if (out.V() == in.V()) errorQuda("Spinor fields cannot alias");
+      if (out.data() == in.data()) errorQuda("Spinor fields cannot alias");
       if (in.Nspin() != 1 || out.Nspin() != 1) errorQuda("Unsupported nSpin=%d %d", out.Nspin(), in.Nspin());
       if (Xinv.Geometry() != QUDA_KDINVERSE_GEOMETRY)
         errorQuda("Unsupported gauge geometry %d , expected %d for Xinv", Xinv.Geometry(), QUDA_KDINVERSE_GEOMETRY);
diff --git a/lib/vector_io.cpp b/lib/vector_io.cpp
index 736cc4d84d..52da9b2cb5 100644
--- a/lib/vector_io.cpp
+++ b/lib/vector_io.cpp
@@ -52,7 +52,7 @@ namespace quda
       std::vector<void *> V(Nvec * Ls);
       for (int i = 0; i < Nvec; i++) {
         auto &v = create_tmp ? tmp[i] : vecs[i];
-        for (int j = 0; j < Ls; j++) { V[i * Ls + j] = static_cast<char *>(v.V()) + j * stride; }
+        for (int j = 0; j < Ls; j++) { V[i * Ls + j] = v.data<char *>() + j * stride; }
       }
 
       read_spinor_field(filename.c_str(), V.data(), v0.Precision(), v0.X(), v0.SiteSubset(),
@@ -125,7 +125,7 @@ namespace quda
       std::vector<const void *> V(Nvec * Ls);
       for (int i = 0; i < Nvec; i++) {
         auto &v = create_tmp ? tmp[i] : vecs[i];
-        for (int j = 0; j < Ls; j++) { V[i * Ls + j] = static_cast<const char *>(v.V()) + j * stride; }
+        for (int j = 0; j < Ls; j++) { V[i * Ls + j] = v.data<const char *>() + j * stride; }
       }
 
       write_spinor_field(filename.c_str(), V.data(), save_prec, v0.X(), v0.SiteSubset(),
diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h
index abc1270cb4..8f01594579 100644
--- a/tests/dslash_test_utils.h
+++ b/tests/dslash_test_utils.h
@@ -347,51 +347,51 @@ struct DslashTestWrapper {
     if (dslash_type == QUDA_WILSON_DSLASH) {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        wil_dslash(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, inv_param.cpu_prec, gauge_param);
+        wil_dslash(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatPC:
-        wil_matpc(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger,
+        wil_matpc(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger,
                   inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::Mat:
-        wil_mat(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param);
+        wil_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatPCDagMatPC:
-        wil_matpc(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger,
+        wil_matpc(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger,
                   inv_param.cpu_prec, gauge_param);
-        wil_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.matpc_type, not_dagger,
+        wil_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.matpc_type, not_dagger,
                   inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatDagMat:
-        wil_mat(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-        wil_mat(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, not_dagger, inv_param.cpu_prec, gauge_param);
+        wil_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param);
+        wil_mat(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, not_dagger, inv_param.cpu_prec, gauge_param);
         break;
       default: printfQuda("Test type not defined\n"); exit(-1);
       }
     } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        clover_dslash(spinorRef.V(), hostGauge, hostCloverInv, spinor.V(), parity, inv_param.dagger, inv_param.cpu_prec,
+        clover_dslash(spinorRef.data(), hostGauge, hostCloverInv, spinor.data(), parity, inv_param.dagger, inv_param.cpu_prec,
                       gauge_param);
         break;
       case dslash_test_type::MatPC:
-        clover_matpc(spinorRef.V(), hostGauge, hostClover, hostCloverInv, spinor.V(), inv_param.kappa,
+        clover_matpc(spinorRef.data(), hostGauge, hostClover, hostCloverInv, spinor.data(), inv_param.kappa,
                      inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::Mat:
-        clover_mat(spinorRef.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.dagger,
+        clover_mat(spinorRef.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.dagger,
                    inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatPCDagMatPC:
-        clover_matpc(spinorTmp.V(), hostGauge, hostClover, hostCloverInv, spinor.V(), inv_param.kappa,
+        clover_matpc(spinorTmp.data(), hostGauge, hostClover, hostCloverInv, spinor.data(), inv_param.kappa,
                      inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-        clover_matpc(spinorRef.V(), hostGauge, hostClover, hostCloverInv, spinorTmp.V(), inv_param.kappa,
+        clover_matpc(spinorRef.data(), hostGauge, hostClover, hostCloverInv, spinorTmp.data(), inv_param.kappa,
                      inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatDagMat:
-        clover_mat(spinorTmp.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.dagger,
+        clover_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.dagger,
                    inv_param.cpu_prec, gauge_param);
-        clover_mat(spinorRef.V(), hostGauge, hostClover, spinorTmp.V(), inv_param.kappa, not_dagger, inv_param.cpu_prec,
+        clover_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, not_dagger, inv_param.cpu_prec,
                    gauge_param);
         break;
       default: printfQuda("Test type not defined\n"); exit(-1);
@@ -401,37 +401,37 @@ struct DslashTestWrapper {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
         // My dslash should be the same as the clover dslash
-        clover_dslash(spinorRef.V(), hostGauge, hostCloverInv, spinor.V(), parity, inv_param.dagger, inv_param.cpu_prec,
+        clover_dslash(spinorRef.data(), hostGauge, hostCloverInv, spinor.data(), parity, inv_param.dagger, inv_param.cpu_prec,
                       gauge_param);
         break;
       case dslash_test_type::MatPC:
         // my matpc op
-        cloverHasenbuschTwist_matpc(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa,
+        cloverHasenbuschTwist_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa,
                                     inv_param.mu, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec,
                                     gauge_param);
 
         break;
       case dslash_test_type::Mat:
         // my mat
-        cloverHasenbuchTwist_mat(spinorRef.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu,
+        cloverHasenbuchTwist_mat(spinorRef.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu,
                                  inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.matpc_type);
         break;
       case dslash_test_type::MatPCDagMatPC:
         // matpc^\dagger matpc
         // my matpc op
-        cloverHasenbuschTwist_matpc(spinorTmp.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa,
+        cloverHasenbuschTwist_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa,
                                     inv_param.mu, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec,
                                     gauge_param);
 
-        cloverHasenbuschTwist_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), hostClover, hostCloverInv, inv_param.kappa,
+        cloverHasenbuschTwist_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv, inv_param.kappa,
                                     inv_param.mu, inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param);
 
         break;
       case dslash_test_type::MatDagMat:
         // my mat
-        cloverHasenbuchTwist_mat(spinorTmp.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu,
+        cloverHasenbuchTwist_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu,
                                  inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.matpc_type);
-        cloverHasenbuchTwist_mat(spinorRef.V(), hostGauge, hostClover, spinorTmp.V(), inv_param.kappa, inv_param.mu,
+        cloverHasenbuchTwist_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, inv_param.mu,
                                  not_dagger, inv_param.cpu_prec, gauge_param, inv_param.matpc_type);
 
         break;
@@ -441,54 +441,54 @@ struct DslashTestWrapper {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tm_dslash(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, parity,
+          tm_dslash(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, parity,
                     inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         else {
-          tm_ndeg_dslash(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon, parity,
+          tm_ndeg_dslash(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon, parity,
                          inv_param.dagger, inv_param.matpc_type, inv_param.cpu_prec, gauge_param);
         }
         break;
       case dslash_test_type::MatPC:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tm_matpc(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+          tm_matpc(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
                    inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         else {
-          tm_ndeg_matpc(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+          tm_ndeg_matpc(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
                         inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         }
         break;
       case dslash_test_type::Mat:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tm_mat(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+          tm_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
                  inv_param.dagger, inv_param.cpu_prec, gauge_param);
         else {
-          tm_ndeg_mat(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+          tm_ndeg_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
                       inv_param.dagger, inv_param.cpu_prec, gauge_param);
         }
         break;
       case dslash_test_type::MatPCDagMatPC:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) {
-          tm_matpc(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+          tm_matpc(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
                    inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tm_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+          tm_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
                    inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param);
         } else {
-          tm_ndeg_matpc(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+          tm_ndeg_matpc(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
                         inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tm_ndeg_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+          tm_ndeg_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
                         inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param);
         }
         break;
       case dslash_test_type::MatDagMat:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) {
-          tm_mat(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+          tm_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
                  inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tm_mat(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+          tm_mat(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
                  not_dagger, inv_param.cpu_prec, gauge_param);
         } else {
-          tm_ndeg_mat(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+          tm_ndeg_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
                       inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tm_ndeg_mat(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+          tm_ndeg_mat(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
                       not_dagger, inv_param.cpu_prec, gauge_param);
         }
         break;
@@ -498,54 +498,54 @@ struct DslashTestWrapper {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tmc_dslash(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
+          tmc_dslash(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
                      inv_param.twist_flavor, parity, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec,
                      gauge_param);
         else
-          tmc_ndeg_dslash(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa,
+          tmc_ndeg_dslash(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa,
                           inv_param.mu, inv_param.epsilon, parity, inv_param.matpc_type, inv_param.dagger,
                           inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatPC:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tmc_matpc(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
+          tmc_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
                     inv_param.twist_flavor, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         else
-          tmc_ndeg_matpc(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
+          tmc_ndeg_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
                          inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::Mat:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tmc_mat(spinorRef.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu,
+          tmc_mat(spinorRef.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu,
                   inv_param.twist_flavor, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         else
-          tmc_ndeg_mat(spinorRef.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu,
+          tmc_ndeg_mat(spinorRef.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu,
                        inv_param.epsilon, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatPCDagMatPC:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) {
-          tmc_matpc(spinorTmp.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
+          tmc_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
                     inv_param.twist_flavor, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tmc_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
+          tmc_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
                     inv_param.twist_flavor, inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param);
         } else {
-          tmc_ndeg_matpc(spinorTmp.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
+          tmc_ndeg_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
                          inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tmc_ndeg_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), hostClover, hostCloverInv, inv_param.kappa,
+          tmc_ndeg_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv, inv_param.kappa,
                          inv_param.mu, inv_param.epsilon, inv_param.matpc_type, not_dagger, inv_param.cpu_prec,
                          gauge_param);
         }
         break;
       case dslash_test_type::MatDagMat:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) {
-          tmc_mat(spinorTmp.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu,
+          tmc_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu,
                   inv_param.twist_flavor, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tmc_mat(spinorRef.V(), hostGauge, hostClover, spinorTmp.V(), inv_param.kappa, inv_param.mu,
+          tmc_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, inv_param.mu,
                   inv_param.twist_flavor, not_dagger, inv_param.cpu_prec, gauge_param);
         } else {
-          tmc_ndeg_mat(spinorTmp.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu,
+          tmc_ndeg_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu,
                        inv_param.epsilon, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tmc_ndeg_mat(spinorRef.V(), hostGauge, hostClover, spinorTmp.V(), inv_param.kappa, inv_param.mu,
+          tmc_ndeg_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, inv_param.mu,
                        inv_param.epsilon, not_dagger, inv_param.cpu_prec, gauge_param);
         }
         break;
@@ -554,25 +554,25 @@ struct DslashTestWrapper {
     } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        dw_dslash(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        dw_dslash(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                   inv_param.mass);
         break;
       case dslash_test_type::MatPC:
-        dw_matpc(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.matpc_type, inv_param.dagger,
+        dw_matpc(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger,
                  gauge_param.cpu_prec, gauge_param, inv_param.mass);
         break;
       case dslash_test_type::Mat:
-        dw_mat(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        dw_mat(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                inv_param.mass);
         break;
       case dslash_test_type::MatPCDagMatPC:
-        dw_matpc(spinorTmp.V(), hostGauge, spinor.V(), kappa5, inv_param.matpc_type, inv_param.dagger,
+        dw_matpc(spinorTmp.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger,
                  gauge_param.cpu_prec, gauge_param, inv_param.mass);
-        dw_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), kappa5, inv_param.matpc_type, not_dagger,
+        dw_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), kappa5, inv_param.matpc_type, not_dagger,
                  gauge_param.cpu_prec, gauge_param, inv_param.mass);
         break;
       case dslash_test_type::MatDagMat:
-        dw_matdagmat(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        dw_matdagmat(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                      inv_param.mass);
         break;
       default: printf("Test type not supported for domain wall\n"); exit(-1);
@@ -582,35 +582,35 @@ struct DslashTestWrapper {
       for (int xs = 0; xs < Ls; xs++) kappa_5[xs] = kappa5;
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        dslash_4_4d(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                     inv_param.mass);
         break;
       case dslash_test_type::M5:
-        dw_dslash_5_4d(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec,
+        dw_dslash_5_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
                        gauge_param, inv_param.mass, true);
         break;
       case dslash_test_type::M5inv:
-        dslash_5_inv(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        dslash_5_inv(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                      inv_param.mass, kappa_5);
         break;
       case dslash_test_type::MatPC:
-        dw_4d_matpc(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.matpc_type, inv_param.dagger,
+        dw_4d_matpc(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger,
                     gauge_param.cpu_prec, gauge_param, inv_param.mass);
         break;
       case dslash_test_type::Mat:
-        dw_4d_mat(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        dw_4d_mat(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                   inv_param.mass);
         break;
       case dslash_test_type::MatPCDagMatPC:
-        dw_4d_matpc(spinorTmp.V(), hostGauge, spinor.V(), kappa5, inv_param.matpc_type, inv_param.dagger,
+        dw_4d_matpc(spinorTmp.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger,
                     gauge_param.cpu_prec, gauge_param, inv_param.mass);
-        dw_4d_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), kappa5, inv_param.matpc_type, not_dagger,
+        dw_4d_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), kappa5, inv_param.matpc_type, not_dagger,
                     gauge_param.cpu_prec, gauge_param, inv_param.mass);
         break;
       case dslash_test_type::MatDagMat:
-        dw_4d_mat(spinorTmp.V(), hostGauge, spinor.V(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        dw_4d_mat(spinorTmp.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                   inv_param.mass);
-        dw_4d_mat(spinorRef.V(), hostGauge, spinorTmp.V(), kappa5, not_dagger, gauge_param.cpu_prec, gauge_param,
+        dw_4d_mat(spinorRef.data(), hostGauge, spinorTmp.data(), kappa5, not_dagger, gauge_param.cpu_prec, gauge_param,
                   inv_param.mass);
         break;
       default: printf("Test type not supported for domain wall\n"); exit(-1);
@@ -629,44 +629,44 @@ struct DslashTestWrapper {
       }
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        dslash_4_4d(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                     inv_param.mass);
         break;
       case dslash_test_type::M5:
-        mdw_dslash_5(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        mdw_dslash_5(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                      inv_param.mass, kappa_5, true);
         break;
       case dslash_test_type::Dslash4pre:
-        mdw_dslash_4_pre(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec,
+        mdw_dslash_4_pre(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
                          gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5, true);
         break;
       case dslash_test_type::M5inv:
-        mdw_dslash_5_inv(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec,
+        mdw_dslash_5_inv(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
                          gauge_param, inv_param.mass, kappa_mdwf);
         break;
       case dslash_test_type::MatPC:
-        mdw_matpc(spinorRef.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.matpc_type, inv_param.dagger,
+        mdw_matpc(spinorRef.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.matpc_type, inv_param.dagger,
                   gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
         break;
       case dslash_test_type::Mat:
-        mdw_mat(spinorRef.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.dagger, gauge_param.cpu_prec,
+        mdw_mat(spinorRef.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.dagger, gauge_param.cpu_prec,
                 gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
         break;
       case dslash_test_type::MatPCDagMatPC:
-        mdw_matpc(spinorTmp.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.matpc_type, inv_param.dagger,
+        mdw_matpc(spinorTmp.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.matpc_type, inv_param.dagger,
                   gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
-        mdw_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), kappa_b, kappa_c, inv_param.matpc_type, not_dagger,
+        mdw_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), kappa_b, kappa_c, inv_param.matpc_type, not_dagger,
                   gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
         break;
       case dslash_test_type::MatDagMat:
-        mdw_mat(spinorTmp.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.dagger, gauge_param.cpu_prec,
+        mdw_mat(spinorTmp.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.dagger, gauge_param.cpu_prec,
                 gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
-        mdw_mat(spinorRef.V(), hostGauge, spinorTmp.V(), kappa_b, kappa_c, not_dagger, gauge_param.cpu_prec,
+        mdw_mat(spinorRef.data(), hostGauge, spinorTmp.data(), kappa_b, kappa_c, not_dagger, gauge_param.cpu_prec,
                 gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
         break;
       case dslash_test_type::MatPCDagMatPCLocal:
         // reference for MdagM local operator
-        mdw_mdagm_local(spinorRef.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.matpc_type,
+        mdw_mdagm_local(spinorRef.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.matpc_type,
                         gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
         break;
       default: printf("Test type not supported for Mobius domain wall\n"); exit(-1);
@@ -688,48 +688,48 @@ struct DslashTestWrapper {
       }
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        dslash_4_4d(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                     inv_param.mass);
         break;
       case dslash_test_type::M5:
-        mdw_eofa_m5(spinorRef.V(), spinor.V(), parity, inv_param.dagger, inv_param.mass, inv_param.m5,
+        mdw_eofa_m5(spinorRef.data(), spinor.data(), parity, inv_param.dagger, inv_param.mass, inv_param.m5,
                     (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2,
                     inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift, gauge_param.cpu_prec);
         break;
       case dslash_test_type::Dslash4pre:
-        mdw_dslash_4_pre(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec,
+        mdw_dslash_4_pre(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
                          gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5, true);
         break;
       case dslash_test_type::M5inv:
-        mdw_eofa_m5inv(spinorRef.V(), spinor.V(), parity, inv_param.dagger, inv_param.mass, inv_param.m5,
+        mdw_eofa_m5inv(spinorRef.data(), spinor.data(), parity, inv_param.dagger, inv_param.mass, inv_param.m5,
                        (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2,
                        inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift, gauge_param.cpu_prec);
         break;
       case dslash_test_type::Mat:
-        mdw_eofa_mat(spinorRef.V(), hostGauge, spinor.V(), inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        mdw_eofa_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                      inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]),
                      inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift);
         break;
       case dslash_test_type::MatDagMat:
-        mdw_eofa_mat(spinorTmp.V(), hostGauge, spinor.V(), inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        mdw_eofa_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                      inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]),
                      inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift);
-        mdw_eofa_mat(spinorRef.V(), hostGauge, spinorTmp.V(), not_dagger, gauge_param.cpu_prec, gauge_param,
+        mdw_eofa_mat(spinorRef.data(), hostGauge, spinorTmp.data(), not_dagger, gauge_param.cpu_prec, gauge_param,
                      inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]),
                      inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift);
         break;
       case dslash_test_type::MatPC:
-        mdw_eofa_matpc(spinorRef.V(), hostGauge, spinor.V(), inv_param.matpc_type, inv_param.dagger,
+        mdw_eofa_matpc(spinorRef.data(), hostGauge, spinor.data(), inv_param.matpc_type, inv_param.dagger,
                        gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]),
                        (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm,
                        inv_param.eofa_shift);
         break;
       case dslash_test_type::MatPCDagMatPC:
-        mdw_eofa_matpc(spinorTmp.V(), hostGauge, spinor.V(), inv_param.matpc_type, inv_param.dagger,
+        mdw_eofa_matpc(spinorTmp.data(), hostGauge, spinor.data(), inv_param.matpc_type, inv_param.dagger,
                        gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]),
                        (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm,
                        inv_param.eofa_shift);
-        mdw_eofa_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.matpc_type, not_dagger, gauge_param.cpu_prec,
+        mdw_eofa_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.matpc_type, not_dagger, gauge_param.cpu_prec,
                        gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]),
                        (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm,
                        inv_param.eofa_shift);
@@ -764,8 +764,8 @@ struct DslashTestWrapper {
       std::vector<void *> _hp_x(inv_param.num_src);
       std::vector<void *> _hp_b(inv_param.num_src);
       for (int i = 0; i < inv_param.num_src; i++) {
-        _hp_x[i] = vp_spinorOut[i].V();
-        _hp_b[i] = vp_spinor[i].V();
+        _hp_x[i] = vp_spinorOut[i].data();
+        _hp_b[i] = vp_spinor[i].data();
       }
 
       if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH
@@ -786,21 +786,21 @@ struct DslashTestWrapper {
           switch (dtest_type) {
           case dslash_test_type::Dslash:
             if (transfer) {
-              dslashQuda_4dpc(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_4dpc(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracDomainWall4DPC *>(dirac)->Dslash4(cudaSpinorOut, cudaSpinor, parity);
             }
             break;
           case dslash_test_type::M5:
             if (transfer) {
-              dslashQuda_4dpc(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_4dpc(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracDomainWall4DPC *>(dirac)->Dslash5(cudaSpinorOut, cudaSpinor);
             }
             break;
           case dslash_test_type::M5inv:
             if (transfer) {
-              dslashQuda_4dpc(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_4dpc(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracDomainWall4DPC *>(dirac)->M5inv(cudaSpinorOut, cudaSpinor);
             }
@@ -808,7 +808,7 @@ struct DslashTestWrapper {
           case dslash_test_type::MatPC:
           case dslash_test_type::Mat:
             if (transfer) {
-              MatQuda(spinorOut.V(), spinor.V(), &inv_param);
+              MatQuda(spinorOut.data(), spinor.data(), &inv_param);
             } else {
               dirac->M(cudaSpinorOut, cudaSpinor);
             }
@@ -816,7 +816,7 @@ struct DslashTestWrapper {
           case dslash_test_type::MatPCDagMatPC:
           case dslash_test_type::MatDagMat:
             if (transfer) {
-              MatDagMatQuda(spinorOut.V(), spinor.V(), &inv_param);
+              MatDagMatQuda(spinorOut.data(), spinor.data(), &inv_param);
             } else {
               dirac->MdagM(cudaSpinorOut, cudaSpinor);
             }
@@ -828,28 +828,28 @@ struct DslashTestWrapper {
           switch (dtest_type) {
           case dslash_test_type::Dslash:
             if (transfer) {
-              dslashQuda_mdwf(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_mdwf(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracMobiusPC *>(dirac)->Dslash4(cudaSpinorOut, cudaSpinor, parity);
             }
             break;
           case dslash_test_type::M5:
             if (transfer) {
-              dslashQuda_mdwf(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_mdwf(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracMobiusPC *>(dirac)->Dslash5(cudaSpinorOut, cudaSpinor);
             }
             break;
           case dslash_test_type::Dslash4pre:
             if (transfer) {
-              dslashQuda_mdwf(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_mdwf(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracMobiusPC *>(dirac)->Dslash4pre(cudaSpinorOut, cudaSpinor);
             }
             break;
           case dslash_test_type::M5inv:
             if (transfer) {
-              dslashQuda_mdwf(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_mdwf(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracMobiusPC *>(dirac)->M5inv(cudaSpinorOut, cudaSpinor);
             }
@@ -857,7 +857,7 @@ struct DslashTestWrapper {
           case dslash_test_type::MatPC:
           case dslash_test_type::Mat:
             if (transfer) {
-              MatQuda(spinorOut.V(), spinor.V(), &inv_param);
+              MatQuda(spinorOut.data(), spinor.data(), &inv_param);
             } else {
               dirac->M(cudaSpinorOut, cudaSpinor);
             }
@@ -865,7 +865,7 @@ struct DslashTestWrapper {
           case dslash_test_type::MatPCDagMatPC:
           case dslash_test_type::MatDagMat:
             if (transfer) {
-              MatDagMatQuda(spinorOut.V(), spinor.V(), &inv_param);
+              MatDagMatQuda(spinorOut.data(), spinor.data(), &inv_param);
             } else {
               dirac->MdagM(cudaSpinorOut, cudaSpinor);
             }
@@ -940,13 +940,13 @@ struct DslashTestWrapper {
           case dslash_test_type::Dslash:
             if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
               if (transfer) {
-                dslashQuda(spinorOut.V(), spinor.V(), &inv_param, parity);
+                dslashQuda(spinorOut.data(), spinor.data(), &inv_param, parity);
               } else {
                 dirac->Dslash(cudaSpinorOut, cudaSpinor, parity);
               }
             } else {
               if (transfer) {
-                dslashQuda(spinorOut.V(), spinor.V(), &inv_param, parity);
+                dslashQuda(spinorOut.data(), spinor.data(), &inv_param, parity);
               } else {
                 dirac->Dslash(cudaSpinorOut, cudaSpinor, parity);
               }
@@ -955,7 +955,7 @@ struct DslashTestWrapper {
           case dslash_test_type::MatPC:
           case dslash_test_type::Mat:
             if (transfer) {
-              MatQuda(spinorOut.V(), spinor.V(), &inv_param);
+              MatQuda(spinorOut.data(), spinor.data(), &inv_param);
             } else {
               dirac->M(cudaSpinorOut, cudaSpinor);
             }
@@ -963,7 +963,7 @@ struct DslashTestWrapper {
           case dslash_test_type::MatPCDagMatPC:
           case dslash_test_type::MatDagMat:
             if (transfer) {
-              MatDagMatQuda(spinorOut.V(), spinor.V(), &inv_param);
+              MatDagMatQuda(spinorOut.data(), spinor.data(), &inv_param);
             } else {
               dirac->MdagM(cudaSpinorOut, cudaSpinor);
             }
diff --git a/tests/eigensolve_test.cpp b/tests/eigensolve_test.cpp
index e22879ff92..7c17540a60 100644
--- a/tests/eigensolve_test.cpp
+++ b/tests/eigensolve_test.cpp
@@ -179,7 +179,7 @@ std::vector<double> eigensolve(test_t test_param)
   // Allocate host side memory and pointers
   for (int i = 0; i < n_eig; i++) {
     evecs[i] = quda::ColorSpinorField(cs_param);
-    host_evecs_ptr[i] = evecs[i].V();
+    host_evecs_ptr[i] = evecs[i].data();
   }
 
   // Complex eigenvalues
@@ -208,12 +208,12 @@ std::vector<double> eigensolve(test_t test_param)
     for (int i = 0; i < eig_n_conv; i++) {
       if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) {
         double _Complex sigma = evals[i];
-        residua[i] = verifyWilsonTypeSingularVector(evecs[i].V(), evecs[i + eig_n_conv].V(), sigma, i, gauge_param,
+        residua[i] = verifyWilsonTypeSingularVector(evecs[i].data(), evecs[i + eig_n_conv].data(), sigma, i, gauge_param,
                                                     eig_param, gauge.data(), clover.data(), clover_inv.data());
 
       } else {
         double _Complex lambda = evals[i];
-        residua[i] = verifyWilsonTypeEigenvector(evecs[i].V(), lambda, i, gauge_param, eig_param, gauge.data(),
+        residua[i] = verifyWilsonTypeEigenvector(evecs[i].data(), lambda, i, gauge_param, eig_param, gauge.data(),
                                                  clover.data(), clover_inv.data());
       }
     }
diff --git a/tests/host_reference/covdev_reference.cpp b/tests/host_reference/covdev_reference.cpp
index 97dae09402..05a8fe839b 100644
--- a/tests/host_reference/covdev_reference.cpp
+++ b/tests/host_reference/covdev_reference.cpp
@@ -87,10 +87,10 @@ void Mat(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &
 {
   // full dslash operator
   void *data[4] = {link.data(0), link.data(1), link.data(2), link.data(3)};
-  covdevReference(reinterpret_cast<sFloat *>(out.Odd().V()), reinterpret_cast<gFloat **>(data),
-                  reinterpret_cast<sFloat *>(in.Even().V()), 1, daggerBit, mu);
-  covdevReference(reinterpret_cast<sFloat *>(out.Even().V()), reinterpret_cast<gFloat **>(data),
-                  reinterpret_cast<sFloat *>(in.Odd().V()), 0, daggerBit, mu);
+  covdevReference(reinterpret_cast<sFloat *>(out.Odd().data()), reinterpret_cast<gFloat **>(data),
+                  reinterpret_cast<sFloat *>(in.Even().data()), 1, daggerBit, mu);
+  covdevReference(reinterpret_cast<sFloat *>(out.Even().data()), reinterpret_cast<gFloat **>(data),
+                  reinterpret_cast<sFloat *>(in.Odd().data()), 0, daggerBit, mu);
 }
 
 void mat(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu)
@@ -178,7 +178,7 @@ void covdevReference_mg4dir(sFloat *res, gFloat **link, gFloat **ghostLink, cons
     int offset = spinor_site_size * sid;
 
     gFloat *lnk = gaugeLink_mg4dir(sid, mu, oddBit, linkEven, linkOdd, ghostLinkEven, ghostLinkOdd, 1, 1);
-    const sFloat *spinor = spinorNeighbor_mg4dir(sid, mu, oddBit, static_cast<const sFloat *>(in.V()), fwd_nbr_spinor,
+    const sFloat *spinor = spinorNeighbor_mg4dir(sid, mu, oddBit, static_cast<const sFloat *>(in.data()), fwd_nbr_spinor,
                                                  back_nbr_spinor, 1, 1);
 
     sFloat gaugedSpinor[spinor_site_size];
@@ -212,15 +212,15 @@ void covdev_dslash_mg4dir(ColorSpinorField &out, const GaugeField &link, const C
 
   if (sPrecision == QUDA_DOUBLE_PRECISION) {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      covdevReference_mg4dir((double *)out.V(), reinterpret_cast<double**>(data), (double **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((double *)out.data(), reinterpret_cast<double**>(data), (double **)ghostLink, in, oddBit, daggerBit, mu);
     } else {
-      covdevReference_mg4dir((double *)out.V(), reinterpret_cast<float**>(data), (float **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((double *)out.data(), reinterpret_cast<float**>(data), (float **)ghostLink, in, oddBit, daggerBit, mu);
     }
   } else {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      covdevReference_mg4dir((float *)out.V(), reinterpret_cast<double**>(data), (double **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((float *)out.data(), reinterpret_cast<double**>(data), (double **)ghostLink, in, oddBit, daggerBit, mu);
     } else {
-      covdevReference_mg4dir((float *)out.V(), reinterpret_cast<float**>(data), (float **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((float *)out.data(), reinterpret_cast<float**>(data), (float **)ghostLink, in, oddBit, daggerBit, mu);
     }
   }
 }
@@ -237,7 +237,7 @@ void Mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinor
     auto &outOdd = out.Odd();
 
     inEven.exchangeGhost(QUDA_EVEN_PARITY, nFace, daggerBit);
-    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outOdd.V()), reinterpret_cast<gFloat**>(data),
+    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outOdd.data()), reinterpret_cast<gFloat**>(data),
                            reinterpret_cast<gFloat**>(ghostLink), in.Even(), 1, daggerBit, mu);
   }
 
@@ -246,7 +246,7 @@ void Mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinor
     auto &outEven = out.Even();
 
     inOdd.exchangeGhost(QUDA_ODD_PARITY, nFace, daggerBit);
-    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outEven.V()), reinterpret_cast<gFloat**>(data),
+    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outEven.data()), reinterpret_cast<gFloat**>(data),
                            reinterpret_cast<gFloat**>(ghostLink), in.Odd(), 0, daggerBit, mu);
   }
 }
diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index 907a857824..65af1a6680 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -766,10 +766,10 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi
                     QUDA_DAG_YES, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
 
     if (dslash_type == QUDA_LAPLACE_DSLASH) {
-      xpay(out.V(), kappa, ref.V(), ref.Length(), gauge_param.cpu_prec);
-      ax(0.5 / kappa, ref.V(), ref.Length(), gauge_param.cpu_prec);
+      xpay(out.data(), kappa, ref.data(), ref.Length(), gauge_param.cpu_prec);
+      ax(0.5 / kappa, ref.data(), ref.Length(), gauge_param.cpu_prec);
     } else {
-      axpy(2 * mass, out.V(), ref.V(), ref.Length(), gauge_param.cpu_prec);
+      axpy(2 * mass, out.data(), ref.data(), ref.Length(), gauge_param.cpu_prec);
     }
     break;
 
@@ -791,9 +791,9 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi
     len = Vh;
   }
 
-  mxpy(in.V(), ref.V(), len * stag_spinor_site_size, inv_param.cpu_prec);
-  double nrm2 = norm_2(ref.V(), len * stag_spinor_site_size, inv_param.cpu_prec);
-  double src2 = norm_2(in.V(), len * stag_spinor_site_size, inv_param.cpu_prec);
+  mxpy(in.data(), ref.data(), len * stag_spinor_site_size, inv_param.cpu_prec);
+  double nrm2 = norm_2(ref.data(), len * stag_spinor_site_size, inv_param.cpu_prec);
+  double src2 = norm_2(in.data(), len * stag_spinor_site_size, inv_param.cpu_prec);
   double hqr = sqrt(quda::blas::HeavyQuarkResidualNorm(out, ref).z);
   double l2r = sqrt(nrm2 / src2);
 
diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
index 6fbdf91c42..04fc5d035e 100644
--- a/tests/host_reference/staggered_dslash_reference.cpp
+++ b/tests/host_reference/staggered_dslash_reference.cpp
@@ -143,22 +143,22 @@ void staggeredDslash(ColorSpinorField &out, void *const *fatlink, void *const *l
 
   if (sPrecision == QUDA_DOUBLE_PRECISION) {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      staggeredDslashReference((double *)out.V(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink,
-                               (double **)ghost_longlink, (double *)in.V(), (double **)fwd_nbr_spinor,
+      staggeredDslashReference((double *)out.data(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink,
+                               (double **)ghost_longlink, (double *)in.data(), (double **)fwd_nbr_spinor,
                                (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
     } else {
-      staggeredDslashReference((double *)out.V(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink,
-                               (float **)ghost_longlink, (double *)in.V(), (double **)fwd_nbr_spinor,
+      staggeredDslashReference((double *)out.data(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink,
+                               (float **)ghost_longlink, (double *)in.data(), (double **)fwd_nbr_spinor,
                                (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
     }
   } else {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      staggeredDslashReference((float *)out.V(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink,
-                               (double **)ghost_longlink, (float *)in.V(), (float **)fwd_nbr_spinor,
+      staggeredDslashReference((float *)out.data(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink,
+                               (double **)ghost_longlink, (float *)in.data(), (float **)fwd_nbr_spinor,
                                (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
     } else {
-      staggeredDslashReference((float *)out.V(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink,
-                               (float **)ghost_longlink, (float *)in.V(), (float **)fwd_nbr_spinor,
+      staggeredDslashReference((float *)out.data(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink,
+                               (float **)ghost_longlink, (float *)in.data(), (float **)fwd_nbr_spinor,
                                (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
     }
   }
@@ -189,8 +189,8 @@ void staggeredMatDagMat(ColorSpinorField &out, void *const *fatlink, void *const
 
   double msq_x4 = mass * mass * 4;
   if (sPrecision == QUDA_DOUBLE_PRECISION) {
-    axmy((double *)in.V(), (double)msq_x4, (double *)out.V(), Vh * stag_spinor_site_size);
+    axmy((double *)in.data(), (double)msq_x4, (double *)out.data(), Vh * stag_spinor_site_size);
   } else {
-    axmy((float *)in.V(), (float)msq_x4, (float *)out.V(), Vh * stag_spinor_site_size);
+    axmy((float *)in.data(), (float)msq_x4, (float *)out.data(), Vh * stag_spinor_site_size);
   }
 }
diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp
index 2f29a3de08..6bcf0bc380 100644
--- a/tests/invert_test.cpp
+++ b/tests/invert_test.cpp
@@ -248,7 +248,7 @@ std::vector<double> solve(test_t param)
       // Allocate memory and set pointers
       for (int n = 0; n < Nsrc; n++) {
         out_multishift[n * multishift + i] = quda::ColorSpinorField(cs_param);
-        _hp_multi_x[n][i] = out_multishift[n * multishift + i].V();
+        _hp_multi_x[n][i] = out_multishift[n * multishift + i].data();
       }
     }
   }
@@ -273,9 +273,9 @@ std::vector<double> solve(test_t param)
       if (inv_deflate) eig_param.preserve_deflation = i < Nsrc - 1 ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
       // Perform QUDA inversions
       if (multishift > 1) {
-        invertMultiShiftQuda(_hp_multi_x[i].data(), in[i].V(), &inv_param);
+        invertMultiShiftQuda(_hp_multi_x[i].data(), in[i].data(), &inv_param);
       } else {
-        invertQuda(out[i].V(), in[i].V(), &inv_param);
+        invertQuda(out[i].data(), in[i].data(), &inv_param);
       }
 
       time[i] = inv_param.secs;
@@ -292,8 +292,8 @@ std::vector<double> solve(test_t param)
     std::vector<void *> _hp_x(Nsrc);
     std::vector<void *> _hp_b(Nsrc);
     for (int i = 0; i < Nsrc; i++) {
-      _hp_x[i] = out[i].V();
-      _hp_b[i] = in[i].V();
+      _hp_x[i] = out[i].data();
+      _hp_b[i] = in[i].data();
     }
     // Run split grid
     if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH
@@ -326,7 +326,7 @@ std::vector<double> solve(test_t param)
   // Perform host side verification of inversion if requested
   if (verify_results) {
     for (int i = 0; i < Nsrc; i++) {
-      res[i] = verifyInversion(out[i].V(), _hp_multi_x[i].data(), in[i].V(), check.V(), gauge_param, inv_param,
+      res[i] = verifyInversion(out[i].data(), _hp_multi_x[i].data(), in[i].data(), check.data(), gauge_param, inv_param,
                                gauge.data(), clover.data(), clover_inv.data());
     }
   }
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 5ee2616ad8..5cae0d80c2 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -103,9 +103,9 @@ struct StaggeredDslashTestWrapper {
       staggeredDslash(spinorRef.Odd(), qdp_fatlink_cpu, qdp_longlink_cpu, ghost_fatlink_cpu, ghost_longlink_cpu,
                       spinor.Even(), QUDA_ODD_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
       if (dslash_type == QUDA_LAPLACE_DSLASH) {
-        xpay(spinor.V(), kappa, spinorRef.V(), spinor.Length(), gauge_param.cpu_prec);
+        xpay(spinor.data(), kappa, spinorRef.data(), spinor.Length(), gauge_param.cpu_prec);
       } else {
-        axpy(2 * mass, spinor.V(), spinorRef.V(), spinor.Length(), gauge_param.cpu_prec);
+        axpy(2 * mass, spinor.data(), spinorRef.data(), spinor.Length(), gauge_param.cpu_prec);
       }
       break;
     default: errorQuda("Test type %d not defined", static_cast<int>(dtest_type));
@@ -364,8 +364,8 @@ struct StaggeredDslashTestWrapper {
       std::vector<void *> _hp_x(inv_param.num_src);
       std::vector<void *> _hp_b(inv_param.num_src);
       for (int i = 0; i < inv_param.num_src; i++) {
-        _hp_x[i] = vp_spinor_out[i].V();
-        _hp_b[i] = vp_spinor[i].V();
+        _hp_x[i] = vp_spinor_out[i].data();
+        _hp_b[i] = vp_spinor[i].data();
       }
       dslashMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity, milc_fatlink_gpu, milc_longlink_gpu,
                                   &gauge_param);
diff --git a/tests/staggered_gsmear_test_utils.h b/tests/staggered_gsmear_test_utils.h
index 7266844798..b9adfe4361 100644
--- a/tests/staggered_gsmear_test_utils.h
+++ b/tests/staggered_gsmear_test_utils.h
@@ -134,11 +134,11 @@ struct StaggeredGSmearTestWrapper { //
                                       &gauge_param, &inv_param, 1, smear_coeff, smear_t0, gauge_param.cpu_prec);
 
         // blas::xpay(*tmp2, -1.0, *spinorRef);
-        xpay(tmp2.Even().V(), -1.0, spinorRef.Even().V(), spinor.Even().Length(), gauge_param.cpu_prec);
-        xpay(tmp2.Odd().V(), -1.0, spinorRef.Odd().V(), spinor.Odd().Length(), gauge_param.cpu_prec);
+        xpay(tmp2.Even().data(), -1.0, spinorRef.Even().data(), spinor.Even().Length(), gauge_param.cpu_prec);
+        xpay(tmp2.Odd().data(), -1.0, spinorRef.Odd().data(), spinor.Odd().Length(), gauge_param.cpu_prec);
         //
-        memset(tmp2.Even().V(), 0, spinor.Even().Length() * gauge_param.cpu_prec);
-        memset(tmp2.Odd().V(), 0, spinor.Odd().Length() * gauge_param.cpu_prec);
+        memset(tmp2.Even().data(), 0, spinor.Even().Length() * gauge_param.cpu_prec);
+        memset(tmp2.Odd().data(), 0, spinor.Odd().Length() * gauge_param.cpu_prec);
       }
       break;
     }
@@ -327,7 +327,7 @@ struct StaggeredGSmearTestWrapper { //
         qsm_param.delete_2link = smear_delete_two_link;
         qsm_param.t0 = smear_t0;
 
-        performTwoLinkGaussianSmearNStep(spinor.V(), &qsm_param);
+        performTwoLinkGaussianSmearNStep(spinor.data(), &qsm_param);
 
         quda_gflops = qsm_param.gflops;
 
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 87c574d974..1fab095147 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -335,7 +335,7 @@ int main(int argc, char **argv)
     if (!use_split_grid) {
       for (int k = 0; k < Nsrc; k++) {
         if (inv_deflate) eig_param.preserve_deflation = k < Nsrc - 1 ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
-        invertQuda(out[k]->V(), in[k]->V(), &inv_param);
+        invertQuda(out[k]->data(), in[k]->data(), &inv_param);
         time[k] = inv_param.secs;
         gflops[k] = inv_param.gflops / inv_param.secs;
         iter[k] = inv_param.iter;
@@ -346,8 +346,8 @@ int main(int argc, char **argv)
       std::vector<void *> _hp_x(Nsrc);
       std::vector<void *> _hp_b(Nsrc);
       for (int k = 0; k < Nsrc; k++) {
-        _hp_x[k] = out[k]->V();
-        _hp_b[k] = in[k]->V();
+        _hp_x[k] = out[k]->data();
+        _hp_b[k] = in[k]->data();
       }
       inv_param.num_src = Nsrc;
       inv_param.num_src_per_sub_partition = Nsrc / num_sub_partition;
@@ -389,12 +389,12 @@ int main(int argc, char **argv)
       inv_param.tol_hq_offset[i] = inv_param.tol_hq;
       // Allocate memory and set pointers
       qudaOutArray[i] = ColorSpinorField::Create(cs_param);
-      outArray[i] = qudaOutArray[i]->V();
+      outArray[i] = qudaOutArray[i]->data();
     }
 
     for (int k = 0; k < Nsrc; k++) {
       quda::spinorNoise(*in[k], *rng, QUDA_NOISE_UNIFORM);
-      invertMultiShiftQuda((void **)outArray, in[k]->V(), &inv_param);
+      invertMultiShiftQuda((void **)outArray, in[k]->data(), &inv_param);
 
       time[k] = inv_param.secs;
       gflops[k] = inv_param.gflops / inv_param.secs;
diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp
index 365781c7d0..cc9148fca5 100644
--- a/tests/utils/staggered_host_utils.cpp
+++ b/tests/utils/staggered_host_utils.cpp
@@ -462,12 +462,12 @@ void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk
 
   if (prec == QUDA_DOUBLE_PRECISION) {
     {
-      staggeredTwoLinkGaussianSmear((double *)out.V(), (double **)qdp_twolnk, (double **)ghost, (double *)in.V(),
+      staggeredTwoLinkGaussianSmear((double *)out.data(), (double **)qdp_twolnk, (double **)ghost, (double *)in.data(),
                                     (double **)fwd_nbr_spinor, (double **)back_nbr_spinor, t0, oddBit);
     } 
   } else {
     {
-      staggeredTwoLinkGaussianSmear((float *)out.V(), (float **)qdp_twolnk, (float **)ghost, (float *)in.V(),
+      staggeredTwoLinkGaussianSmear((float *)out.data(), (float **)qdp_twolnk, (float **)ghost, (float *)in.data(),
                                     (float **)fwd_nbr_spinor, (float **)back_nbr_spinor, t0, oddBit);
     }
   }

From 838ff4f711ead8efd538879d7760b76d0094f3dd Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 31 May 2023 23:32:11 -0700
Subject: [PATCH 25/60] Fix clang warning

---
 include/quda_api.h  |  2 +-
 include/tune_quda.h | 14 ++++++--------
 lib/tune.cpp        | 12 +-----------
 3 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/include/quda_api.h b/include/quda_api.h
index b3b9f35b69..9feea16297 100644
--- a/include/quda_api.h
+++ b/include/quda_api.h
@@ -23,7 +23,7 @@ enum qudaMemcpyKind {
 namespace quda
 {
 
-  class TuneParam;
+  struct TuneParam;
 
   struct qudaStream_t {
     int idx;
diff --git a/include/tune_quda.h b/include/tune_quda.h
index 2aacde55f7..ff99826149 100644
--- a/include/tune_quda.h
+++ b/include/tune_quda.h
@@ -18,17 +18,15 @@
 namespace quda {
 
   struct TuneParam {
-
-  public:
-    dim3 block;
+    dim3 block = {1, 1, 1};
     dim3 grid;
-    unsigned int shared_bytes;
-    bool set_max_shared_bytes; // whether to opt in to max shared bytes per thread block
-    int4 aux; // free parameter that can be used as an arbitrary autotuning dimension outside of launch parameters
+    unsigned int shared_bytes = 0;
+    bool set_max_shared_bytes = false; // whether to opt in to max shared bytes per thread block
+    int4 aux = {1, 1, 1, 1}; // free parameter that can be used as an arbitrary autotuning dimension outside of launch parameters
 
     std::string comment;
-    float time;
-    long long n_calls;
+    float time = FLT_MAX;
+    long long n_calls = 0;
 
     TuneParam();
     TuneParam(const TuneParam &) = default;
diff --git a/lib/tune.cpp b/lib/tune.cpp
index 1d6971db3c..fea2a7b509 100644
--- a/lib/tune.cpp
+++ b/lib/tune.cpp
@@ -644,17 +644,7 @@ namespace quda
     }
   }
 
-  TuneParam::TuneParam() :
-    block(device::warp_size(), 1, 1),
-    grid(1, 1, 1),
-    shared_bytes(0),
-    set_max_shared_bytes(false),
-    aux(),
-    time(FLT_MAX),
-    n_calls(0)
-  {
-    aux = make_int4(1, 1, 1, 1);
-  }
+  TuneParam::TuneParam() : block(device::warp_size(), 1, 1) { }
 
   std::ostream &operator<<(std::ostream &output, const TuneParam &param)
   {

From 9aa20ce752829c4f9093680e4cbba9b8fefd9d3e Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 1 Jun 2023 09:29:35 -0700
Subject: [PATCH 26/60] Remove std::move on temporary quda_ptr objects since
 this prevents the compiler from doing copy elision

---
 lib/clover_field.cpp       | 10 +++++-----
 lib/color_spinor_field.cpp |  6 +++---
 lib/gauge_field.cpp        | 20 ++++++++++----------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp
index e91600b6c3..0d859c4fdc 100644
--- a/lib/clover_field.cpp
+++ b/lib/clover_field.cpp
@@ -72,18 +72,18 @@ namespace quda {
 
     if (bytes) {
       if (create != QUDA_REFERENCE_FIELD_CREATE) {
-        clover = std::move(quda_ptr(mem_type, bytes));
+        clover = quda_ptr(mem_type, bytes);
       } else {
-        clover = std::move(quda_ptr(param.clover, mem_type));
+        clover = quda_ptr(param.clover, mem_type);
       }
 
       total_bytes += bytes;
 
       if (inverse) {
         if (create != QUDA_REFERENCE_FIELD_CREATE) {
-          cloverInv = std::move(quda_ptr(mem_type, bytes));
+          cloverInv = quda_ptr(mem_type, bytes);
         } else {
-          cloverInv = std::move(quda_ptr(param.cloverInv, mem_type));
+          cloverInv = quda_ptr(param.cloverInv, mem_type);
         }
 
         total_bytes += bytes;
@@ -114,7 +114,7 @@ namespace quda {
   {
     if (backup_h.size()) errorQuda("Already allocated host backup");
     backup_h.resize(2);
-    for (auto &b : backup_h) b = std::move(quda_ptr(QUDA_MEMORY_HOST, bytes));
+    for (auto &b : backup_h) b = quda_ptr(QUDA_MEMORY_HOST, bytes);
 
     backup(false);
     if (inverse) backup(true);
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index b26897948b..a76a29b0eb 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -148,10 +148,10 @@ namespace quda
       errorQuda("Subset not implemented");
 
     if (param.create != QUDA_REFERENCE_FIELD_CREATE && param.create != QUDA_GHOST_FIELD_CREATE) {
-      v = std::move(quda_ptr(mem_type, bytes));
+      v = quda_ptr(mem_type, bytes);
       alloc = true;
     } else  if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
-      v = std::move(quda_ptr(param.v, mem_type));
+      v = quda_ptr(param.v, mem_type);
       reference = true;
     } else if (param.create == QUDA_GHOST_FIELD_CREATE) {
       ghost_only = true;
@@ -1480,7 +1480,7 @@ namespace quda
   {
     if (backup_h.size()) errorQuda("ColorSpinorField already backed up");
     backup_h.resize(1);
-    backup_h[0] = std::move(quda_ptr(QUDA_MEMORY_HOST, bytes));
+    backup_h[0] = quda_ptr(QUDA_MEMORY_HOST, bytes);
     qudaMemcpy(backup_h[0], v, bytes, qudaMemcpyDefault);
   }
 
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 51d5b59a47..d1700709fc 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -169,18 +169,18 @@ namespace quda {
 
     if (isNative()) {
       if (param.create != QUDA_REFERENCE_FIELD_CREATE) {
-        gauge = std::move(quda_ptr(mem_type, bytes));
+        gauge = quda_ptr(mem_type, bytes);
       } else {
-        gauge = std::move(quda_ptr(param.gauge, mem_type));
+        gauge = quda_ptr(param.gauge, mem_type);
       }
     } else if (is_pointer_array(order)) {
 
       size_t nbytes = volume * nInternal * precision;
       for (int d = 0; d < site_dim; d++) {
         if (param.create != QUDA_REFERENCE_FIELD_CREATE) {
-          gauge_array[d] = std::move(quda_ptr(mem_type, nbytes));
+          gauge_array[d] = quda_ptr(mem_type, nbytes);
         } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
-          gauge_array[d] = std::move(quda_ptr(static_cast<void **>(param.gauge)[d], mem_type));
+          gauge_array[d] = quda_ptr(static_cast<void **>(param.gauge)[d], mem_type);
         } else {
           errorQuda("Unsupported creation type %d", param.create);
         }
@@ -196,9 +196,9 @@ namespace quda {
       }
 
       if (param.create != QUDA_REFERENCE_FIELD_CREATE) {
-        gauge = std::move(quda_ptr(mem_type, bytes));
+        gauge = quda_ptr(mem_type, bytes);
       } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
-        gauge = std::move(quda_ptr(param.gauge, mem_type));
+        gauge = quda_ptr(param.gauge, mem_type);
       } else {
         errorQuda("Unsupported creation type %d", param.create);
       }
@@ -211,8 +211,8 @@ namespace quda {
       if (!isNative()) {
         for (int i=0; i<nDim; i++) {
           size_t nbytes = nFace * surface[i] * nInternal * precision;
-          ghost[i] = std::move(quda_ptr(mem_type, nbytes));
-          if (geometry == QUDA_COARSE_GEOMETRY) ghost[i+4] = std::move(quda_ptr(mem_type, nbytes));
+          ghost[i] = quda_ptr(mem_type, nbytes);
+          if (geometry == QUDA_COARSE_GEOMETRY) ghost[i+4] = quda_ptr(mem_type, nbytes);
 
           qudaMemset(ghost[i], 0, nbytes);
           if (geometry == QUDA_COARSE_GEOMETRY) qudaMemset(ghost[i + 4], 0, nbytes);
@@ -1276,12 +1276,12 @@ namespace quda {
     if (order == QUDA_QDP_GAUGE_ORDER) {
       backup_h.resize(geometry);
       for (int d = 0; d < geometry; d++) {
-        backup_h[d] = std::move(quda_ptr(QUDA_MEMORY_HOST, bytes / geometry));
+        backup_h[d] = quda_ptr(QUDA_MEMORY_HOST, bytes / geometry);
         qudaMemcpy(backup_h[d], gauge_array[d], bytes / geometry, qudaMemcpyDefault);
       }
     } else {
       backup_h.resize(1);
-      backup_h[0] = std::move(quda_ptr(QUDA_MEMORY_HOST, bytes));
+      backup_h[0] = quda_ptr(QUDA_MEMORY_HOST, bytes);
       qudaMemcpy(backup_h[0], gauge, bytes, qudaMemcpyDefault);
     }
   }

From 08b99a5a16cb6c1334c85af86e80097ae2a86033 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 2 Jun 2023 13:09:54 -0700
Subject: [PATCH 27/60] Move quda_ptr to its own file, and make it generic

---
 include/malloc_quda.h       |  75 -----------------
 include/quda_api.h          |   2 +-
 include/quda_ptr.h          |  78 ++++++++++++++++++
 lib/CMakeLists.txt          |   2 +-
 lib/quda_ptr.cpp            | 157 ++++++++++++++++++++++++++++++++++++
 lib/targets/cuda/malloc.cpp | 151 ----------------------------------
 6 files changed, 237 insertions(+), 228 deletions(-)
 create mode 100644 include/quda_ptr.h
 create mode 100644 lib/quda_ptr.cpp

diff --git a/include/malloc_quda.h b/include/malloc_quda.h
index 8cbc2fbb47..05a36fcd77 100644
--- a/include/malloc_quda.h
+++ b/include/malloc_quda.h
@@ -172,78 +172,3 @@ namespace quda {
 #define pool_device_free(ptr) quda::pool::device_free_(__func__, __FILE__, __LINE__, ptr)
 #define pool_pinned_malloc(size) quda::pool::pinned_malloc_(__func__, __FILE__, __LINE__, size)
 #define pool_pinned_free(ptr) quda::pool::pinned_free_(__func__, __FILE__, __LINE__, ptr)
-
-namespace quda {
-
-  /**
-     Object that stores a memory allocation with different views for
-     host or device.  Depending on the nature of the underlying memory
-     type, both views may not be defined
-
-     type                       defined views
-     QUDA_MEMORY_DEVICE         device only
-     QUDA_MEMORY_DEVICE_PINNED  device only
-     QUDA_MEMORY_HOST           host only
-     QUDA_MEMORY_HOST_PINNED    both
-     QUDA_MEMORY_MAPPED         both (pinned to host)
-     QUDA_MEMORY_MANAGED        both
-   */
-  class quda_ptr {
-    QudaMemoryType type = QUDA_MEMORY_INVALID;
-    size_t size = 0;
-    bool pool = false;
-    void *device = nullptr;
-    void *host = nullptr;
-
-  public:
-    quda_ptr() = default;
-    quda_ptr(quda_ptr &&) = default;
-    quda_ptr &operator=(quda_ptr &&);
-
-    /**
-       @brief Constructor for quda_ptr
-       @param[in] type The memory type of the allocation
-       @param[in] size The size of the allocation
-       @param[in] pool Whether the allocation should be in the memory pool (default is true)
-    */
-    quda_ptr(QudaMemoryType type, size_t size, bool pool = true);
-
-    /**
-       @brief Constructor for quda_ptr where we are wrapping a non-owned pointer
-       @param[in] ptr Raw base pointer
-       @param[in] type The memory type of the allocation
-    */
-    quda_ptr(void *ptr, QudaMemoryType type);
-
-    /**
-       @brief Destructor for the quda_ptr
-    */
-    virtual ~quda_ptr();
-
-    /**
-       @return Returns true if allocation is visible to the device
-    */
-    bool is_device() const;
-
-    /**
-       @return Returns true if allocation is visible to the host
-    */
-    bool is_host() const;
-
-    /**
-       Return view of the pointer.  For mapped memory we return the device view.
-     */
-    void *data() const;
-
-    /**
-       Return the device view of the pointer
-     */
-    void *data_device() const;
-
-    /**
-       Return the host view of the pointer
-     */
-    void *data_host() const;
-  };
-
-}
diff --git a/include/quda_api.h b/include/quda_api.h
index 9feea16297..becec68c8b 100644
--- a/include/quda_api.h
+++ b/include/quda_api.h
@@ -3,7 +3,7 @@
 #include <quda_define.h>
 #include <string>
 #include <enum_quda.h>
-#include <malloc_quda.h>
+#include <quda_ptr.h>
 
 /**
    @file quda_api.h
diff --git a/include/quda_ptr.h b/include/quda_ptr.h
new file mode 100644
index 0000000000..3e829f310f
--- /dev/null
+++ b/include/quda_ptr.h
@@ -0,0 +1,78 @@
+#pragma once
+
+#include "malloc_quda.h"
+
+namespace quda {
+
+  /**
+     Object that stores a memory allocation with different views for
+     host or device.  Depending on the nature of the underlying memory
+     type, both views may not be defined
+
+     type                       defined views
+     QUDA_MEMORY_DEVICE         device only
+     QUDA_MEMORY_DEVICE_PINNED  device only
+     QUDA_MEMORY_HOST           host only
+     QUDA_MEMORY_HOST_PINNED    both
+     QUDA_MEMORY_MAPPED         both (pinned to host)
+     QUDA_MEMORY_MANAGED        both
+   */
+  class quda_ptr {
+    QudaMemoryType type = QUDA_MEMORY_INVALID;
+    size_t size = 0;
+    bool pool = false;
+    void *device = nullptr;
+    void *host = nullptr;
+
+  public:
+    quda_ptr() = default;
+    quda_ptr(quda_ptr &&) = default;
+    quda_ptr &operator=(quda_ptr &&);
+
+    /**
+       @brief Constructor for quda_ptr
+       @param[in] type The memory type of the allocation
+       @param[in] size The size of the allocation
+       @param[in] pool Whether the allocation should be in the memory pool (default is true)
+    */
+    quda_ptr(QudaMemoryType type, size_t size, bool pool = true);
+
+    /**
+       @brief Constructor for quda_ptr where we are wrapping a non-owned pointer
+       @param[in] ptr Raw base pointer
+       @param[in] type The memory type of the allocation
+    */
+    quda_ptr(void *ptr, QudaMemoryType type);
+
+    /**
+       @brief Destructor for the quda_ptr
+    */
+    virtual ~quda_ptr();
+
+    /**
+       @return Returns true if allocation is visible to the device
+    */
+    bool is_device() const;
+
+    /**
+       @return Returns true if allocation is visible to the host
+    */
+    bool is_host() const;
+
+    /**
+       Return view of the pointer.  For mapped memory we return the device view.
+     */
+    void *data() const;
+
+    /**
+       Return the device view of the pointer
+     */
+    void *data_device() const;
+
+    /**
+       Return the host view of the pointer
+     */
+    void *data_host() const;
+  };
+
+}
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 37a83e001c..5050133341 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -84,7 +84,7 @@ set (QUDA_OBJS
   clover_sigma_outer_product.cu momentum.cu gauge_qcharge.cu
   deflation.cpp checksum.cu transform_reduce.cu
   dslash5_mobius_eofa.cu
-  madwf_ml.cpp
+  madwf_ml.cpp quda_ptr.cpp
   instantiate.cpp version.cpp )
 # cmake-format: on
 
diff --git a/lib/quda_ptr.cpp b/lib/quda_ptr.cpp
new file mode 100644
index 0000000000..8b366afcbb
--- /dev/null
+++ b/lib/quda_ptr.cpp
@@ -0,0 +1,157 @@
+#include "quda_ptr.h"
+#include "util_quda.h"
+#include "timer.h"
+
+namespace quda {
+
+  quda_ptr::quda_ptr(QudaMemoryType type, size_t size, bool pool) :
+    type(type),
+    size(size),
+    pool(pool)
+  {
+    getProfile().TPSTART(QUDA_PROFILE_INIT);
+    if (pool && (type != QUDA_MEMORY_DEVICE && type != QUDA_MEMORY_HOST_PINNED && type != QUDA_MEMORY_HOST))
+      errorQuda("Memory pool not available for memory type %d", type);
+
+    if (size > 0) {
+      switch (type) {
+      case QUDA_MEMORY_DEVICE:
+        device = pool ? pool_device_malloc(size) : device_malloc(size);
+        break;
+      case QUDA_MEMORY_DEVICE_PINNED:
+        device = device_pinned_malloc(size);
+        break;
+      case QUDA_MEMORY_HOST:
+        host = safe_malloc(size);
+        break;
+      case QUDA_MEMORY_HOST_PINNED:
+        host = pool ? pool_pinned_malloc(size) : pinned_malloc(size);
+        break;
+      case QUDA_MEMORY_MAPPED:
+        host = mapped_malloc(size);
+        device = get_mapped_device_pointer(host);
+        break;
+      case QUDA_MEMORY_MANAGED:
+        host = managed_malloc(size);
+        device = host;
+        break;
+      default: errorQuda("Unknown memory type %d", type);
+      }
+    }
+    getProfile().TPSTOP(QUDA_PROFILE_INIT);
+  }
+
+  quda_ptr::quda_ptr(void *ptr, QudaMemoryType type) :
+    type(type)
+  {
+    getProfile().TPSTART(QUDA_PROFILE_INIT);
+    switch (type) {
+    case QUDA_MEMORY_DEVICE:
+    case QUDA_MEMORY_DEVICE_PINNED:
+      device = ptr;
+      host = nullptr;
+      break;
+    case QUDA_MEMORY_HOST:
+    case QUDA_MEMORY_HOST_PINNED:
+      device = nullptr;
+      host = ptr;
+      break;
+    case QUDA_MEMORY_MANAGED:
+      device = ptr;
+      host = ptr;
+      break;
+    default: errorQuda("Unsupported memory type %d", type);
+    }
+    getProfile().TPSTOP(QUDA_PROFILE_INIT);
+  }
+
+  quda_ptr& quda_ptr::operator=(quda_ptr &&other)
+  {
+    if (&other != this) {
+      type = std::exchange(other.type, QUDA_MEMORY_INVALID);
+      size = std::exchange(other.size, 0);
+      pool = std::exchange(other.pool, false);
+      device = std::exchange(other.device, nullptr);
+      host = std::exchange(other.host, nullptr);
+    }
+    return *this;
+  }
+
+  quda_ptr::~quda_ptr()
+  {
+    getProfile().TPSTART(QUDA_PROFILE_FREE);
+
+    if (size > 0) {
+      switch (type) {
+      case QUDA_MEMORY_DEVICE:        pool ? pool_device_free(device) : device_free(device); break;
+      case QUDA_MEMORY_DEVICE_PINNED: device_pinned_free(device); break;
+      case QUDA_MEMORY_HOST:          host_free(host); break;
+      case QUDA_MEMORY_HOST_PINNED:   pool ? pool_pinned_free(host) : host_free(host); break;
+      case QUDA_MEMORY_MAPPED:        host_free(host); break;
+      default: errorQuda("Unknown memory type %d", type);
+      }
+    }
+
+    device = nullptr;
+    host = nullptr;
+
+    getProfile().TPSTOP(QUDA_PROFILE_FREE);
+  }
+
+  bool quda_ptr::is_device() const
+  {
+    switch (type) {
+    case QUDA_MEMORY_DEVICE:
+    case QUDA_MEMORY_DEVICE_PINNED:
+    case QUDA_MEMORY_MAPPED:
+    case QUDA_MEMORY_MANAGED:
+      return true;
+    default: return false;
+    }
+  }
+
+  bool quda_ptr::is_host() const
+  {
+    switch (type) {
+    case QUDA_MEMORY_HOST:
+    case QUDA_MEMORY_HOST_PINNED:
+    case QUDA_MEMORY_MANAGED:
+      return true;
+    default: return false;
+    }
+  }
+
+  void *quda_ptr::data() const
+  {
+    void *ptr = nullptr;
+
+    switch (type) {
+    case QUDA_MEMORY_DEVICE:
+    case QUDA_MEMORY_DEVICE_PINNED:
+    case QUDA_MEMORY_MAPPED:
+    case QUDA_MEMORY_MANAGED:
+      ptr = device;
+      break;
+    case QUDA_MEMORY_HOST:
+    case QUDA_MEMORY_HOST_PINNED:
+      ptr = host;
+      break;
+    default: errorQuda("Unknown memory type %d", type);
+    }
+
+    return ptr;
+  }
+
+  void *quda_ptr::data_device() const
+  {
+    if (!device) errorQuda("Device view not defined");
+    return device;
+  }
+
+  void *quda_ptr::data_host() const
+  {
+    if (!host) errorQuda("Host view not defined");
+    return host;
+  }
+
+}
diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp
index 1f78d936bc..2b0d3c97ba 100644
--- a/lib/targets/cuda/malloc.cpp
+++ b/lib/targets/cuda/malloc.cpp
@@ -790,155 +790,4 @@ namespace quda
 
   } // namespace pool
 
-
-  quda_ptr::quda_ptr(QudaMemoryType type, size_t size, bool pool) :
-    type(type),
-    size(size),
-    pool(pool)
-  {
-    getProfile().TPSTART(QUDA_PROFILE_INIT);
-    if (pool && (type != QUDA_MEMORY_DEVICE && type != QUDA_MEMORY_HOST_PINNED && type != QUDA_MEMORY_HOST))
-      errorQuda("Memory pool not available for memory type %d", type);
-
-    if (size > 0) {
-      switch (type) {
-      case QUDA_MEMORY_DEVICE:
-        device = pool ? pool_device_malloc(size) : device_malloc(size);
-        break;
-      case QUDA_MEMORY_DEVICE_PINNED:
-        device = device_pinned_malloc(size);
-        break;
-      case QUDA_MEMORY_HOST:
-        host = safe_malloc(size);
-        break;
-      case QUDA_MEMORY_HOST_PINNED:
-        host = pool ? pool_pinned_malloc(size) : pinned_malloc(size);
-        break;
-      case QUDA_MEMORY_MAPPED:
-        host = mapped_malloc(size);
-        device = get_mapped_device_pointer(host);
-        break;
-      case QUDA_MEMORY_MANAGED:
-        host = managed_malloc(size);
-        device = host;
-        break;
-      default: errorQuda("Unknown memory type %d", type);
-      }
-    }
-    getProfile().TPSTOP(QUDA_PROFILE_INIT);
-  }
-
-  quda_ptr::quda_ptr(void *ptr, QudaMemoryType type) :
-    type(type)
-  {
-    getProfile().TPSTART(QUDA_PROFILE_INIT);
-    switch (type) {
-    case QUDA_MEMORY_DEVICE:
-    case QUDA_MEMORY_DEVICE_PINNED:
-      device = ptr;
-      host = nullptr;
-      break;
-    case QUDA_MEMORY_HOST:
-    case QUDA_MEMORY_HOST_PINNED:
-      device = nullptr;
-      host = ptr;
-      break;
-    case QUDA_MEMORY_MANAGED:
-      device = ptr;
-      host = ptr;
-      break;
-    default: errorQuda("Unsupported memory type %d", type);
-    }
-    getProfile().TPSTOP(QUDA_PROFILE_INIT);
-  }
-
-  quda_ptr& quda_ptr::operator=(quda_ptr &&other)
-  {
-    if (&other != this) {
-      type = std::exchange(other.type, QUDA_MEMORY_INVALID);
-      size = std::exchange(other.size, 0);
-      pool = std::exchange(other.pool, false);
-      device = std::exchange(other.device, nullptr);
-      host = std::exchange(other.host, nullptr);
-    }
-    return *this;
-  }
-
-  quda_ptr::~quda_ptr()
-  {
-    getProfile().TPSTART(QUDA_PROFILE_FREE);
-
-    if (size > 0) {
-      switch (type) {
-      case QUDA_MEMORY_DEVICE:        pool ? pool_device_free(device) : device_free(device); break;
-      case QUDA_MEMORY_DEVICE_PINNED: device_pinned_free(device); break;
-      case QUDA_MEMORY_HOST:          host_free(host); break;
-      case QUDA_MEMORY_HOST_PINNED:   pool ? pool_pinned_free(host) : host_free(host); break;
-      case QUDA_MEMORY_MAPPED:        host_free(host); break;
-      default: errorQuda("Unknown memory type %d", type);
-      }
-    }
-
-    device = nullptr;
-    host = nullptr;
-
-    getProfile().TPSTOP(QUDA_PROFILE_FREE);
-  }
-
-  bool quda_ptr::is_device() const
-  {
-    switch (type) {
-    case QUDA_MEMORY_DEVICE:
-    case QUDA_MEMORY_DEVICE_PINNED:
-    case QUDA_MEMORY_MAPPED:
-    case QUDA_MEMORY_MANAGED:
-      return true;
-    default: return false;
-    }
-  }
-
-  bool quda_ptr::is_host() const
-  {
-    switch (type) {
-    case QUDA_MEMORY_HOST:
-    case QUDA_MEMORY_HOST_PINNED:
-    case QUDA_MEMORY_MANAGED:
-      return true;
-    default: return false;
-    }
-  }
-
-  void *quda_ptr::data() const
-  {
-    void *ptr = nullptr;
-
-    switch (type) {
-    case QUDA_MEMORY_DEVICE:
-    case QUDA_MEMORY_DEVICE_PINNED:
-    case QUDA_MEMORY_MAPPED:
-    case QUDA_MEMORY_MANAGED:
-      ptr = device;
-      break;
-    case QUDA_MEMORY_HOST:
-    case QUDA_MEMORY_HOST_PINNED:
-      ptr = host;
-      break;
-    default: errorQuda("Unknown memory type %d", type);
-    }
-
-    return ptr;
-  }
-
-  void *quda_ptr::data_device() const
-  {
-    if (!device) errorQuda("Device view not defined");
-    return device;
-  }
-
-  void *quda_ptr::data_host() const
-  {
-    if (!host) errorQuda("Host view not defined");
-    return host;
-  }
-
 } // namespace quda

From 2516878950c8cb97fd336d65b2760e398b3e9db4 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Sun, 4 Jun 2023 22:31:45 -0700
Subject: [PATCH 28/60] Add missing utility header

---
 lib/quda_ptr.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/quda_ptr.cpp b/lib/quda_ptr.cpp
index 8b366afcbb..7db16b641d 100644
--- a/lib/quda_ptr.cpp
+++ b/lib/quda_ptr.cpp
@@ -1,3 +1,4 @@
+#include <utility>
 #include "quda_ptr.h"
 #include "util_quda.h"
 #include "timer.h"

From 27badee5f11d2e3887250a08937b5df785830237 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 23 Jun 2023 10:24:05 -0700
Subject: [PATCH 29/60] Fix issue with Wilson MG

---
 include/clover_field_order.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/clover_field_order.h b/include/clover_field_order.h
index 05b77eee63..65d5ef6cff 100644
--- a/include/clover_field_order.h
+++ b/include/clover_field_order.h
@@ -312,7 +312,7 @@ namespace quda {
       static constexpr int N = nColor * nSpin / 2;
       reconstruct_t<Float, N * N, clover::reconstruct()> recon;
       FloatNAccessor(const CloverField &A, bool inverse = false) :
-        a(A.data<Float *>(inverse)),
+        a(A.Bytes() ? A.data<Float *>(inverse) : nullptr),
         stride(A.VolumeCB()),
         offset_cb(A.Bytes() / (2 * sizeof(Float))),
         compressed_block_size(A.compressed_block_size()),
@@ -403,7 +403,9 @@ namespace quda {
       const int N = nSpin * nColor / 2;
       const complex<Float> zero;
       Accessor(const CloverField &A, bool inverse = false) :
-        a(A.data<Float *>(inverse)), offset_cb(A.Bytes() / (2 * sizeof(Float))), zero(complex<Float>(0.0, 0.0))
+        a(A.Bytes() ? A.data<Float *>(inverse) : nullptr),
+        offset_cb(A.Bytes() / (2 * sizeof(Float))),
+        zero(complex<Float>(0.0, 0.0))
       {
       }
 

From dd66595e1a5e2f61a04d5ec4dbc494b3da5840b4 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 11 Aug 2023 09:18:22 -0700
Subject: [PATCH 30/60] Removed unneeded static_cast

---
 lib/coarsecoarse_op_mma.in.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/coarsecoarse_op_mma.in.cu b/lib/coarsecoarse_op_mma.in.cu
index 8ccd052a1c..eee43a43ac 100644
--- a/lib/coarsecoarse_op_mma.in.cu
+++ b/lib/coarsecoarse_op_mma.in.cu
@@ -43,7 +43,7 @@ namespace quda {
         output = new GaugeField(param);
         if (copy_content) output->copy(X);
       }
-      return static_cast<GaugeField *>(output);
+      return output;
     };
 
     auto Y_order = create_gauge_copy(Y, gOrder, false);

From e818659c4f3b1056eaad12728eb06b9ea89cce13 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 11 Aug 2023 18:24:22 -0700
Subject: [PATCH 31/60] Fix HIP builds

---
 lib/targets/hip/quda_api.cpp | 76 ++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 43 deletions(-)

diff --git a/lib/targets/hip/quda_api.cpp b/lib/targets/hip/quda_api.cpp
index 9191ec16a3..6d9345a884 100644
--- a/lib/targets/hip/quda_api.cpp
+++ b/lib/targets/hip/quda_api.cpp
@@ -261,6 +261,13 @@ namespace quda
     QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line);
   }
 
+  void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func, const char *file,
+                   const char *line)
+  {
+    if (count == 0) return;
+    QudaMem copy(dst.data(), src.data(), count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line);
+  }
+
   void qudaMemcpyAsync_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const qudaStream_t &stream,
                         const char *func, const char *file, const char *line)
   {
@@ -288,6 +295,16 @@ namespace quda
     QudaMem set(ptr, value, count, device::get_default_stream(), false, func, file, line);
   }
 
+  void qudaMemset_(quda_ptr &ptr, int value, size_t count, const char *func, const char *file, const char *line)
+  {
+    if (count == 0) return;
+    if (ptr.is_device()) {
+      QudaMem set(ptr.data(), value, count, device::get_default_stream(), false, func, file, line);
+    } else {
+      memset(ptr.data(), value, count);
+    }
+  }
+
   void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
                         const char *file, const char *line)
   {
@@ -295,18 +312,26 @@ namespace quda
     QudaMem copy(ptr, value, count, stream, true, func, file, line);
   }
 
-  void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func,
-                     const char *file, const char *line)
+  void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream,
+                        const char *func, const char *file, const char *line)
   {
-    hipError_t error = hipMemset2D(ptr, pitch, value, width, height);
-    set_runtime_error(error, __func__, func, file, line);
+    if (count == 0) return;
+    if (ptr.is_device()) {
+      QudaMem set(ptr.data(), value, count, stream, true, func, file, line);
+    } else {
+      memset(ptr.data(), value, count);
+    }
   }
 
-  void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream,
-                          const char *func, const char *file, const char *line)
+  void qudaMemset2DAsync_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height,
+                          const qudaStream_t &stream, const char *func, const char *file, const char *line)
   {
-    hipError_t error = hipMemset2DAsync(ptr, pitch, value, width, height, get_stream(stream));
-    set_runtime_error(error, __func__, func, file, line);
+    if (ptr.is_device()) {
+      hipError_t error = hipMemset2DAsync(static_cast<char*>(ptr.data()) + offset, pitch, value, width, height, get_stream(stream));
+      set_runtime_error(error, __func__, func, file, line);
+    } else {
+      for (auto i = 0u; i < height; i++) memset(static_cast<char*>(ptr.data()) + offset + i * pitch, value, width);
+    }
   }
 
   void qudaMemPrefetchAsync_(void *, size_t, QudaFieldLocation, const qudaStream_t &, const char *, const char *,
@@ -315,41 +340,6 @@ namespace quda
     // No prefetch
   }
 
-#if 0
-  bool qudaEventQuery_(qudaEvent_t &quda_event, const char *func, const char *file, const char *line)
-  {
-    cudaEvent_t &event = reinterpret_cast<cudaEvent_t&>(quda_event.event);
-#ifdef USE_DRIVER_API
-    PROFILE(CUresult error = cuEventQuery(event), QUDA_PROFILE_EVENT_QUERY);
-    switch (error) {
-    case CUDA_SUCCESS: return true;
-    case CUDA_ERROR_NOT_READY: return false;
-    default: set_driver_error(error, __func__, func, file, line);
-    }
-#else
-    PROFILE(cudaError_t error = cudaEventQuery(event), QUDA_PROFILE_EVENT_QUERY);
-    switch (error) {
-    case cudaSuccess: return true;
-    case cudaErrorNotReady: return false;
-    default: set_runtime_error(error, __func__, func, file, line);
-    }
-#endif
-    return false;
-  }
-
-  void qudaEventRecord_(qudaEvent_t &quda_event, qudaStream_t stream, const char *func, const char *file, const char *line)
-  {
-    cudaEvent_t &event = reinterpret_cast<cudaEvent_t&>(quda_event.event);
-#ifdef USE_DRIVER_API
-    PROFILE(CUresult error = cuEventRecord(event, get_stream(stream)), QUDA_PROFILE_EVENT_RECORD);
-    set_driver_error(error, __func__, func, file, line);
-#else
-    PROFILE(cudaError_t error = cudaEventRecord(event, get_stream(stream)), QUDA_PROFILE_EVENT_RECORD);
-    set_runtime_error(error, __func__, func, file, line);
-#endif
-  }
-#endif
-
   bool qudaEventQuery_(qudaEvent_t &quda_event, const char *func, const char *file, const char *line)
   {
     hipEvent_t &event = reinterpret_cast<hipEvent_t &>(quda_event.event);

From 50987b1b96a55143adae9810e1e45b38b7f93b13 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 18 Aug 2023 15:15:14 -0700
Subject: [PATCH 32/60] Minor review comment

---
 tests/host_reference/gauge_force_reference.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp
index 83c5251e27..eb18f10568 100644
--- a/tests/host_reference/gauge_force_reference.cpp
+++ b/tests/host_reference/gauge_force_reference.cpp
@@ -492,8 +492,6 @@ void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int **
   param.t_boundary = QUDA_PERIODIC_T;
 
   auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R);
-  //quda::TimeProfile dummy("blah");
-  //auto qdp_ex = quda::createExtendedGauge(u, R, dummy);
   lattice_t lat(*qdp_ex);
 
   void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)};

From f8b324439be89fd6056c934241186bd47db154d6 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 18 Aug 2023 18:05:15 -0700
Subject: [PATCH 33/60] Add default assignment operator for TimeProfile class

---
 include/timer.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/timer.h b/include/timer.h
index 0d529867cb..b819b81bb2 100644
--- a/include/timer.h
+++ b/include/timer.h
@@ -205,6 +205,7 @@ namespace quda {
   public:
     TimeProfile() = default;
     TimeProfile(const TimeProfile &) = default;
+    TimeProfile& operator=(const TimeProfile &) = default;
 
     TimeProfile(std::string fname) : fname(fname), switchOff(false), use_global(true) { ; }
 

From 06d2dcbeb259fb35a4b0e95561d594167d278daa Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 29 Aug 2023 15:38:55 -0700
Subject: [PATCH 34/60] Further cleanup and minor fixes

---
 lib/interface_quda.cpp | 334 +++++++++++++----------------------------
 1 file changed, 101 insertions(+), 233 deletions(-)

diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index e63f9c1dc4..19a97983b0 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -429,16 +429,14 @@ void initQudaDevice(int dev)
   initialized = true;
 
   profileInit2End.TPSTART(QUDA_PROFILE_TOTAL);
-  profileInit.TPSTART(QUDA_PROFILE_TOTAL);
+  pushProfile(profileInit);
   profileInit.TPSTART(QUDA_PROFILE_INIT);
 
-  if (getVerbosity() >= QUDA_SUMMARIZE) {
 #ifdef GITVERSION
-    printfQuda("QUDA %s (git %s)\n",quda_version.c_str(),gitversion);
+  logQuda(QUDA_SUMMARIZE, "QUDA %s (git %s)\n",quda_version.c_str(),gitversion);
 #else
-    printfQuda("QUDA %s\n",quda_version.c_str());
+  logQuda(QUDA_SUMMARIZE, "QUDA %s\n",quda_version.c_str());
 #endif
-  }
 
 #ifdef MULTI_GPU
   if (dev < 0) {
@@ -466,7 +464,7 @@ void initQudaDevice(int dev)
   }
 
   profileInit.TPSTOP(QUDA_PROFILE_INIT);
-  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
 }
 
 /*
@@ -474,7 +472,7 @@ void initQudaDevice(int dev)
  */
 void initQudaMemory()
 {
-  profileInit.TPSTART(QUDA_PROFILE_TOTAL);
+  pushProfile(profileInit);
   profileInit.TPSTART(QUDA_PROFILE_INIT);
 
   if (!comms_initialized) init_default_comms();
@@ -498,7 +496,7 @@ void initQudaMemory()
   for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
 
   profileInit.TPSTOP(QUDA_PROFILE_INIT);
-  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile(); // balances the pushProfile(profileInit) at the top of initQudaMemory (was a second push)
 }
 
 void updateR()
@@ -564,25 +562,20 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
   if (!initialized) errorQuda("QUDA not initialized");
   if (getVerbosity() == QUDA_DEBUG_VERBOSE) printQudaGaugeParam(param);
 
-  profileGauge.TPSTART(QUDA_PROFILE_INIT);
   // Set the specific input parameters and create the cpu gauge field
   GaugeFieldParam gauge_param(*param, h_gauge);
 
   if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-  GaugeField *in = (param->location == QUDA_CPU_FIELD_LOCATION) ?
-    static_cast<GaugeField*>(new GaugeField(gauge_param)) :
-    static_cast<GaugeField*>(new GaugeField(gauge_param));
+  GaugeField *in = GaugeField::Create(gauge_param);
 
   if (in->Order() == QUDA_BQCD_GAUGE_ORDER) {
     static size_t checksum = SIZE_MAX;
     size_t in_checksum = in->checksum(true);
     if (in_checksum == checksum) {
-      if (getVerbosity() >= QUDA_VERBOSE)
-        printfQuda("Gauge field unchanged - using cached gauge field %lu\n", checksum);
-      profileGauge.TPSTOP(QUDA_PROFILE_INIT);
-      profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
+      logQuda(QUDA_VERBOSE, "Gauge field unchanged - using cached gauge field %lu\n", checksum);
       delete in;
       invalidate_clover = false;
+      popProfile();
       return;
     }
     checksum = in_checksum;
@@ -627,9 +620,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
     precise->copy(*gaugePrecise);
     precise->exchangeGhost();
     freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-    profileGauge.TPSTOP(QUDA_PROFILE_INIT);
   } else {
-    profileGauge.TPSTOP(QUDA_PROFILE_INIT);
     precise->copy(*in);
   }
 
@@ -637,10 +628,8 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
   if (param->type == QUDA_SMEARED_LINKS) {
     gaugeSmeared = createExtendedGauge(*precise, R, profileGauge);
 
-    profileGauge.TPSTART(QUDA_PROFILE_FREE);
     delete precise;
     delete in;
-    profileGauge.TPSTOP(QUDA_PROFILE_FREE);
 
     popProfile();
     return;
@@ -751,9 +740,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
       errorQuda("Invalid gauge type %d", param->type);
   }
 
-  profileGauge.TPSTART(QUDA_PROFILE_FREE);
   delete in;
-  profileGauge.TPSTOP(QUDA_PROFILE_FREE);
 
   if (extendedGaugeResident) {
     // updated the resident gauge field if needed
@@ -809,7 +796,6 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
 {
   pushProfile(profileClover);
   pushVerbosity(inv_param->verbosity);
-  profileClover.TPSTART(QUDA_PROFILE_INIT);
 
   checkCloverParam(inv_param);
   bool device_calc = false; // calculate clover and inverse on the device?
@@ -847,8 +833,6 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
 
   CloverField *in = nullptr;
 
-  profileClover.TPSTOP(QUDA_PROFILE_INIT);
-
   bool clover_update = false;
   // If either of the clover params have changed, trigger a recompute
   double csw_old = cloverPrecise ? cloverPrecise->Csw() : 0.0;
@@ -862,11 +846,10 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
 
   // compute or download clover field only if gauge field has been updated or clover field doesn't exist
   if (clover_update) {
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating new clover field\n");
+    logQuda(QUDA_VERBOSE, "Creating new clover field\n");
     freeSloppyCloverQuda();
     if (cloverPrecise) delete cloverPrecise;
 
-    profileClover.TPSTART(QUDA_PROFILE_INIT);
     cloverPrecise = new CloverField(clover_param);
 
     if (!device_calc || inv_param->return_clover || inv_param->return_clover_inverse) {
@@ -882,16 +865,13 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
       inParam.reconstruct = false;
       in = new CloverField(inParam);
     }
-    profileClover.TPSTOP(QUDA_PROFILE_INIT);
 
     if (!device_calc) {
       cloverPrecise->copy(*in, false);
       if ((h_clovinv && !inv_param->compute_clover_inverse) && !clover::dynamic_inverse())
         cloverPrecise->copy(*in, true);
     } else {
-      profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
       createCloverQuda(inv_param);
-      profileClover.TPSTART(QUDA_PROFILE_TOTAL);
     }
 
     if ((!h_clovinv || inv_param->compute_clover_inverse) && !clover::dynamic_inverse()) {
@@ -902,7 +882,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
       }
     }
   } else {
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached clover field\n");
+    logQuda(QUDA_VERBOSE, "Gauge field unchanged - using cached clover field\n");
   }
 
   // if requested, copy back the clover / inverse field
@@ -929,9 +909,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
     delete tmp;
   }
 
-  profileClover.TPSTART(QUDA_PROFILE_FREE);
   if (in) delete in; // delete object referencing input field
-  profileClover.TPSTOP(QUDA_PROFILE_FREE);
 
   QudaPrecision prec[] = {inv_param->clover_cuda_prec_sloppy, inv_param->clover_cuda_prec_precondition,
                           inv_param->clover_cuda_prec_refinement_sloppy, inv_param->clover_cuda_prec_eigensolver};
@@ -1355,7 +1333,7 @@ void flushChronoQuda(int i)
 
 void endQuda(void)
 {
-  profileEnd.TPSTART(QUDA_PROFILE_TOTAL);
+  if (!initialized) return;
 
-  if (!initialized) return;
+  pushProfile(profileEnd); // pushed only after the early-out, so the push is always matched by the popProfile() near the end of endQuda
 
@@ -1394,7 +1372,7 @@ void endQuda(void)
   comm_finalize();
   comms_initialized = false;
 
-  profileEnd.TPSTOP(QUDA_PROFILE_TOTAL);
+  popProfile();
   profileInit2End.TPSTOP(QUDA_PROFILE_TOTAL);
 
   // print out the profile information of the lifetime of the library
@@ -1498,15 +1476,11 @@ namespace quda {
       }
       memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls);
       memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls);
-      if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
-        printfQuda("Printing b_5 and c_5 values\n");
-        for (int i = 0; i < diracParam.Ls; i++) {
-          printfQuda("fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", i, diracParam.b_5[i].real(),
-              diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag());
-          // printfQuda("fromQUDA inv_param: b5[%d] = %f %f c5[%d] = %f %f\n", i, inv_param->b_5[i], i,
-          // inv_param->c_5[i] ); printfQuda("fromQUDA creal: b5[%d] = %f %f c5[%d] = %f %f \n", i,
-          // creal(inv_param->b_5[i]), cimag(inv_param->b_5[i]), i, creal(inv_param->c_5[i]), cimag(inv_param->c_5[i]) );
-        }
+      logQuda(QUDA_DEBUG_VERBOSE, "Printing b_5 and c_5 values\n");
+      for (int i = 0; i < diracParam.Ls; i++) {
+        logQuda(QUDA_DEBUG_VERBOSE, "fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n",
+                i, diracParam.b_5[i].real(),
+                diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag());
       }
       break;
     case QUDA_STAGGERED_DSLASH:
@@ -1807,7 +1781,6 @@ namespace quda {
 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
 {
   pushProfile(profileDslash);
-  profileDslash.TPSTART(QUDA_PROFILE_INIT);
 
   const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
 
@@ -1835,13 +1808,11 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
   DiracParam diracParam;
   setDiracParam(diracParam, inv_param, pc);
 
-  profileDslash.TPSTOP(QUDA_PROFILE_INIT);
-
   in = in_h;
 
   profileDslash.TPSTART(QUDA_PROFILE_COMPUTE);
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
+  logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
 
   if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION &&
       (inv_param->dslash_type == QUDA_STAGGERED_DSLASH ||
@@ -1873,11 +1844,9 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
 
   out_h = out;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
+  logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
 
-  profileDslash.TPSTART(QUDA_PROFILE_FREE);
   delete dirac; // clean up
-  profileDslash.TPSTOP(QUDA_PROFILE_FREE);
 
   popVerbosity();
   popProfile();
@@ -1906,7 +1875,7 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
   ColorSpinorField in(cudaParam);
   in = in_h;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
+  logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
 
   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   cudaParam.location = QUDA_CUDA_FIELD_LOCATION;
@@ -1938,8 +1907,7 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
   ColorSpinorField out_h(cpuParam);
   out_h = out;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
-
+  logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
   popVerbosity();
 }
 
@@ -1967,7 +1935,7 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
   ColorSpinorField in(cudaParam);
   in = in_h;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
+  logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
 
   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   ColorSpinorField out(cudaParam);
@@ -2001,8 +1969,7 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
   ColorSpinorField out_h(cpuParam);
   out_h = out;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
-
+  logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
   popVerbosity();
 }
 
@@ -2148,7 +2115,7 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
   ColorSpinorField in(cudaParam);
   in = in_h;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
+  logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
 
   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   ColorSpinorField out(cudaParam);
@@ -2175,8 +2142,7 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
   ColorSpinorField out_h(cpuParam);
   out_h = out;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
-
+  logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
   popVerbosity();
 }
 
@@ -2184,7 +2150,6 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
 {
   if (!initialized) errorQuda("QUDA not initialized");
   pushProfile(profileEigensolve);
-  profileEigensolve.TPSTART(QUDA_PROFILE_INIT);
 
   // Transfer the inv param structure contained in eig_param.
   // This will define the operator to be eigensolved.
@@ -2306,8 +2271,6 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
     }
   }
   //------------------------------------------------------
-  profileEigensolve.TPSTOP(QUDA_PROFILE_INIT);
-
   // We must construct the correct Dirac operator type based on the three
   // options: The normal operator, the daggered operator, and if we pre
   // multiply by gamma5. Each combination requires a unique Dirac operator
@@ -2346,11 +2309,9 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
     for (int i = 0; i < n_eig; i++) host_evecs_[i] = kSpace[i];
   }
 
-  profileEigensolve.TPSTART(QUDA_PROFILE_FREE);
   delete d;
   delete dSloppy;
   delete dPre;
-  profileEigensolve.TPSTOP(QUDA_PROFILE_FREE);
 
   popVerbosity();
 
@@ -2362,7 +2323,6 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
 
 multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
   : profile(profile) {
-  profile.TPSTART(QUDA_PROFILE_INIT);
   QudaInvertParam *param = mg_param.invert_param;
   // set whether we are going use native or generic blas
   blas_lapack::set_native(param->native_blas_lapack);
@@ -2441,22 +2401,19 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr
 
   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();
-  profile.TPSTOP(QUDA_PROFILE_INIT);
 }
 
 void* newMultigridQuda(QudaMultigridParam *mg_param) {
   profilerStart(__func__);
-
+  pushProfile(profileInvert);
   pushVerbosity(mg_param->invert_param->verbosity);
 
-  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
   auto *mg = new multigrid_solver(*mg_param, profileInvert);
-  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
 
   saveTuneCache();
 
   popVerbosity();
-
+  popProfile();
   profilerStop(__func__);
   return static_cast<void*>(mg);
 }
@@ -2468,10 +2425,9 @@ void destroyMultigridQuda(void *mg) {
 void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
 {
   profilerStart(__func__);
-
+  pushProfile(profileInvert);
   pushVerbosity(mg_param->invert_param->verbosity);
 
-  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
   profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE);
 
   auto *mg = static_cast<multigrid_solver*>(mg_);
@@ -2573,18 +2529,17 @@ void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
   saveTuneCache();
 
   profileInvert.TPSTOP(QUDA_PROFILE_PREAMBLE);
-  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
 
   popVerbosity();
-
+  popProfile();
   profilerStop(__func__);
 }
 
 void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
 {
   profilerStart(__func__);
+  pushProfile(profileInvert);
   pushVerbosity(mg_param->invert_param->verbosity);
-  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
 
   auto *mg = static_cast<multigrid_solver*>(mg_);
   checkMultigridParam(mg_param);
@@ -2592,8 +2547,8 @@ void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
 
   mg->mg->dumpNullVectors();
 
-  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
   popVerbosity();
+  popProfile();
   profilerStop(__func__);
 }
 
@@ -2604,8 +2559,6 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
 
   if (param->inv_type != QUDA_EIGCG_INVERTER && param->inv_type != QUDA_INC_EIGCG_INVERTER) return;
 
-  profile.TPSTART(QUDA_PROFILE_INIT);
-
   GaugeField *cudaGauge = checkGauge(param);
   eig_param.secs   = 0;
   eig_param.gflops = 0;
@@ -2659,16 +2612,12 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
   deflParam = new DeflationParam(eig_param, RV, *m);
 
   defl = new Deflation(*deflParam, profile);
-
-  profile.TPSTOP(QUDA_PROFILE_INIT);
 }
 
 void* newDeflationQuda(QudaEigParam *eig_param) {
-  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
+  pushProfile(profileInvert);
   auto *defl = new deflated_solver(*eig_param, profileInvert);
-
-  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
-
+  popProfile();
   saveProfile(__func__);
   flushProfile();
   return static_cast<void*>(defl);
@@ -2811,17 +2760,8 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
 
   dirac.prepare(in, out, x, b, param->solution_type);
 
-  if (getVerbosity() >= QUDA_VERBOSE) {
-    double nin = blas::norm2(*in);
-    double nout = blas::norm2(*out);
-    printfQuda("Prepared source = %g\n", nin);
-    printfQuda("Prepared solution = %g\n", nout);
-  }
-
-  if (getVerbosity() >= QUDA_VERBOSE) {
-    double nin = blas::norm2(*in);
-    printfQuda("Prepared source post mass rescale = %g\n", nin);
-  }
+  logQuda(QUDA_VERBOSE, "Prepared source = %g\n", blas::norm2(*in));
+  logQuda(QUDA_VERBOSE, "Prepared solution = %g\n", blas::norm2(*out));
 
   // solution_type specifies *what* system is to be solved.
   // solve_type specifies *how* the system is to be solved.
@@ -2968,7 +2908,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
     solverParam.updateInvertParam(*param);
   }
 
-  if (getVerbosity() >= QUDA_VERBOSE) { printfQuda("Solution = %g\n", blas::norm2(x)); }
+  logQuda(QUDA_VERBOSE, "Solution = %g\n", blas::norm2(x));
 
   profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);
   if (param->chrono_make_resident) {
@@ -3026,8 +2966,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   }
   profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
-  profileInvert.TPSTART(QUDA_PROFILE_FREE);
-
   if (param->use_resident_solution && !param->make_resident_solution) solutionResident.clear();
 
   delete d;
@@ -3035,14 +2973,11 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   delete dPre;
   delete dEig;
 
-  profileInvert.TPSTOP(QUDA_PROFILE_FREE);
-
-  popVerbosity();
-
   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();
 
   profilerStop(__func__);
+  popVerbosity();
   popProfile();
 }
 
@@ -3112,12 +3047,13 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
   */
 
   profilerStart(__func__);
+  pushProfile(profileInvertMultiSrc);
 
   CommKey split_key = {param->split_grid[0], param->split_grid[1], param->split_grid[2], param->split_grid[3]};
   int num_sub_partition = quda::product(split_key);
 
   if (!split_key.is_valid()) {
-    errorQuda("split_key = [%d,%d,%d,%d] is not valid.\n", split_key[0], split_key[1], split_key[2], split_key[3]);
+    errorQuda("split_key = [%d,%d,%d,%d] is not valid", split_key[0], split_key[1], split_key[2], split_key[3]);
   }
 
   if (num_sub_partition == 1) { // In this case we don't split the grid.
@@ -3126,10 +3062,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
 
   } else {
 
-    profileInvertMultiSrc.TPSTART(QUDA_PROFILE_TOTAL);
-    profileInvertMultiSrc.TPSTART(QUDA_PROFILE_INIT);
-
-    if (gauge_param == nullptr) { errorQuda("gauge_param == nullptr.\n"); }
+    if (gauge_param == nullptr) { errorQuda("gauge_param == nullptr"); }
 
     // Doing the sub-partition arithmatics
     if (param->num_src_per_sub_partition * num_sub_partition != param->num_src) {
@@ -3143,7 +3076,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH) { pc_type = QUDA_5D_PC; }
 
     // Doesn't work for MG yet.
-    if (param->inv_type_precondition == QUDA_MG_INVERTER) { errorQuda("Split Grid does NOT work with MG yet."); }
+    if (param->inv_type_precondition == QUDA_MG_INVERTER) errorQuda("Split Grid does NOT work with MG yet");
 
     checkInvertParam(param, _hp_x[0], _hp_b[0]);
 
@@ -3169,14 +3102,14 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     // set up the gauge field params.
     if (!is_staggered) { // not staggered
       gf_param = new GaugeFieldParam(*gauge_param, h_gauge);
-      if (gf_param->order <= 4) { gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; }
+      if (gf_param->order <= 4) gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
       in = GaugeField::Create(*gf_param);
     } else { // staggered
       milc_fatlink_param = new GaugeFieldParam(*gauge_param, milc_fatlinks);
-      if (milc_fatlink_param->order <= 4) { milc_fatlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; }
+      if (milc_fatlink_param->order <= 4) milc_fatlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
       milc_fatlink_field = GaugeField::Create(*milc_fatlink_param);
       milc_longlink_param = new GaugeFieldParam(*gauge_param, milc_longlinks);
-      if (milc_longlink_param->order <= 4) { milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; }
+      if (milc_longlink_param->order <= 4) milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
       milc_longlink_field = GaugeField::Create(*milc_longlink_param);
     }
 
@@ -3200,13 +3133,14 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     }
 
     // Make the gauge param dimensions larger
-    if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
-      printfQuda("Spliting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d).\n", comm_dim(0),
-                 comm_dim(1), comm_dim(2), comm_dim(3), split_key[0], split_key[1], split_key[2], split_key[3]);
-    }
+    logQuda(QUDA_DEBUG_VERBOSE,
+            "Spliting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d)\n",
+            comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3),
+            split_key[0], split_key[1], split_key[2], split_key[3]);
+
     for (int d = 0; d < CommKey::n_dim; d++) {
       if (comm_dim(d) % split_key[d] != 0) {
-        errorQuda("Split not possible: %2d %% %2d != 0.", comm_dim(d), split_key[d]);
+        errorQuda("Split not possible: %2d %% %2d != 0", comm_dim(d), split_key[d]);
       }
       if (!is_staggered) {
         gf_param->x[d] *= split_key[d];
@@ -3283,7 +3217,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
       quda::split_field(*collected_milc_longlink_field, v_g, split_key);
     }
 
-    profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_INIT);
     profileInvertMultiSrc.TPSTART(QUDA_PROFILE_PREAMBLE);
 
     comm_barrier();
@@ -3309,11 +3242,10 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     comm_barrier();
 
     profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_PREAMBLE);
-    profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_TOTAL);
 
     // Load gauge field after pushing the split communicator so the comm buffers, etc are setup according to
     // the split topology.
-    if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading gauge field...\n"); }
+    logQuda(QUDA_DEBUG_VERBOSE, "Split grid loading gauge field...\n");
     if (!is_staggered) {
       loadGaugeQuda(collected_gauge->data(), gauge_param);
     } else {
@@ -3321,24 +3253,23 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
       loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->data(),
                            collected_milc_longlink_field->data());
     }
-    if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loaded gauge field...\n"); }
+    logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded gauge field...\n");
 
     if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH
         || param->dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) {
-      if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading clover field...\n"); }
+      logQuda(QUDA_DEBUG_VERBOSE, "Split grid loading clover field...\n");
       if (collected_clover) {
         loadCloverQuda(collected_clover->data(false), collected_clover->data(true), param);
       } else {
         loadCloverQuda(nullptr, nullptr, param);
       }
-      if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loaded clover field...\n"); }
+      logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded clover field...\n");
     }
 
     for (int n = 0; n < param->num_src_per_sub_partition; n++) {
       op(_collect_x[n]->data(), _collect_b[n]->data(), param, args...);
     }
 
-    profileInvertMultiSrc.TPSTART(QUDA_PROFILE_TOTAL);
     profileInvertMultiSrc.TPSTART(QUDA_PROFILE_EPILOGUE);
     push_communicator(default_comm_key);
     updateR();
@@ -3376,7 +3307,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     if (collected_clover) { delete collected_clover; }
 
     profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_EPILOGUE);
-    profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_TOTAL);
 
     // Restore the gauge field
     if (!is_staggered) {
@@ -3391,6 +3321,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     }
   }
 
+  popProfile();
   profilerStop(__func__);
 }
 
@@ -3453,8 +3384,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
   pushProfile(profileMulti);
   profilerStart(__func__);
 
-  profileMulti.TPSTART(QUDA_PROFILE_INIT);
-
   if (!initialized) errorQuda("QUDA not initialized");
 
   checkInvertParam(param, hp_x[0], hp_b);
@@ -3558,7 +3487,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
     h_x[i] = std::make_unique<ColorSpinorField>(cpuParam);
   }
 
-  profileMulti.TPSTOP(QUDA_PROFILE_INIT);
   // Now I need a colorSpinorParam for the device
   ColorSpinorParam cudaParam(cpuParam, *param, QUDA_CUDA_FIELD_LOCATION);
   // This setting will download a host vector
@@ -3566,7 +3494,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
   cudaParam.field = &h_b;
   ColorSpinorField b(cudaParam); // Creates b and downloads h_b to it
 
-  profileMulti.TPSTART(QUDA_PROFILE_INIT);
   // Create the solution fields filled with zero
   cudaParam.create = QUDA_ZERO_FIELD_CREATE;
 
@@ -3586,8 +3513,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
   std::vector<ColorSpinorField> &x = solutionResident;
   std::vector<ColorSpinorField> p;
 
-  profileMulti.TPSTOP(QUDA_PROFILE_INIT);
-
   profileMulti.TPSTART(QUDA_PROFILE_PREAMBLE);
 
   // Check source norms
@@ -3634,10 +3559,8 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
 
   if (param->compute_true_res) {
     // check each shift has the desired tolerance and use sequential CG to refine
-    profileMulti.TPSTART(QUDA_PROFILE_INIT);
     cudaParam.create = QUDA_ZERO_FIELD_CREATE;
     ColorSpinorField r(cudaParam);
-    profileMulti.TPSTOP(QUDA_PROFILE_INIT);
     QudaInvertParam refineparam = *param;
     refineparam.cuda_prec_sloppy = param->cuda_prec_refinement_sloppy;
     Dirac &dirac = *d;
@@ -3667,9 +3590,9 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
       const double refine_tol = (param->tol_offset[i] == 0.0 ? iter_tol : param->tol_offset[i]);
       // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0
       if (param->true_res_offset[i] > refine_tol || rsd_hq > tol_hq) {
-	if (getVerbosity() >= QUDA_SUMMARIZE)
-	  printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
-		     i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
+	logQuda(QUDA_SUMMARIZE,
+                "Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
+                i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
 
         // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)
         if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
@@ -3767,8 +3690,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
       blas::ax(sqrt(nb), x[i]);
     }
 
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Solution %d = %g\n", i, blas::norm2(x[i]));
-
+    logQuda(QUDA_VERBOSE, "Solution %d = %g\n", i, blas::norm2(x[i]));
     if (!param->make_resident_solution) *h_x[i] = x[i];
   }
 
@@ -3778,19 +3700,16 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
 
   profileMulti.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
-  profileMulti.TPSTART(QUDA_PROFILE_FREE);
   delete d;
   delete dSloppy;
   delete dPre;
   delete dRefine;
-  profileMulti.TPSTOP(QUDA_PROFILE_FREE);
-
-  popVerbosity();
 
   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();
 
   profilerStop(__func__);
+  popVerbosity();
   popProfile();
 }
 
@@ -3883,7 +3802,6 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
     gParam.setPrecision(param->cuda_prec, true);
     gParam.create = QUDA_NULL_FIELD_CREATE;
     GaugeField cudaInLink(gParam);
-    profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
 
     cudaInLink.copy(cpuInLink);
     cudaInLinkEx = createExtendedGauge(cudaInLink, R, profileGaussianSmear);
@@ -3903,7 +3821,7 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
 
   freeUniqueGaugeQuda(QUDA_SMEARED_LINKS);
   gaugeSmeared = new GaugeField(gsParam);
-  
+
   computeTwoLink(*gaugeSmeared, *cudaInLinkEx);
   gaugeSmeared->exchangeGhost();
 
@@ -4206,7 +4124,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   GaugeField cudaForce(gParam);
   GaugeField *cudaForce_[2] = {&cudaForce};
 
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT);
   ColorSpinorParam qParam;
   qParam.location = QUDA_CUDA_FIELD_LOCATION;
   qParam.nColor = 3;
@@ -4221,7 +4138,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   qParam.x[4] = 1;
   qParam.create = QUDA_NULL_FIELD_CREATE;
   qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT);
 
   // resident gauge field is required
   if (!gauge_param->use_resident_gauge || !gaugePrecise) errorQuda("Resident gauge field is required");
@@ -4233,8 +4149,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
               gauge_param->staggered_phase_type, gaugePrecise->StaggeredPhase());
   }
 
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT);
-
   const int nvector = inv_param->num_offset;
   std::vector<ColorSpinorField*> X(nvector);
   for (int i=0; i<nvector; i++) X[i] = ColorSpinorField::Create(qParam);
@@ -4254,7 +4168,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   setDiracParam(diracParam, inv_param, pc_solve);
   Dirac *dirac = Dirac::create(diracParam);
 
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT);
   profileStaggeredForce.TPSTART(QUDA_PROFILE_PREAMBLE);
 
   for (int i=0; i<nvector; i++) {
@@ -4270,12 +4183,10 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
 
   profileStaggeredForce.TPSTOP(QUDA_PROFILE_PREAMBLE);
 
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE);
 #if 0
   if (inv_param->use_resident_solution) solutionResident.clear();
 #endif
   delete dirac;
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE);
 
   // compute quark-field outer product
   for (int i=0; i<nvector; i++) {
@@ -4297,9 +4208,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   if (gauge_param->make_resident_mom && !gauge_param->use_resident_mom) std::exchange(momResident, cudaMom);
   else momResident = GaugeField();
 
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE);
   for (int i=0; i<nvector; i++) delete X[i];
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE);
 
   popProfile();
 }
@@ -4405,10 +4314,8 @@ void computeHISQForceQuda(void* const milc_momentum,
       for (int i = 0; i < num_terms; ++i) {
 
         // Wrap the MILC quark field
-        profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
         qParam.v = fermion[i];
         ColorSpinorField cpuQuark(qParam); // create host quark field
-        profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
         cudaQuark = cpuQuark;
         computeStaggeredOprod(oprod, cudaQuark, coeff[i], 3);
@@ -4424,10 +4331,8 @@ void computeHISQForceQuda(void* const milc_momentum,
       for (int i = 0; i < num_naik_terms; ++i) {
 
         // Wrap the MILC quark field
-        profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
         qParam.v = fermion[i + num_terms - num_naik_terms];
         ColorSpinorField cpuQuark(qParam); // create host quark field
-        profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
         cudaQuark = cpuQuark;
         computeStaggeredOprod(oprod, cudaQuark, coeff[i + num_terms], 3);
@@ -4615,7 +4520,6 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
 {
   using namespace quda;
   pushProfile(profileCloverForce);
-  profileCloverForce.TPSTART(QUDA_PROFILE_INIT);
 
   checkGaugeParam(gauge_param);
   if (!gaugePrecise) errorQuda("No resident gauge field");
@@ -4689,8 +4593,6 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   fParam.geometry = QUDA_TENSOR_GEOMETRY;
   GaugeField oprod(fParam);
 
-  profileCloverForce.TPSTOP(QUDA_PROFILE_INIT);
-
   std::vector<double> force_coeff(nvector);
   // loop over different quark fields
   for(int i=0; i<nvector; i++){
@@ -4703,10 +4605,8 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
       qParam.x[0] /= 2;
 
       // Wrap the even-parity MILC quark field
-      profileCloverForce.TPSTART(QUDA_PROFILE_INIT);
       qParam.v = h_x[i];
       ColorSpinorField cpuQuarkX(qParam); // create host quark field
-      profileCloverForce.TPSTOP(QUDA_PROFILE_INIT);
 
       x.Even() = cpuQuarkX;
 
@@ -4762,8 +4662,6 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   // copy the outer product field back to the host
   cpuMom.copy(cudaMom);
 
-  profileCloverForce.TPSTART(QUDA_PROFILE_FREE);
-
   for (int i=0; i<nvector; i++) {
     delete quarkX[i];
     delete quarkP[i];
@@ -5034,23 +4932,23 @@ void copyExtendedResidentGaugeQuda(void *resident_gauge)
 
 void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int n_steps, double alpha)
 {
+  pushProfile(profileWuppertal);
+  pushVerbosity(inv_param->verbosity);
   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
 
-  pushVerbosity(inv_param->verbosity);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
 
   GaugeField *precise = nullptr;
 
   if (gaugeSmeared != nullptr) {
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Wuppertal smearing done with gaugeSmeared\n");
+    logQuda(QUDA_VERBOSE, "Wuppertal smearing done with gaugeSmeared\n");
     GaugeFieldParam gParam(*gaugePrecise);
     gParam.create = QUDA_NULL_FIELD_CREATE;
     precise = new GaugeField(gParam);
     copyExtendedGauge(*precise, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION);
     precise->exchangeGhost();
   } else {
-    if (getVerbosity() >= QUDA_VERBOSE)
-      printfQuda("Wuppertal smearing done with gaugePrecise\n");
+    logQuda(QUDA_VERBOSE, "Wuppertal smearing done with gaugePrecise\n");
     precise = gaugePrecise;
   }
 
@@ -5061,11 +4959,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param,
   ColorSpinorField in(cudaParam);
   in = in_h;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
-    double cpu = blas::norm2(in_h);
-    double gpu = blas::norm2(in);
-    printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
-  }
+  logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
 
   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   ColorSpinorField out(cudaParam);
@@ -5085,10 +4979,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param,
   for (unsigned int i = 0; i < n_steps; i++) {
     if (i) in = out;
     ApplyLaplace(out, in, *precise, 3, a, b, in, parity, false, comm_dim, profileWuppertal);
-    if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
-      double norm = blas::norm2(out);
-      printfQuda("Step %d, vector norm %e\n", i, norm);
-    }
+    logQuda(QUDA_DEBUG_VERBOSE, "Step %d, vector norm %e\n", i, blas::norm2(out));
   }
 
   cpuParam.v = h_out;
@@ -5096,34 +4987,29 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param,
   ColorSpinorField out_h(cpuParam);
   out_h = out;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
-    double cpu = blas::norm2(out_h);
-    double gpu = blas::norm2(out);
-    printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
-  }
+  logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
 
-  if (gaugeSmeared != nullptr)
-    delete precise;
+  if (gaugeSmeared != nullptr) delete precise;
 
   popVerbosity();
+  popProfile();
 }
- 
+
 
 void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_param)
 {
   if (smear_param->n_steps == 0) return;
   pushProfile(profileGaussianSmear);
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT);
-  
+
   QudaInvertParam *inv_param = smear_param->inv_param;
 
   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
-    
+
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
 
   if ( gaugeSmeared == nullptr || smear_param->compute_2link != 0 ) {
-  
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gaussian smearing done with gaugeSmeared\n");
+
+    logQuda(QUDA_VERBOSE, "Gaussian smearing done with gaugeSmeared\n");
     freeUniqueGaugeQuda(QUDA_SMEARED_LINKS);
 
     GaugeFieldParam gParam(*gaugePrecise);
@@ -5137,14 +5023,14 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
     gParam.pad = gParam.pad*gParam.nFace;
     //
     gaugeSmeared = new GaugeField(gParam);
-    
+
     GaugeField *two_link_ext = createExtendedGauge(*gaugePrecise, R, profileGauge);//aux field
-    
+
     computeTwoLink(*gaugeSmeared, *two_link_ext);
-    
+
     gaugeSmeared->exchangeGhost();
-    
-    delete two_link_ext;   
+
+    delete two_link_ext;
   }
 
   if (!initialized) errorQuda("QUDA not initialized");
@@ -5152,13 +5038,13 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printQudaInvertParam(inv_param); }
 
   checkInvertParam(inv_param);
-  
+
   // Create device side ColorSpinorField vectors and to pass to the
   // compute function.
   const lat_dim_t X = gaugeSmeared->X();
-  
+
   inv_param->dslash_type = QUDA_ASQTAD_DSLASH;
-  
+
   ColorSpinorParam cpuParam(h_in, *inv_param, X, QUDA_MAT_SOLUTION, QUDA_CPU_FIELD_LOCATION);
   cpuParam.nSpin = 1;
   // QUDA style pointer for host data.
@@ -5172,7 +5058,7 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   ColorSpinorField in(cudaParam);
   ColorSpinorField out(cudaParam);
   ColorSpinorField temp1(cudaParam);
- 
+
   // Create the smearing operator
   //------------------------------------------------------
   Dirac *d       = nullptr;
@@ -5197,10 +5083,9 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
     errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(), inv_param->cuda_prec);
   //
   d = Dirac::create(diracParam); // create the Dirac operator
-  
+
   Dirac &dirac = *d;
   DiracM qsmear_op(dirac);
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
 
   // Copy host data to device
   in = in_h;
@@ -5208,20 +5093,17 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   const double ftmp    = -(smear_param->width*smear_param->width)/(4.0*smear_param->n_steps*4.0);  /* Extra 4 to compensate for stride 2 */
   // Scale up the source to prevent underflow
   profileGaussianSmear.TPSTART(QUDA_PROFILE_COMPUTE);
-  
-  const double msq     = 1. / ftmp;  
+
+  const double msq     = 1. / ftmp;
   const double a       = inv_param->laplace3D * 2.0 + msq;
   const QudaParity  parity   = QUDA_INVALID_PARITY;
   for (int i = 0; i < smear_param->n_steps; i++) {
     if (i > 0) std::swap(in, out);
     blas::ax(ftmp, in);
     blas::axpy(a, in, temp1);
-    
+
     qsmear_op.Expose()->SmearOp(out, in, a, 0.0, smear_param->t0, parity);
-    if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
-      double norm = blas::norm2(out);
-      printfQuda("Step %d, vector norm %e\n", i, norm);
-    }
+    logQuda(QUDA_DEBUG_VERBOSE, "Step %d, vector norm %e\n", i, blas::norm2(out));
     blas::xpay(temp1, -1.0, out);
     blas::zero(temp1);
   }
@@ -5231,12 +5113,8 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   // Copy device data to host.
   in_h = out;
 
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE);
-
-  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Finished 2link Gaussian smearing.\n");
-
-  delete d;
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_FREE);
+  logQuda(QUDA_VERBOSE, "Finished 2link Gaussian smearing.\n");
 
   smear_param->gflops = dirac.Flops();
+  delete d; // dirac is a reference to *d, so free it only after the flop count has been read
 
@@ -5263,9 +5141,7 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
 
   int measurement_n = 0; // The nth measurement to take
   gaugeObservablesQuda(&obs_param[measurement_n]);
-  if (getVerbosity() >= QUDA_SUMMARIZE) {
-    printfQuda("Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge);
-  }
+  logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge);
 
   for (unsigned int i = 0; i < smear_param->n_steps; i++) {
     switch (smear_param->smear_type) {
@@ -5280,9 +5156,7 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
     if ((i + 1) % smear_param->meas_interval == 0) {
       measurement_n++;
       gaugeObservablesQuda(&obs_param[measurement_n]);
-      if (getVerbosity() >= QUDA_SUMMARIZE) {
-        printfQuda("Q charge at step %03d = %+.16e\n", i + 1, obs_param[measurement_n].qcharge);
-      }
+      logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", i + 1, obs_param[measurement_n].qcharge);
     }
   }
 
@@ -5314,11 +5188,10 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam
 
   gaugeObservables(in, obs_param[measurement_n]);
 
-  if (getVerbosity() >= QUDA_SUMMARIZE) {
-    printfQuda("flow t, plaquette, E_tot, E_spatial, E_temporal, Q charge\n");
-    printfQuda("%le %.16e %+.16e %+.16e %+.16e %+.16e\n", 0.0, obs_param[0].plaquette[0], obs_param[0].energy[0],
-               obs_param[0].energy[1], obs_param[0].energy[2], obs_param[0].qcharge);
-  }
+  logQuda(QUDA_SUMMARIZE, "flow t, plaquette, E_tot, E_spatial, E_temporal, Q charge\n");
+  logQuda(QUDA_SUMMARIZE, "%le %.16e %+.16e %+.16e %+.16e %+.16e\n", 0.0,
+          obs_param[0].plaquette[0], obs_param[0].energy[0],
+          obs_param[0].energy[1], obs_param[0].energy[2], obs_param[0].qcharge);
 
   for (unsigned int i = 0; i < smear_param->n_steps; i++) {
     // Perform W1, W2, and Vt Wilson Flow steps as defined in
@@ -5329,12 +5202,10 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam
     if ((i + 1) % smear_param->meas_interval == 0) {
       measurement_n++; // increment measurements.
       gaugeObservables(out, obs_param[measurement_n]);
-      if (getVerbosity() >= QUDA_SUMMARIZE) {
-        printfQuda("%le %.16e %+.16e %+.16e %+.16e %+.16e\n", smear_param->epsilon * (i + 1),
-                   obs_param[measurement_n].plaquette[0], obs_param[measurement_n].energy[0],
-                   obs_param[measurement_n].energy[1], obs_param[measurement_n].energy[2],
-                   obs_param[measurement_n].qcharge);
-      }
+      logQuda(QUDA_SUMMARIZE, "%le %.16e %+.16e %+.16e %+.16e %+.16e\n", smear_param->epsilon * (i + 1),
+              obs_param[measurement_n].plaquette[0], obs_param[measurement_n].energy[0],
+              obs_param[measurement_n].energy[1], obs_param[measurement_n].energy[2],
+              obs_param[measurement_n].qcharge);
     }
   }
 
@@ -5430,8 +5301,6 @@ void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const Quda
   // DMH: Easiest way to construct ColorSpinorField? Do we require the user
   //     to declare and fill and invert_param, or can it just be hacked?.
 
-  profileContract.TPSTART(QUDA_PROFILE_INIT);
-
   // wrap CPU host side pointers
   lat_dim_t X_ = {X[0], X[1], X[2], X[3]};
   ColorSpinorParam cpuParam((void *)hp_x, *param, X_, false, param->input_location);
@@ -5454,7 +5323,6 @@ void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const Quda
 
   size_t data_bytes = x[0].Volume() * x[0].Nspin() * x[0].Nspin() * 2 * x[0].Precision();
   void *d_result = pool_device_malloc(data_bytes);
-  profileContract.TPSTOP(QUDA_PROFILE_INIT);
 
   x[0] = h_x;
   y[0] = h_y;

From a61fbbaf9e4e32e9e21be2ed1c93f1dcb84ecb2d Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 30 Aug 2023 10:54:12 -0700
Subject: [PATCH 35/60] Fix issues with staggered_invert_test related to
 gauge-field unification

---
 tests/staggered_invert_test.cpp | 249 +++++++++++++-------------------
 1 file changed, 102 insertions(+), 147 deletions(-)

diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index de60f45d41..ea5aab17fd 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -102,60 +102,8 @@ void display_test_info()
              dimPartitioned(3));
 }
 
-int main(int argc, char **argv)
+void test(int argc, char **argv)
 {
-  setQudaDefaultMgTestParams();
-  // Parse command line options
-  auto app = make_app();
-  add_eigen_option_group(app);
-  add_deflation_option_group(app);
-  add_multigrid_option_group(app);
-  add_comms_option_group(app);
-  CLI::TransformPairs<int> test_type_map {{"full", 0}, {"full_ee_prec", 1}, {"full_oo_prec", 2}, {"even", 3},
-                                          {"odd", 4},  {"mcg_even", 5},     {"mcg_odd", 6}};
-  app->add_option("--test", test_type, "Test method")->transform(CLI::CheckedTransformer(test_type_map));
-  try {
-    app->parse(argc, argv);
-  } catch (const CLI::ParseError &e) {
-    return app->exit(e);
-  }
-  setVerbosity(verbosity);
-  if (!inv_multigrid) solve_type = QUDA_INVALID_SOLVE;
-
-  if (inv_deflate && inv_multigrid) {
-    printfQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve.\n");
-    exit(0);
-  }
-
-  // Set values for precisions via the command line.
-  setQudaPrecisions();
-
-  // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp)
-  initComms(argc, argv, gridsize_from_cmdline);
-
-  initRand();
-
-  // Only these fermions are supported in this file. Ensure a reasonable default,
-  // ensure that the default is improved staggered
-  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) {
-    printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type),
-               get_dslash_str(QUDA_ASQTAD_DSLASH));
-    dslash_type = QUDA_ASQTAD_DSLASH;
-  }
-
-  // Need to add support for LAPLACE MG?
-  if (inv_multigrid) {
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) {
-      printfQuda("dslash_type %s not supported for multigrid preconditioner\n", get_dslash_str(dslash_type));
-      exit(0);
-    }
-  }
-
-  // Deduce operator, solution, and operator preconditioning types
-  if (!inv_multigrid) setQudaStaggeredInvTestParams();
-
-  display_test_info();
-
   // Set QUDA internal parameters
   QudaGaugeParam gauge_param = newQudaGaugeParam();
   QudaInvertParam inv_param = newQudaInvertParam();
@@ -167,11 +115,7 @@ int main(int argc, char **argv)
   QudaEigParam mg_eig_param[mg_levels];
 
   // params related to split grid.
-  inv_param.split_grid[0] = grid_partition[0];
-  inv_param.split_grid[1] = grid_partition[1];
-  inv_param.split_grid[2] = grid_partition[2];
-  inv_param.split_grid[3] = grid_partition[3];
-
+  for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
   int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
   bool use_split_grid = num_sub_partition > 1;
 
@@ -205,9 +149,6 @@ int main(int argc, char **argv)
     inv_param.eig_param = nullptr;
   }
 
-  // This must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
-  initQuda(device_ordinal);
-
   setDims(gauge_param.X);
   // Hack: use the domain wall dimensions so we may use the 5th dim for multi indexing
   dw_setDims(gauge_param.X, 1);
@@ -215,29 +156,35 @@ int main(int argc, char **argv)
   // Staggered Gauge construct START
   //-----------------------------------------------------------------------------------
   // Allocate host staggered gauge fields
-  void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *milc_fatlink = nullptr;
-  void *milc_longlink = nullptr;
-  GaugeField *cpuFat = nullptr;
-  GaugeField *cpuLong = nullptr;
-
-  for (int dir = 0; dir < 4; dir++) {
-    qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-    qdp_fatlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-    qdp_longlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-  }
-  milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-  milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-
-  // For load, etc
+  gauge_param.type = (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) ?
+    QUDA_SU3_LINKS :
+    QUDA_ASQTAD_FAT_LINKS;
   gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
+  gauge_param.location = QUDA_CPU_FIELD_LOCATION;
 
+  GaugeFieldParam cpuParam(gauge_param);
+  cpuParam.create = QUDA_NULL_FIELD_CREATE;
+  cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
+  cpuParam.order = QUDA_QDP_GAUGE_ORDER;
+  GaugeField cpuIn = GaugeField(cpuParam);
+  GaugeField cpuFatQDP = GaugeField(cpuParam);
+  cpuParam.order = QUDA_MILC_GAUGE_ORDER;
+  GaugeField cpuFatMILC = GaugeField(cpuParam);
+
+  cpuParam.link_type = QUDA_ASQTAD_LONG_LINKS;
+  cpuParam.nFace = 3;
+  cpuParam.order = QUDA_QDP_GAUGE_ORDER;
+  GaugeField cpuLongQDP = GaugeField(cpuParam);
+  cpuParam.order = QUDA_MILC_GAUGE_ORDER;
+  GaugeField cpuLongMILC = GaugeField(cpuParam);
+
+  void* qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)};
+  void* qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)};
+  void* qdp_longlink[4] = {cpuLongQDP.data(0), cpuLongQDP.data(1), cpuLongQDP.data(2), cpuLongQDP.data(3)};
   constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv);
   // Reorder gauge fields to MILC order
-  reorderQDPtoMILC(milc_fatlink, qdp_fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-  reorderQDPtoMILC(milc_longlink, qdp_longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+  cpuFatMILC = cpuFatQDP;
+  cpuLongMILC = cpuLongQDP;
 
   // Compute plaquette. Routine is aware that the gauge fields already have the phases on them.
   // This needs to be called before `loadFatLongGaugeQuda` because this routine also loads the
@@ -252,23 +199,14 @@ int main(int argc, char **argv)
     printfQuda("Computed fat link plaquette is %e (spatial = %e, temporal = %e)\n", plaq[0], plaq[1], plaq[2]);
   }
 
-  // Create ghost gauge fields in case of multi GPU builds.
-  gauge_param.type = (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) ?
-    QUDA_SU3_LINKS :
-    QUDA_ASQTAD_FAT_LINKS;
-  gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
-  gauge_param.location = QUDA_CPU_FIELD_LOCATION;
-
-  GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink);
-  cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-  cpuFat = GaugeField::Create(cpuFatParam);
+  loadFatLongGaugeQuda(cpuFatMILC.data(), cpuLongMILC.data(), gauge_param);
 
-  gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
-  GaugeFieldParam cpuLongParam(gauge_param, milc_longlink);
-  cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-  cpuLong = GaugeField::Create(cpuLongParam);
-
-  loadFatLongGaugeQuda(milc_fatlink, milc_longlink, gauge_param);
+  // now copy back to QDP aliases, since these are used for the reference dslash
+  cpuFatQDP = cpuFatMILC;
+  cpuLongQDP = cpuLongMILC;
+  // ensure QDP alias has exchanged ghosts
+  cpuFatQDP.exchangeGhost();
+  cpuLongQDP.exchangeGhost();
 
   // Staggered Gauge construct END
   //-----------------------------------------------------------------------------------
@@ -283,33 +221,27 @@ int main(int argc, char **argv)
 
   // Staggered vector construct START
   //-----------------------------------------------------------------------------------
-  std::vector<quda::ColorSpinorField *> in;
-  std::vector<quda::ColorSpinorField *> out;
-  quda::ColorSpinorField *ref;
-  quda::ColorSpinorField *tmp;
+  std::vector<quda::ColorSpinorField> in(Nsrc);
+  std::vector<quda::ColorSpinorField> out(Nsrc);
   quda::ColorSpinorParam cs_param;
   constructStaggeredTestSpinorParam(&cs_param, &inv_param, &gauge_param);
   for (int k = 0; k < Nsrc; k++) {
-    in.emplace_back(quda::ColorSpinorField::Create(cs_param));
-    out.emplace_back(quda::ColorSpinorField::Create(cs_param));
+    in[k] = quda::ColorSpinorField(cs_param);
+    out[k] = quda::ColorSpinorField(cs_param);
   }
-  ref = quda::ColorSpinorField::Create(cs_param);
-  tmp = quda::ColorSpinorField::Create(cs_param);
+  ColorSpinorField ref(cs_param);
+  ColorSpinorField tmp(cs_param);
   // Staggered vector construct END
   //-----------------------------------------------------------------------------------
 
   // Prepare rng
-  auto *rng = new quda::RNG(*ref, 1234);
+  quda::RNG rng(ref, 1234);
 
   // Performance measuring
   std::vector<double> time(Nsrc);
   std::vector<double> gflops(Nsrc);
   std::vector<int> iter(Nsrc);
 
-  // Pointers for split grid tests
-  std::vector<quda::ColorSpinorField *> _h_b(Nsrc, nullptr);
-  std::vector<quda::ColorSpinorField *> _h_x(Nsrc, nullptr);
-
   // QUDA invert test
   //----------------------------------------------------------------------------
 
@@ -320,17 +252,14 @@ int main(int argc, char **argv)
     // case 3: // even parity solution, solving EVEN system
     // case 4: // odd parity solution, solving ODD system
 
-    if (multishift != 1) {
-      printfQuda("Multishift not supported for test %d\n", test_type);
-      exit(0);
-    }
+    if (multishift != 1) errorQuda("Multishift not supported for test %d\n", test_type);
 
-    for (int k = 0; k < Nsrc; k++) { quda::spinorNoise(*in[k], *rng, QUDA_NOISE_UNIFORM); }
+    for (int k = 0; k < Nsrc; k++) { quda::spinorNoise(in[k], rng, QUDA_NOISE_UNIFORM); }
 
     if (!use_split_grid) {
       for (int k = 0; k < Nsrc; k++) {
         if (inv_deflate) eig_param.preserve_deflation = k < Nsrc - 1 ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
-        invertQuda(out[k]->data(), in[k]->data(), &inv_param);
+        invertQuda(out[k].data(), in[k].data(), &inv_param);
         time[k] = inv_param.secs;
         gflops[k] = inv_param.gflops / inv_param.secs;
         iter[k] = inv_param.iter;
@@ -341,13 +270,13 @@ int main(int argc, char **argv)
       std::vector<void *> _hp_x(Nsrc);
       std::vector<void *> _hp_b(Nsrc);
       for (int k = 0; k < Nsrc; k++) {
-        _hp_x[k] = out[k]->data();
-        _hp_b[k] = in[k]->data();
+        _hp_x[k] = out[k].data();
+        _hp_b[k] = in[k].data();
       }
       inv_param.num_src = Nsrc;
       inv_param.num_src_per_sub_partition = Nsrc / num_sub_partition;
-      invertMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, (void *)milc_fatlink, (void *)milc_longlink,
-                                  &gauge_param);
+      invertMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, cpuFatMILC.data(),
+                                  cpuLongMILC.data(), &gauge_param);
       quda::comm_allreduce_int(inv_param.iter);
       inv_param.iter /= comm_size() / num_sub_partition;
       quda::comm_allreduce_sum(inv_param.gflops);
@@ -359,7 +288,7 @@ int main(int argc, char **argv)
 
     for (int k = 0; k < Nsrc; k++) {
       if (verify_results)
-        verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, *cpuFat, *cpuLong, gauge_param, inv_param, 0);
+        verifyStaggeredInversion(tmp, ref, in[k], out[k], mass, cpuFatQDP, cpuLongQDP, gauge_param, inv_param, 0);
     }
   } else if (test_type == 5 || test_type == 6) {
     // case 5: // multi mass CG, even parity solution, solving EVEN system
@@ -403,8 +332,8 @@ int main(int argc, char **argv)
     }
 
     for (int k = 0; k < Nsrc; k++) {
-      quda::spinorNoise(*in[k], *rng, QUDA_NOISE_UNIFORM);
-      invertMultiShiftQuda((void **)outArray.data(), in[k]->data(), &inv_param);
+      quda::spinorNoise(in[k], rng, QUDA_NOISE_UNIFORM);
+      invertMultiShiftQuda((void **)outArray.data(), in[k].data(), &inv_param);
 
       time[k] = inv_param.secs;
       gflops[k] = inv_param.gflops / inv_param.secs;
@@ -414,7 +343,7 @@ int main(int argc, char **argv)
 
       for (int i = 0; i < multishift; i++) {
         printfQuda("%dth solution: mass=%f, ", i, masses[i]);
-        verifyStaggeredInversion(*tmp, *ref, *in[k], qudaOutArray[i], masses[i], *cpuFat, *cpuLong, gauge_param, inv_param, i);
+        verifyStaggeredInversion(tmp, ref, in[k], qudaOutArray[i], masses[i], cpuFatQDP, cpuLongQDP, gauge_param, inv_param, i);
       }
     }
   } else {
@@ -424,39 +353,65 @@ int main(int argc, char **argv)
   // Compute timings
   if (Nsrc > 1 && !use_split_grid) performanceStats(time, gflops, iter);
 
-  // Free RNG
-  delete rng;
-
   // Free the multigrid solver
   if (inv_multigrid) destroyMultigridQuda(mg_preconditioner);
+}
+
+int main(int argc, char **argv)
+{
+  setQudaDefaultMgTestParams();
+  // Parse command line options
+  auto app = make_app();
+  add_eigen_option_group(app);
+  add_deflation_option_group(app);
+  add_multigrid_option_group(app);
+  add_comms_option_group(app);
+  CLI::TransformPairs<int> test_type_map {{"full", 0}, {"full_ee_prec", 1}, {"full_oo_prec", 2}, {"even", 3},
+                                          {"odd", 4},  {"mcg_even", 5},     {"mcg_odd", 6}};
+  app->add_option("--test", test_type, "Test method")->transform(CLI::CheckedTransformer(test_type_map));
+  try {
+    app->parse(argc, argv);
+  } catch (const CLI::ParseError &e) {
+    return app->exit(e);
+  }
+  setVerbosity(verbosity);
+  if (!inv_multigrid) solve_type = QUDA_INVALID_SOLVE;
 
-  // Clean up gauge fields
-  for (int dir = 0; dir < 4; dir++) {
-    host_free(qdp_inlink[dir]);
-    host_free(qdp_fatlink[dir]);
-    host_free(qdp_longlink[dir]);
+  if (inv_deflate && inv_multigrid) {
+    errorQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve");
   }
-  host_free(milc_fatlink);
-  host_free(milc_longlink);
 
-  if (cpuFat != nullptr) {
-    delete cpuFat;
-    cpuFat = nullptr;
+  // Set values for precisions via the command line.
+  setQudaPrecisions();
+
+  // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp)
+  initComms(argc, argv, gridsize_from_cmdline);
+
+  initRand();
+
+  // Only these fermions are supported in this file. Ensure a reasonable default,
+  // ensure that the default is improved staggered
+  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) {
+    printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type),
+               get_dslash_str(QUDA_ASQTAD_DSLASH));
+    dslash_type = QUDA_ASQTAD_DSLASH;
   }
-  if (cpuLong != nullptr) {
-    delete cpuLong;
-    cpuLong = nullptr;
+
+  // Need to add support for LAPLACE MG?
+  if (inv_multigrid) {
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) {
+      errorQuda("dslash_type %s not supported for multigrid preconditioner", get_dslash_str(dslash_type));
+    }
   }
 
-  for (auto in_vec : in) { delete in_vec; }
-  for (auto out_vec : out) { delete out_vec; }
-  delete ref;
-  delete tmp;
+  // Deduce operator, solution, and operator preconditioning types
+  if (!inv_multigrid) setQudaStaggeredInvTestParams();
 
-  if (use_split_grid) {
-    for (auto p : _h_b) { delete p; }
-    for (auto p : _h_x) { delete p; }
-  }
+  display_test_info();
+
+  initQuda(device_ordinal);
+
+  test(argc, argv);
 
   // Finalize the QUDA library
   endQuda();

From 3963f6329ef8af6d3cc56d1ce6b44859bdf77dbb Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 30 Aug 2023 10:58:32 -0700
Subject: [PATCH 36/60] Pushing a profile onto the stack is now handled using
 an auxiliary container allowing us to use RAII, resulting in auto-popping
 when the container goes out of scope

---
 include/timer.h        |  17 +++-
 lib/interface_quda.cpp | 182 +++++++++++++++--------------------------
 lib/timer.cpp          |   5 +-
 3 files changed, 82 insertions(+), 122 deletions(-)

diff --git a/include/timer.h b/include/timer.h
index b819b81bb2..2de1829c18 100644
--- a/include/timer.h
+++ b/include/timer.h
@@ -229,10 +229,25 @@ namespace quda {
 
   };
 
-  void pushProfile(TimeProfile &profile);
-
-  void popProfile();
+  /**
+     @brief Container that we use for pushing a profile onto the
+     profile stack.  While this object is in scope it will exist on
+     the profile stack, and be popped when its destructor is called.
+     Copying is disabled, since a copy would pop the profile stack a
+     second time when it is destroyed.
+   */
+  struct pushProfile {
+    TimeProfile &profile;
+    pushProfile(TimeProfile &profile);
+    pushProfile(const pushProfile &) = delete;
+    pushProfile &operator=(const pushProfile &) = delete;
+    ~pushProfile();
+  };
 
+  /**
+     @brief Return a reference to the present profile at the top of
+     the stack
+   */
   TimeProfile& getProfile();
 
 } // namespace quda
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 19a97983b0..0dbe006026 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -429,7 +429,7 @@ void initQudaDevice(int dev)
   initialized = true;
 
   profileInit2End.TPSTART(QUDA_PROFILE_TOTAL);
-  pushProfile(profileInit);
+  auto profile = pushProfile(profileInit);
   profileInit.TPSTART(QUDA_PROFILE_INIT);
 
 #ifdef GITVERSION
@@ -464,7 +464,6 @@ void initQudaDevice(int dev)
   }
 
   profileInit.TPSTOP(QUDA_PROFILE_INIT);
-  popProfile();
 }
 
 /*
@@ -472,7 +471,7 @@ void initQudaDevice(int dev)
  */
 void initQudaMemory()
 {
-  pushProfile(profileInit);
+  auto profile = pushProfile(profileInit);
   profileInit.TPSTART(QUDA_PROFILE_INIT);
 
   if (!comms_initialized) init_default_comms();
@@ -496,7 +495,6 @@ void initQudaMemory()
   for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
 
   profileInit.TPSTOP(QUDA_PROFILE_INIT);
-  pushProfile(profileInit);
 }
 
 void updateR()
@@ -556,7 +554,7 @@ void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeFiel
 
 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 {
-  pushProfile(profileGauge);
+  auto profile = pushProfile(profileGauge);
   checkGaugeParam(param);
 
   if (!initialized) errorQuda("QUDA not initialized");
@@ -575,7 +573,6 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
       logQuda(QUDA_VERBOSE, "Gauge field unchanged - using cached gauge field %lu\n", checksum);
       delete in;
       invalidate_clover = false;
-      popProfile();
       return;
     }
     checksum = in_checksum;
@@ -631,7 +628,6 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
     delete precise;
     delete in;
 
-    popProfile();
     return;
   }
 
@@ -749,13 +745,11 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
     // Use the static R (which is defined at the very beginning of lib/interface_quda.cpp) here
     extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGauge, false, recon);
   }
-
-  popProfile();
 }
 
 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 {
-  pushProfile(profileGauge);
+  auto profile = pushProfile(profileGauge);
 
   if (param->location != QUDA_CPU_FIELD_LOCATION) errorQuda("Non-cpu output location not yet supported");
 
@@ -785,8 +779,6 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
   cpuGauge.copy(*cudaGauge);
 
   if (param->type == QUDA_SMEARED_LINKS) { delete cudaGauge; }
-
-  popProfile();
 }
 
 void loadSloppyCloverQuda(const QudaPrecision prec[]);
@@ -794,7 +786,7 @@ void freeSloppyCloverQuda();
 
 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
 {
-  pushProfile(profileClover);
+  auto profile = pushProfile(profileClover);
   pushVerbosity(inv_param->verbosity);
 
   checkCloverParam(inv_param);
@@ -916,7 +908,6 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
   loadSloppyCloverQuda(prec);
 
   popVerbosity();
-  popProfile();
 }
 
 void freeSloppyCloverQuda();
@@ -1333,46 +1324,47 @@ void flushChronoQuda(int i)
 
 void endQuda(void)
 {
-  pushProfile(profileEnd);
-
   if (!initialized) return;
 
-  freeGaugeQuda();
-  freeCloverQuda();
+  {
+    auto profile = pushProfile(profileEnd);
 
-  for (int i = 0; i < QUDA_MAX_CHRONO; i++) flushChronoQuda(i);
+    freeGaugeQuda();
+    freeCloverQuda();
 
-  solutionResident.clear();
+    for (int i = 0; i < QUDA_MAX_CHRONO; i++) flushChronoQuda(i);
 
-  LatticeField::freeGhostBuffer();
-  ColorSpinorField::freeGhostBuffer();
-  FieldTmp<ColorSpinorField>::destroy();
+    solutionResident.clear();
 
-  blas_lapack::generic::destroy();
-  blas_lapack::native::destroy();
-  reducer::destroy();
+    LatticeField::freeGhostBuffer();
+    ColorSpinorField::freeGhostBuffer();
+    FieldTmp<ColorSpinorField>::destroy();
 
-  pool::flush_pinned();
-  pool::flush_device();
+    blas_lapack::generic::destroy();
+    blas_lapack::native::destroy();
+    reducer::destroy();
 
-  host_free(num_failures_h);
-  num_failures_h = nullptr;
-  num_failures_d = nullptr;
+    pool::flush_pinned();
+    pool::flush_device();
 
-  destroyDslashEvents();
+    host_free(num_failures_h);
+    num_failures_h = nullptr;
+    num_failures_d = nullptr;
 
-  saveTuneCache();
-  saveProfile();
+    destroyDslashEvents();
 
-  // flush any outstanding force monitoring (if enabled)
-  flushForceMonitor();
+    saveTuneCache();
+    saveProfile();
 
-  initialized = false;
+    // flush any outstanding force monitoring (if enabled)
+    flushForceMonitor();
 
-  comm_finalize();
-  comms_initialized = false;
+    initialized = false;
+
+    comm_finalize();
+    comms_initialized = false;
+  }
 
-  popProfile();
   profileInit2End.TPSTOP(QUDA_PROFILE_TOTAL);
 
   // print out the profile information of the lifetime of the library
@@ -1780,7 +1772,7 @@ namespace quda {
 
 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
 {
-  pushProfile(profileDslash);
+  auto profile = pushProfile(profileDslash);
 
   const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
 
@@ -1849,7 +1841,6 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
   delete dirac; // clean up
 
   popVerbosity();
-  popProfile();
 }
 
 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
@@ -2149,7 +2140,7 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
 void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param)
 {
   if (!initialized) errorQuda("QUDA not initialized");
-  pushProfile(profileEigensolve);
+  auto profile = pushProfile(profileEigensolve);
 
   // Transfer the inv param structure contained in eig_param.
   // This will define the operator to be eigensolved.
@@ -2317,8 +2308,6 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
 
   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();
-
-  popProfile();
 }
 
 multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
@@ -2405,7 +2394,7 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr
 
 void* newMultigridQuda(QudaMultigridParam *mg_param) {
   profilerStart(__func__);
-  pushProfile(profileInvert);
+  auto profile = pushProfile(profileInvert);
   pushVerbosity(mg_param->invert_param->verbosity);
 
   auto *mg = new multigrid_solver(*mg_param, profileInvert);
@@ -2413,7 +2402,6 @@ void* newMultigridQuda(QudaMultigridParam *mg_param) {
   saveTuneCache();
 
   popVerbosity();
-  popProfile();
   profilerStop(__func__);
   return static_cast<void*>(mg);
 }
@@ -2425,7 +2413,7 @@ void destroyMultigridQuda(void *mg) {
 void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
 {
   profilerStart(__func__);
-  pushProfile(profileInvert);
+  auto profile = pushProfile(profileInvert);
   pushVerbosity(mg_param->invert_param->verbosity);
 
   profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE);
@@ -2531,14 +2519,13 @@ void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
   profileInvert.TPSTOP(QUDA_PROFILE_PREAMBLE);
 
   popVerbosity();
-  popProfile();
   profilerStop(__func__);
 }
 
 void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
 {
   profilerStart(__func__);
-  pushProfile(profileInvert);
+  auto profile = pushProfile(profileInvert);
   pushVerbosity(mg_param->invert_param->verbosity);
 
   auto *mg = static_cast<multigrid_solver*>(mg_);
@@ -2548,7 +2535,6 @@ void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
   mg->mg->dumpNullVectors();
 
   popVerbosity();
-  popProfile();
   profilerStop(__func__);
 }
 
@@ -2615,9 +2601,8 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
 }
 
 void* newDeflationQuda(QudaEigParam *eig_param) {
-  pushProfile(profileInvert);
+  auto profile = pushProfile(profileInvert);
   auto *defl = new deflated_solver(*eig_param, profileInvert);
-  popProfile();
   saveProfile(__func__);
   flushProfile();
   return static_cast<void*>(defl);
@@ -2629,7 +2614,7 @@ void destroyDeflationQuda(void *df) {
 
 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
 {
-  pushProfile(profileInvert);
+  auto profile = pushProfile(profileInvert);
   profilerStart(__func__);
 
   if (!initialized) errorQuda("QUDA not initialized");
@@ -2978,7 +2963,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
 
   profilerStop(__func__);
   popVerbosity();
-  popProfile();
 }
 
 void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_param, void *milc_fatlinks,
@@ -3047,7 +3031,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
   */
 
   profilerStart(__func__);
-  pushProfile(profileInvertMultiSrc);
+  auto profile = pushProfile(profileInvertMultiSrc);
 
   CommKey split_key = {param->split_grid[0], param->split_grid[1], param->split_grid[2], param->split_grid[3]};
   int num_sub_partition = quda::product(split_key);
@@ -3321,7 +3305,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     }
   }
 
-  popProfile();
   profilerStop(__func__);
 }
 
@@ -3381,7 +3364,7 @@ void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param
  */
 void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
 {
-  pushProfile(profileMulti);
+  auto profile = pushProfile(profileMulti);
   profilerStart(__func__);
 
   if (!initialized) errorQuda("QUDA not initialized");
@@ -3710,12 +3693,11 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
 
   profilerStop(__func__);
   popVerbosity();
-  popProfile();
 }
 
 void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
 {
-  pushProfile(profileFatLink);
+  auto profile = pushProfile(profileFatLink);
   checkGaugeParam(param);
 
   GaugeFieldParam gParam(*param, fatlink, QUDA_GENERAL_LINKS);
@@ -3778,12 +3760,11 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
   }
 
   delete cudaInLinkEx;
-  popProfile();
 }
 
 void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
 {
-  pushProfile(profileGaussianSmear);
+  auto profile = pushProfile(profileGaussianSmear);
   checkGaugeParam(param);
 
   GaugeFieldParam gParam(*param, inlink, QUDA_ASQTAD_LONG_LINKS);
@@ -3829,14 +3810,12 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
 
   freeUniqueGaugeQuda(QUDA_SMEARED_LINKS);
   delete cudaInLinkEx;
-
-  popProfile();
 }
 
 int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int* path_length,
 			  double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam)
 {
-  pushProfile(profileGaugeForce);
+  auto profile = pushProfile(profileGaugeForce);
   checkGaugeParam(qudaGaugeParam);
 
   GaugeFieldParam gParam(*qudaGaugeParam, siteLink);
@@ -3914,14 +3893,13 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
     delete cudaGauge;
   }
 
-  popProfile();
   return 0;
 }
 
 int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff,
                          int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam)
 {
-  pushProfile(profileGaugePath);
+  auto profile = pushProfile(profileGaugePath);
   checkGaugeParam(qudaGaugeParam);
 
   GaugeFieldParam gParam(*qudaGaugeParam, siteLink);
@@ -3977,13 +3955,12 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
     delete cudaGauge;
   }
 
-  popProfile();
   return 0;
 }
 
 void momResidentQuda(void *mom, QudaGaugeParam *param)
 {
-  pushProfile(profileGaugeForce);
+  auto profile = pushProfile(profileGaugeForce);
   checkGaugeParam(param);
 
   GaugeFieldParam gParamMom(*param, mom, QUDA_ASQTAD_MOM_LINKS);
@@ -4014,13 +3991,11 @@ void momResidentQuda(void *mom, QudaGaugeParam *param)
     cpuMom.copy(momResident);
     momResident = GaugeField();
   }
-
-  popProfile();
 }
 
 void createCloverQuda(QudaInvertParam* invertParam)
 {
-  pushProfile(profileClover);
+  auto profile = pushProfile(profileClover);
   if (!cloverPrecise) errorQuda("Clover field not allocated");
 
   QudaReconstructType recon = (gaugePrecise->Reconstruct() == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_12 : gaugePrecise->Reconstruct();
@@ -4052,7 +4027,6 @@ void createCloverQuda(QudaInvertParam* invertParam)
 
   // FIXME always preserve the extended gauge
   extendedGaugeResident = gauge;
-  popProfile();
 }
 
 void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
@@ -4097,7 +4071,7 @@ void destroyGaugeFieldQuda(void *gauge)
 void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, void **, QudaGaugeParam *gauge_param,
                                QudaInvertParam *inv_param)
 {
-  pushProfile(profileStaggeredForce);
+  auto profile = pushProfile(profileStaggeredForce);
 
   GaugeFieldParam gParam(*gauge_param, h_mom, QUDA_ASQTAD_MOM_LINKS);
 
@@ -4209,8 +4183,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   else momResident = GaugeField();
 
   for (int i=0; i<nvector; i++) delete X[i];
-
-  popProfile();
 }
 
 void computeHISQForceQuda(void* const milc_momentum,
@@ -4226,7 +4198,7 @@ void computeHISQForceQuda(void* const milc_momentum,
                           double **coeff,
                           QudaGaugeParam* gParam)
 {
-  pushProfile(profileHISQForce);
+  auto profile = pushProfile(profileHISQForce);
   checkGaugeParam(gParam);
 
   using namespace quda;
@@ -4510,8 +4482,6 @@ void computeHISQForceQuda(void* const milc_momentum,
     std::exchange(momResident, mom);
   else
     momResident = GaugeField();
-
-  popProfile();
 }
 
 void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double *coeff, double kappa2, double ck,
@@ -4519,7 +4489,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
                             QudaInvertParam *inv_param)
 {
   using namespace quda;
-  pushProfile(profileCloverForce);
+  auto profile = pushProfile(profileCloverForce);
 
   checkGaugeParam(gauge_param);
   if (!gaugePrecise) errorQuda("No resident gauge field");
@@ -4671,13 +4641,11 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   if (inv_param->use_resident_solution) solutionResident.clear();
 #endif
   delete dirac;
-
-  popProfile();
 }
 
 void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, int exact, QudaGaugeParam* param)
 {
-  pushProfile(profileGaugeUpdate);
+  auto profile = pushProfile(profileGaugeUpdate);
   checkGaugeParam(param);
 
   // create the host fields
@@ -4723,13 +4691,11 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom,
 
   if (param->make_resident_mom && !param->use_resident_mom) std::exchange(momResident, cudaMom);
   else momResident = GaugeField();
-
-  popProfile();
 }
 
 void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
 {
-  pushProfile(profileProject);
+  auto profile = pushProfile(profileProject);
   checkGaugeParam(param);
 
   // create the gauge field
@@ -4763,13 +4729,11 @@ void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
     gaugePrecise = new GaugeField();
     std::exchange(*gaugePrecise, cudaGauge);
   }
-
-  popProfile();
 }
 
 void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
 {
-  pushProfile(profilePhase);
+  auto profile = pushProfile(profilePhase);
   checkGaugeParam(param);
 
   // create the gauge field
@@ -4802,14 +4766,12 @@ void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
     gaugePrecise = new GaugeField();
     std::exchange(*gaugePrecise, cudaGauge);
   }
-
-  popProfile();
 }
 
 // evaluate the momentum action
 double momActionQuda(void* momentum, QudaGaugeParam* param)
 {
-  pushProfile(profileMomAction);
+  auto profile = pushProfile(profileMomAction);
   checkGaugeParam(param);
 
   // create the momentum fields
@@ -4834,13 +4796,12 @@ double momActionQuda(void* momentum, QudaGaugeParam* param)
     std::exchange(momResident, cudaMom);
   else momResident = GaugeField();
 
-  popProfile();
   return action;
 }
 
 void gaussGaugeQuda(unsigned long long seed, double sigma)
 {
-  pushProfile(profileGauss);
+  auto profile = pushProfile(profileGauss);
 
   if (!gaugePrecise) errorQuda("Cannot generate Gauss GaugeField as there is no resident gauge field");
   quda::gaugeGauss(*gaugePrecise, seed, sigma);
@@ -4849,16 +4810,13 @@ void gaussGaugeQuda(unsigned long long seed, double sigma)
     extendedGaugeResident->copy(*gaugePrecise);
     extendedGaugeResident->exchangeExtendedGhost(R, profileGauss, redundant_comms);
   }
-
-  popProfile();
 }
 
 void gaussMomQuda(unsigned long long seed, double sigma)
 {
-  pushProfile(profileGauss);
+  auto profile = pushProfile(profileGauss);
   if (!momResident.Volume()) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field");
   quda::gaugeGauss(momResident, seed, sigma);
-  popProfile();
 }
 
 /*
@@ -4866,7 +4824,7 @@ void gaussMomQuda(unsigned long long seed, double sigma)
  */
 void plaqQuda(double plaq[3])
 {
-  pushProfile(profilePlaq);
+  auto profile = pushProfile(profilePlaq);
 
   if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field");
 
@@ -4877,8 +4835,6 @@ void plaqQuda(double plaq[3])
   plaq[0] = plaq3.x;
   plaq[1] = plaq3.y;
   plaq[2] = plaq3.z;
-
-  popProfile();
 }
 
 /*
@@ -4932,7 +4888,7 @@ void copyExtendedResidentGaugeQuda(void *resident_gauge)
 
 void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int n_steps, double alpha)
 {
-  pushProfile(profileWuppertal);
+  auto profile = pushProfile(profileWuppertal);
   pushVerbosity(inv_param->verbosity);
   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
 
@@ -4992,14 +4948,13 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param,
   if (gaugeSmeared != nullptr) delete precise;
 
   popVerbosity();
-  popProfile();
 }
 
 
 void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_param)
 {
   if (smear_param->n_steps == 0) return;
-  pushProfile(profileGaussianSmear);
+  auto profile = pushProfile(profileGaussianSmear);
 
   QudaInvertParam *inv_param = smear_param->inv_param;
 
@@ -5121,13 +5076,12 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   if (smear_param->delete_2link != 0) { freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); }
 
   saveTuneCache();
-  popProfile();
 }
 
 
 void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param)
 {
-  pushProfile(profileGaugeSmear);
+  auto profile = pushProfile(profileGaugeSmear);
   pushOutputPrefix("performGaugeSmearQuda: ");
   checkGaugeSmearParam(smear_param);
 
@@ -5161,12 +5115,11 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
   }
 
   popOutputPrefix();
-  popProfile();
 }
 
 void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param)
 {
-  pushProfile(profileWFlow);
+  auto profile = pushProfile(profileWFlow);
   pushOutputPrefix("performWFlowQuda: ");
   checkGaugeSmearParam(smear_param);
 
@@ -5210,14 +5163,13 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam
   }
 
   popOutputPrefix();
-  popProfile();
 }
 
 int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps,
                               const unsigned int verbose_interval, const double relax_boost, const double tolerance,
                               const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param)
 {
-  pushProfile(GaugeFixOVRQuda);
+  auto profile = pushProfile(GaugeFixOVRQuda);
   checkGaugeParam(param);
 
   GaugeFieldParam gParam(*param, gauge);
@@ -5254,7 +5206,6 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u
     delete cudaInGaugeEx;
   }
 
-  popProfile();
   return 0;
 }
 
@@ -5262,7 +5213,7 @@ int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const u
                               const unsigned int verbose_interval, const double alpha, const unsigned int autotune,
                               const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param)
 {
-  pushProfile(GaugeFixFFTQuda);
+  auto profile = pushProfile(GaugeFixFFTQuda);
   checkGaugeParam(param);
 
   GaugeFieldParam gParam(*param, gauge);
@@ -5290,14 +5241,13 @@ int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const u
     std::exchange(*gaugePrecise, cudaInGauge);
   }
 
-  popProfile();
   return 0;
 }
 
 void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const QudaContractType cType,
                   QudaInvertParam *param, const int *X)
 {
-  pushProfile(profileContract);
+  auto profile = pushProfile(profileContract);
   // DMH: Easiest way to construct ColorSpinorField? Do we require the user
   //     to declare and fill and invert_param, or can it just be hacked?.
 
@@ -5334,12 +5284,11 @@ void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const Quda
   profileContract.TPSTOP(QUDA_PROFILE_D2H);
 
   pool_device_free(d_result);
-  popProfile();
 }
 
 void gaugeObservablesQuda(QudaGaugeObservableParam *param)
 {
-  pushProfile(profileGaugeObs);
+  auto profile = pushProfile(profileGaugeObs);
   checkGaugeObservableParam(param);
 
   if (!gaugePrecise) errorQuda("Cannot compute Polyakov loop as there is no resident gauge field");
@@ -5361,5 +5310,4 @@ void gaugeObservablesQuda(QudaGaugeObservableParam *param)
   }
 
   gaugeObservables(*gauge, *param);
-  popProfile();
 }
diff --git a/lib/timer.cpp b/lib/timer.cpp
index 2214ebd0ec..986d7b045f 100644
--- a/lib/timer.cpp
+++ b/lib/timer.cpp
@@ -202,16 +202,17 @@ namespace quda {
 
   static std::stack<TimeProfile*> tpstack;
 
-  void pushProfile(TimeProfile &profile)
+  pushProfile::pushProfile(TimeProfile &profile) : profile(profile)
   {
     profile.TPSTART(QUDA_PROFILE_TOTAL);
     tpstack.push(&profile);
   }
 
-  void popProfile()
+  pushProfile::~pushProfile()
   {
     if (tpstack.empty()) errorQuda("popProfile() called with empty stack");
     auto &profile = *(tpstack.top());
+    if (&(this->profile) != &profile) errorQuda("Popped profile is not the expected one");
     tpstack.pop();
     profile.TPSTOP(QUDA_PROFILE_TOTAL);
   }

From 426b59a579dd093e76940ecc539f17a3f927b9aa Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 31 Aug 2023 14:38:54 -0700
Subject: [PATCH 37/60] Respond to review comments

---
 include/gauge_field.h  |  6 ++++++
 lib/interface_quda.cpp | 21 +++++++++++----------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/include/gauge_field.h b/include/gauge_field.h
index bf75bc6bfa..297065842f 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -288,6 +288,12 @@ namespace quda {
      */
     GaugeField &operator=(GaugeField &&field);
 
+    /**
+       @brief Returns if the object is empty (not initialized)
+       @return true if the object has not been allocated, otherwise false
+    */
+    bool empty() const { return !init; }
+
     /**
        @brief Create the communication handlers and buffers
        @param[in] R The thickness of the extended region in each dimension
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 0dbe006026..48f342a31b 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3835,7 +3835,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
 
   GaugeField cpuMom = !qudaGaugeParam->use_resident_mom ? GaugeField(gParamMom) : GaugeField();
 
-  if (qudaGaugeParam->use_resident_mom && !momResident.Volume()) errorQuda("No resident momentum field to use");
+  if (qudaGaugeParam->use_resident_mom && momResident.empty()) errorQuda("No resident momentum field to use");
   gParamMom.location = QUDA_CUDA_FIELD_LOCATION;
   gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_COPY_FIELD_CREATE;
   gParamMom.field = &cpuMom;
@@ -4017,7 +4017,7 @@ void createCloverQuda(QudaInvertParam* invertParam)
   GaugeFieldParam tensorParam(gaugePrecise->X(), ex->Precision(), QUDA_RECONSTRUCT_NO, 0, QUDA_TENSOR_GEOMETRY);
   tensorParam.location = QUDA_CUDA_FIELD_LOCATION;
   tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
-  tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
+  tensorParam.setPrecision(tensorParam.Precision(), true);
   tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   GaugeField Fmunu(tensorParam);
   computeFmunu(Fmunu, *ex);
@@ -4039,7 +4039,7 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
   GaugeField *cpuGauge = nullptr;
   if (gauge) cpuGauge = new GaugeField(gParam);
 
-  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
+  gParam.setPrecision(gParam.Precision(), true);
   gParam.create = QUDA_ZERO_FIELD_CREATE;
   auto* cudaGauge = new GaugeField(gParam);
 
@@ -4087,8 +4087,8 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
   gParam.create = QUDA_COPY_FIELD_CREATE;
   gParam.field = &cpuMom;
-  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
+  gParam.setPrecision(gParam.Precision(), true);
   GaugeField cudaMom = gauge_param->use_resident_mom ? momResident.create_alias() : GaugeField(gParam);
 
   // create temporary field for quark-field outer product
@@ -4355,7 +4355,7 @@ void computeHISQForceQuda(void* const milc_momentum,
 
   param.location = QUDA_CUDA_FIELD_LOCATION;
   param.create = QUDA_ZERO_FIELD_CREATE;
-  param.order = QUDA_FLOAT2_GAUGE_ORDER;
+  param.setPrecision(param.Precision(), true);
   GaugeFieldParam momParam(param);
 
   // Create CPU W, V, and U fields
@@ -4504,14 +4504,14 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   // create the device momentum field
   fParam.location = QUDA_CUDA_FIELD_LOCATION;
   fParam.create = QUDA_ZERO_FIELD_CREATE;
-  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
+  fParam.setPrecision(fParam.Precision(), true);
   GaugeField cudaMom(fParam);
 
   // create the device force field
   fParam.link_type = QUDA_GENERAL_LINKS;
   fParam.create = QUDA_ZERO_FIELD_CREATE;
-  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   fParam.reconstruct = QUDA_RECONSTRUCT_NO;
+  fParam.setPrecision(fParam.Precision(), true);
   GaugeField cudaForce(fParam);
 
   ColorSpinorParam qParam;
@@ -4662,9 +4662,9 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom,
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.create = QUDA_COPY_FIELD_CREATE;
   gParam.field = &cpuMom;
-  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
+  gParam.setPrecision(gParam.Precision(), true);
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.pad = 0;
   GaugeField cudaMom = param->use_resident_mom ? momResident.create_alias() : GaugeField(gParam);
@@ -4672,6 +4672,7 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom,
   if (param->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field allocated");
   gParam.link_type = QUDA_SU3_LINKS;
   gParam.reconstruct = param->reconstruct;
+  gParam.setPrecision(gParam.Precision(), true);
   gParam.field = &cpuGauge;
   GaugeField u_in = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
   gParam.create = QUDA_NULL_FIELD_CREATE;
@@ -4709,8 +4710,8 @@ void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.create = QUDA_COPY_FIELD_CREATE;
   gParam.field = &cpuGauge;
-  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParam.reconstruct = param->reconstruct;
+  gParam.setPrecision(gParam.Precision(), true);
   GaugeField cudaGauge = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
 
   *num_failures_h = 0;
@@ -4747,8 +4748,8 @@ void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.create = QUDA_COPY_FIELD_CREATE;
   gParam.field = &cpuGauge;
-  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParam.reconstruct = param->reconstruct;
+  gParam.setPrecision(gParam.Precision(), true);
   GaugeField cudaGauge = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
 
   *num_failures_h = 0;

From 63e474d18a5985bcc393942149688b5192e85942 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 11 Sep 2023 14:30:37 -0700
Subject: [PATCH 38/60] Fix some overflow issues with large volumes

---
 include/gauge_field_order.h | 38 ++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 17eb70b42c..c50f216e5e 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -690,7 +690,7 @@ namespace quda {
     struct GhostAccessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, native_ghost, storeFloat> {
       using wrapper = fieldorder_wrapper<Float, storeFloat>;
       complex<storeFloat> *ghost[8] = {};
-      const int volumeCB;
+      const unsigned int volumeCB;
       unsigned int ghostVolumeCB[8] = {};
       Float scale = static_cast<Float>(1.0);
       Float scale_inv = static_cast<Float>(1.0);
@@ -751,7 +751,7 @@ namespace quda {
       using wrapper = fieldorder_wrapper<Float, storeFloat>;
 
       /** An internal reference to the actual field we are accessing */
-      const int volumeCB;
+      const unsigned int volumeCB;
       const int nDim;
       const int_fastdiv geometry;
       const QudaFieldLocation location;
@@ -870,10 +870,10 @@ namespace quda {
 	__device__ __host__ inline int Ncolor() const { return nColor; }
 
 	/** Returns the field volume */
-	__device__ __host__ inline int Volume() const { return 2*volumeCB; }
+	__device__ __host__ inline auto Volume() const { return 2*volumeCB; }
 
 	/** Returns the field volume */
-	__device__ __host__ inline int VolumeCB() const { return volumeCB; }
+	__device__ __host__ inline auto VolumeCB() const { return volumeCB; }
 
 	/** Returns the field geometric dimension */
 	__device__ __host__ inline int Ndim() const { return nDim; }
@@ -1526,7 +1526,7 @@ namespace quda {
         int coords[QUDA_MAX_DIM];
         int_fastdiv X[QUDA_MAX_DIM];
         int R[QUDA_MAX_DIM];
-        const int volumeCB;
+        const unsigned int volumeCB;
         int faceVolumeCB[4];
         const int stride;
         const int geometry;
@@ -1773,7 +1773,7 @@ namespace quda {
         using complex = complex<real>;
         Float *ghost[QUDA_MAX_DIM] = {};
         int faceVolumeCB[QUDA_MAX_DIM] = {};
-        const int volumeCB;
+        const unsigned int volumeCB;
         const int stride;
         const int geometry;
         const int hasPhase;
@@ -1846,7 +1846,7 @@ namespace quda {
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       Float *gauge[QUDA_MAX_DIM];
-      const int volumeCB;
+      const unsigned int volumeCB;
     QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
       : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
     {
@@ -1892,7 +1892,7 @@ namespace quda {
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       Float *gauge[QUDA_MAX_DIM];
-      const int volumeCB;
+      const unsigned int volumeCB;
     QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
       : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
     {
@@ -1942,7 +1942,7 @@ namespace quda {
     using real = typename mapper<Float>::type;
     using complex = complex<real>;
     Float *gauge;
-    const int volumeCB;
+    const unsigned int volumeCB;
     const int geometry;
     MILCOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
       LegacyOrder<Float, length>(u, ghost_),
@@ -1953,10 +1953,10 @@ namespace quda {
       ;
     }
 
-  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const
-  {
-    auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
-    block_load<complex, length / 2>(v, reinterpret_cast<complex *>(in));
+    __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const
+    {
+      auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
+      block_load<complex, length / 2>(v, reinterpret_cast<complex *>(in));
     }
 
     __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const
@@ -2003,7 +2003,7 @@ namespace quda {
     using real = typename mapper<Float>::type;
     using complex = complex<real>;
     Float *gauge;
-    const int volumeCB;
+    const unsigned int volumeCB;
     const int geometry;
     const size_t offset;
     const size_t size;
@@ -2062,7 +2062,7 @@ namespace quda {
     using real = typename mapper<Float>::type;
     using complex = complex<real>;
     Float *gauge;
-    const int volumeCB;
+    const unsigned int volumeCB;
     const real anisotropy;
     const real anisotropy_inv;
     static constexpr int Nc = 3;
@@ -2131,8 +2131,8 @@ namespace quda {
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       Float *gauge;
-      const int volumeCB;
-      int exVolumeCB; // extended checkerboard volume
+      const unsigned int volumeCB;
+      unsigned int exVolumeCB; // extended checkerboard volume
       static constexpr int Nc = 3;
       BQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
         LegacyOrder<Float, length>(u, ghost_), gauge(gauge_ ? gauge_ : u.data<Float *>()), volumeCB(u.VolumeCB())
@@ -2193,7 +2193,7 @@ namespace quda {
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       Float *gauge;
-      const int volumeCB;
+      const unsigned int volumeCB;
       static constexpr int Nc = 3;
       const real scale;
       const real scale_inv;
@@ -2257,7 +2257,7 @@ namespace quda {
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       Float *gauge;
-      const int volumeCB;
+      const unsigned int volumeCB;
       int exVolumeCB;
       static constexpr int Nc = 3;
       const real scale;

From 56a719d7d03d54df18bacc4230358988893733b6 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 12 Sep 2023 10:27:22 -0700
Subject: [PATCH 39/60] Fix some overflow issues with tests

---
 tests/host_reference/gauge_force_reference.cpp | 2 +-
 tests/host_reference/hisq_force_reference.cpp  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp
index eb18f10568..a575532731 100644
--- a/tests/host_reference/gauge_force_reference.cpp
+++ b/tests/host_reference/gauge_force_reference.cpp
@@ -446,7 +446,7 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *s
                                QudaPrecision prec, int **path_dir, int *length, void *loop_coeff, int num_paths,
                                const lattice_t &lat, bool compute_force)
 {
-  size_t size = V * 2 * lat.n_color * lat.n_color * prec;
+  size_t size = size_t(V) * 2 * lat.n_color * lat.n_color * prec;
   void *staple = safe_malloc(size);
   memset(staple, 0, size);
 
diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp
index d0cfc197a2..9cd4ee4d9c 100644
--- a/tests/host_reference/hisq_force_reference.cpp
+++ b/tests/host_reference/hisq_force_reference.cpp
@@ -1205,9 +1205,9 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda
   QudaPrecision precision = oprod.Precision();
 
 #ifdef MULTI_GPU
-  int len = Vh_ex * 2;
+  uint64_t len = uint64_t(Vh_ex) * 2; // widen before multiplying so the product cannot overflow int
 #else
-  int len = 1;
+  uint64_t len = 1;
   for (int dir = 0; dir < 4; ++dir) len *= X_[dir];
 #endif
   // allocate memory for temporary fields

From f14d7ffbee054bb182463273f56f0b8bbf2be62e Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 12 Sep 2023 11:21:02 -0700
Subject: [PATCH 40/60] Minor cleanup of heatbath_test and fix an issue found
 in testing with saveGaugeQuda

---
 lib/interface_quda.cpp  |   2 +-
 tests/heatbath_test.cpp | 117 ++++++++++++++++++++--------------------
 2 files changed, 59 insertions(+), 60 deletions(-)

diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 48f342a31b..1833a0f766 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -4055,7 +4055,7 @@ void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
 {
   auto* cudaGauge = reinterpret_cast<GaugeField*>(inGauge);
 
-  GaugeFieldParam gParam(*param, gauge, QUDA_GENERAL_LINKS);
+  GaugeFieldParam gParam(*param, gauge);
   gParam.geometry = cudaGauge->Geometry();
 
   GaugeField cpuGauge(gParam);
diff --git a/tests/heatbath_test.cpp b/tests/heatbath_test.cpp
index 4ad648958b..557914b772 100644
--- a/tests/heatbath_test.cpp
+++ b/tests/heatbath_test.cpp
@@ -53,33 +53,9 @@ void display_test_info()
              dimPartitioned(3));
 }
 
-int main(int argc, char **argv)
+void heatbath_test(int argc, char **argv)
 {
-  // command line options
-  auto app = make_app();
-  add_heatbath_option_group(app);
-  try {
-    app->parse(argc, argv);
-  } catch (const CLI::ParseError &e) {
-    return app->exit(e);
-  }
-
-  if (prec_sloppy == QUDA_INVALID_PRECISION) prec_sloppy = prec;
-  if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID) link_recon_sloppy = link_recon;
-
-  // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp)
-  initComms(argc, argv, gridsize_from_cmdline);
-
-  // call srand() with a rank-dependent seed
-  initRand();
-
-  display_test_info();
-
-  // initialize the QUDA library
-  initQuda(device_ordinal);
-
   // *** QUDA parameters begin here.
-
   QudaGaugeParam gauge_param = newQudaGaugeParam();
   setWilsonGaugeParam(gauge_param);
   gauge_param.t_boundary = QUDA_PERIODIC_T;
@@ -91,12 +67,17 @@ int main(int argc, char **argv)
   // Allocate space on the host (always best to allocate and free in the same scope)
   for (int dir = 0; dir < 4; dir++) { load_gauge[dir] = safe_malloc(V * gauge_site_size * gauge_param.cpu_prec); }
   constructHostGaugeField(load_gauge, gauge_param, argc, argv);
+
+  if (prec_sloppy == QUDA_INVALID_PRECISION) prec_sloppy = prec;
+  if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID) link_recon_sloppy = link_recon;
+
   // Load the gauge field to the device
   loadGaugeQuda((void *)load_gauge, &gauge_param);
 
-  int *num_failures_h = (int *)mapped_malloc(sizeof(int));
-  int *num_failures_d = (int *)get_mapped_device_pointer(num_failures_h);
-  *num_failures_h = 0;
+  quda::quda_ptr num_failures(QUDA_MEMORY_MAPPED, sizeof(int), false);
+  int &num_failures_h = *static_cast<int*>(num_failures.data_host());
+  int &num_failures_d = *static_cast<int*>(num_failures.data_device());
+  num_failures_h = 0;
 
   // start the timer
   double time0 = -((double)clock());
@@ -110,7 +91,7 @@ int main(int argc, char **argv)
     gParam.link_type = gauge_param.type;
     gParam.reconstruct = gauge_param.reconstruct;
     gParam.setPrecision(gParam.Precision(), true);
-    GaugeField *gauge = new GaugeField(gParam);
+    GaugeField gauge(gParam);
 
     int pad = 0;
     lat_dim_t y;
@@ -126,9 +107,9 @@ int main(int argc, char **argv)
     gParamEx.t_boundary = gParam.t_boundary;
     gParamEx.nFace = 1;
     for (int dir = 0; dir < 4; ++dir) gParamEx.r[dir] = R[dir];
-    GaugeField *gaugeEx = new GaugeField(gParamEx);
+    GaugeField gaugeEx(gParamEx);
     // CURAND random generator initialization
-    RNG *randstates = new RNG(*gauge, 1234);
+    RNG randstates(gauge, 1234);
 
     int nsteps = heatbath_num_steps;
     int nwarm = heatbath_warmup_steps;
@@ -145,21 +126,21 @@ int main(int argc, char **argv)
 
     if (latfile.size() > 0) { // We loaded in a gauge field
       // copy internal extended field to gaugeEx
-      copyExtendedResidentGaugeQuda((void *)gaugeEx);
+      copyExtendedResidentGaugeQuda(&gaugeEx);
     } else {
       if (coldstart)
-        InitGaugeField(*gaugeEx);
+        InitGaugeField(gaugeEx);
       else
-        InitGaugeField(*gaugeEx, *randstates);
+        InitGaugeField(gaugeEx, randstates);
 
       // copy into regular field
-      copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+      copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
       // load the gauge field from gauge
-      gauge_param.gauge_order = gauge->Order();
+      gauge_param.gauge_order = gauge.Order();
       gauge_param.location = QUDA_CUDA_FIELD_LOCATION;
 
-      loadGaugeQuda(gauge->data(), &gauge_param);
+      loadGaugeQuda(gauge.data(), &gauge_param);
     }
 
     QudaGaugeObservableParam param = newQudaGaugeObservableParam();
@@ -175,37 +156,37 @@ int main(int argc, char **argv)
     // Do a warmup if requested
     if (nwarm > 0) {
       for (int step = 1; step <= nwarm; ++step) {
-        Monte(*gaugeEx, *randstates, beta_value, nhbsteps, novrsteps);
+        Monte(gaugeEx, randstates, beta_value, nhbsteps, novrsteps);
 
-        quda::unitarizeLinks(*gaugeEx, num_failures_d);
-        if (*num_failures_h > 0) errorQuda("Error in the unitarization\n");
+        quda::unitarizeLinks(gaugeEx, &num_failures_d);
+        if (num_failures_h > 0) errorQuda("Error in the unitarization\n");
       }
     }
 
     // copy into regular field
-    copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+    copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
     // load the gauge field from gauge
-    gauge_param.gauge_order = gauge->Order();
+    gauge_param.gauge_order = gauge.Order();
     gauge_param.location = QUDA_CUDA_FIELD_LOCATION;
 
-    loadGaugeQuda(gauge->data(), &gauge_param);
+    loadGaugeQuda(gauge.data(), &gauge_param);
     gaugeObservablesQuda(&param);
     printfQuda("step=0 plaquette = %e topological charge = %e\n", param.plaquette[0], param.qcharge);
 
     freeGaugeQuda();
 
     for (int step = 1; step <= nsteps; ++step) {
-      Monte(*gaugeEx, *randstates, beta_value, nhbsteps, novrsteps);
+      Monte(gaugeEx, randstates, beta_value, nhbsteps, novrsteps);
 
       // Reunitarize gauge links...
-      quda::unitarizeLinks(*gaugeEx, num_failures_d);
-      if (*num_failures_h > 0) errorQuda("Error in the unitarization\n");
+      quda::unitarizeLinks(gaugeEx, &num_failures_d);
+      if (num_failures_h > 0) errorQuda("Error in the unitarization\n");
 
       // copy into regular field
-      copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+      copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
-      loadGaugeQuda(gauge->data(), &gauge_param);
+      loadGaugeQuda(gauge.data(), &gauge_param);
       gaugeObservablesQuda(&param);
       printfQuda("step=%d plaquette = %e topological charge = %e\n", step, param.plaquette[0], param.qcharge);
 
@@ -219,14 +200,15 @@ int main(int argc, char **argv)
 
       QudaGaugeParam gauge_param = newQudaGaugeParam();
       setWilsonGaugeParam(gauge_param);
+      gauge_param.t_boundary = gauge.TBoundary();
 
       void *cpu_gauge[4];
       for (int dir = 0; dir < 4; dir++) { cpu_gauge[dir] = safe_malloc(V * gauge_site_size * gauge_param.cpu_prec); }
 
       // copy into regular field
-      copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+      copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
-      saveGaugeFieldQuda((void *)cpu_gauge, (void *)gauge, &gauge_param);
+      saveGaugeFieldQuda((void *)cpu_gauge, &gauge, &gauge_param);
 
       write_gauge_field(gauge_outfile.c_str(), cpu_gauge, gauge_param.cpu_prec, gauge_param.X, 0, (char **)0);
 
@@ -235,27 +217,44 @@ int main(int argc, char **argv)
       printfQuda("No output file specified.\n");
     }
 
-    delete gauge;
-    delete gaugeEx;
     // Release all temporary memory used for data exchange between GPUs in multi-GPU mode
     PGaugeExchangeFree();
-
-    delete randstates;
   }
 
   // stop the timer
   time0 += clock();
   time0 /= CLOCKS_PER_SEC;
 
-  // printfQuda("\nDone: %i iter / %g secs = %g Gflops, total time = %g secs\n",
-  // inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs, time0);
   printfQuda("\nDone, total time = %g secs\n", time0);
 
-  host_free(num_failures_h);
-
   freeGaugeQuda();
-
   for (int dir = 0; dir < 4; dir++) host_free(load_gauge[dir]);
+}
+
+int main(int argc, char **argv)
+{
+  // command line options
+  auto app = make_app();
+  add_heatbath_option_group(app);
+  try {
+    app->parse(argc, argv);
+  } catch (const CLI::ParseError &e) {
+    return app->exit(e);
+  }
+
+  // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp)
+  initComms(argc, argv, gridsize_from_cmdline);
+
+  // call srand() with a rank-dependent seed
+  initRand();
+
+  display_test_info();
+
+  // initialize the QUDA library
+  initQuda(device_ordinal);
+
+  // run the test
+  heatbath_test(argc, argv);
 
   // finalize the QUDA library
   endQuda();

From b19fe5443e6b9b8a2787cc684d3e65500cc19268 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 12 Sep 2023 11:24:57 -0700
Subject: [PATCH 41/60] Fix typo

---
 include/gauge_field.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/gauge_field.h b/include/gauge_field.h
index 297065842f..54d446839d 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -53,7 +53,7 @@ namespace quda {
 
     QudaFieldCreate create = QUDA_REFERENCE_FIELD_CREATE; // used to determine the type of field created
 
-    QudaFieldGeometry geometry = QUDA_VECTOR_GEOMETRY; // whether the field is a scale, vector or tensor
+    QudaFieldGeometry geometry = QUDA_VECTOR_GEOMETRY; // whether the field is a scalar, vector or tensor
 
     // whether we need to compute the fat link maxima
     // FIXME temporary flag until we have a kernel that can do this, then we just do this in copy()

From bf29f03d02a404ac384ca34eee4aa3ca570e4682 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 12 Sep 2023 16:43:52 -0700
Subject: [PATCH 42/60] Fix typo

---
 lib/llfat_quda.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/llfat_quda.cu b/lib/llfat_quda.cu
index 8ac2e25d36..cd32a54e6c 100644
--- a/lib/llfat_quda.cu
+++ b/lib/llfat_quda.cu
@@ -186,7 +186,7 @@ namespace quda {
 
     if ( ((fat.X()[0] % 2 != 0) || (fat.X()[1] % 2 != 0) || (fat.X()[2] % 2 != 0) || (fat.X()[3] % 2 != 0))
 	&& (u.Reconstruct()  != QUDA_RECONSTRUCT_NO)){
-      errorQuda("Reconstruct %d and odd dimensionsize is not supported by link fattening code (yet)\n",
+      errorQuda("Reconstruct %d and odd dimension size is not supported by link fattening code (yet)",
 		u.Reconstruct());
     }
 

From dc5ec219a0b1f1502bc60ea4d0267b5177402485 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 21 Sep 2023 12:52:08 -0700
Subject: [PATCH 43/60] Updates for quda_ptr: add custom exchange function
 since std::exchange doesn't work; add ostream overload; add reference() query
 function; move assignment will now fail if destination is already allocated

---
 include/quda_ptr.h | 36 +++++++++++++++++++++++++++++++-----
 lib/quda_ptr.cpp   | 30 ++++++++++++++++++++++++++----
 2 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/include/quda_ptr.h b/include/quda_ptr.h
index 3e829f310f..185d852d57 100644
--- a/include/quda_ptr.h
+++ b/include/quda_ptr.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <ostream>
 #include "malloc_quda.h"
 
 namespace quda {
@@ -18,16 +19,25 @@ namespace quda {
      QUDA_MEMORY_MANAGED        both
    */
   class quda_ptr {
-    QudaMemoryType type = QUDA_MEMORY_INVALID;
-    size_t size = 0;
-    bool pool = false;
-    void *device = nullptr;
-    void *host = nullptr;
+    friend std::ostream& operator<<(std::ostream& output, const quda_ptr& ptr);
+    QudaMemoryType type = QUDA_MEMORY_INVALID; /** Memory type of the allocation */
+    size_t size = 0;                           /** Size of the allocation */
+    bool pool = false;                         /** Is the allocation pooled */
+    void *device = nullptr;                    /** Device-view of the allocation */
+    void *host = nullptr;                      /** Host-view of the allocation */
+    bool reference = false;                    /** Is this a reference to another allocation */
+
+    /**
+       @brief Internal deallocation routine
+     */
+    void destroy();
 
   public:
     quda_ptr() = default;
     quda_ptr(quda_ptr &&) = default;
     quda_ptr &operator=(quda_ptr &&);
+    quda_ptr(const quda_ptr &) = delete;
+    quda_ptr &operator=(const quda_ptr &) = delete;
 
     /**
        @brief Constructor for quda_ptr
@@ -49,6 +59,15 @@ namespace quda {
     */
     virtual ~quda_ptr();
 
+    /**
+       @brief Specialized exchange function to use in place of
+       std::exchange when exchanging quda_ptr objects: moves obj to
+       *this, and moves new_value to obj
+       @param[in,out] obj Object whose contents are moved into *this; it then takes new_value
+       @param[in] new_value New value for obj to take
+     */
+    void exchange(quda_ptr &obj, quda_ptr &&new_value);
+
     /**
        @return Returns true if allocation is visible to the device
     */
@@ -73,6 +92,13 @@ namespace quda {
        Return the host view of the pointer
      */
     void *data_host() const;
+
+    /**
+       Return if the instance is a reference rather than an allocation
+     */
+    bool is_reference() const;
   };
 
+  std::ostream& operator<<(std::ostream& output, const quda_ptr& ptr);
+
 }
diff --git a/lib/quda_ptr.cpp b/lib/quda_ptr.cpp
index 7db16b641d..bbb8d88457 100644
--- a/lib/quda_ptr.cpp
+++ b/lib/quda_ptr.cpp
@@ -43,7 +43,8 @@ namespace quda {
   }
 
   quda_ptr::quda_ptr(void *ptr, QudaMemoryType type) :
-    type(type)
+    type(type),
+    reference(true)
   {
     getProfile().TPSTART(QUDA_PROFILE_INIT);
     switch (type) {
@@ -69,6 +70,7 @@ namespace quda {
   quda_ptr& quda_ptr::operator=(quda_ptr &&other)
   {
     if (&other != this) {
+      if (size > 0) errorQuda("Cannot move to already initialized quda_ptr");
       type = std::exchange(other.type, QUDA_MEMORY_INVALID);
       size = std::exchange(other.size, 0);
       pool = std::exchange(other.pool, false);
@@ -78,10 +80,8 @@ namespace quda {
     return *this;
   }
 
-  quda_ptr::~quda_ptr()
+  void quda_ptr::destroy()
   {
-    getProfile().TPSTART(QUDA_PROFILE_FREE);
-
     if (size > 0) {
       switch (type) {
       case QUDA_MEMORY_DEVICE:        pool ? pool_device_free(device) : device_free(device); break;
@@ -93,12 +93,25 @@ namespace quda {
       }
     }
 
+    size = 0;
     device = nullptr;
     host = nullptr;
+  }
 
+  quda_ptr::~quda_ptr()
+  {
+    getProfile().TPSTART(QUDA_PROFILE_FREE);
+    destroy();
     getProfile().TPSTOP(QUDA_PROFILE_FREE);
   }
 
+  void quda_ptr::exchange(quda_ptr &obj, quda_ptr &&new_value)
+  {
+    destroy();
+    *this = std::move(obj);
+    obj = std::move(new_value);
+  }
+
   bool quda_ptr::is_device() const
   {
     switch (type) {
@@ -155,4 +168,13 @@ namespace quda {
     return host;
   }
 
+  bool quda_ptr::is_reference() const { return reference; }
+
+  std::ostream& operator<<(std::ostream& output, const quda_ptr& ptr)
+  {
+    output << "{type = " << ptr.type << ", size = " << ptr.size << ", pool = " << ptr.pool << ", device = " << ptr.device
+           << ", host = " << ptr.host << ", reference = " << ptr.reference << "}";
+    return output;
+  }
+
 }

From 8d6871e58bb04b1e0909b0122b663dfd5a97b2b5 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 21 Sep 2023 13:07:03 -0700
Subject: [PATCH 44/60] Fix issues with move assignment with GaugeField and
 ColorSpinorField objects (quda_ptr should use internal exchange, not
 std::exchange); add ostream overloads for LatticeField and GaugeField. Fix
 verbosity for llfat_test

---
 include/gauge_field.h      | 26 ++++++++++++++--
 include/lattice_field.h    | 12 +++++++-
 lib/color_spinor_field.cpp |  2 +-
 lib/gauge_field.cpp        | 57 ++++++++++++++++++++++++++++++++---
 lib/lattice_field.cpp      | 61 ++++++++++++++++++++++++++++++++++----
 tests/llfat_test.cpp       |  1 +
 6 files changed, 144 insertions(+), 15 deletions(-)

diff --git a/include/gauge_field.h b/include/gauge_field.h
index 54d446839d..1c4bdfc852 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -104,7 +104,7 @@ namespace quda {
       staggeredPhaseType(param.staggered_phase_type),
       staggeredPhaseApplied(param.staggered_phase_applied),
       i_mu(param.i_mu),
-      site_offset(param.gauge_offset),
+      site_offset(link_type == QUDA_ASQTAD_MOM_LINKS ? param.mom_offset : param.gauge_offset),
       site_size(param.site_size)
     {
       switch (link_type) {
@@ -144,9 +144,12 @@ namespace quda {
   };
 
   std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param);
+  std::ostream& operator<<(std::ostream& output, const GaugeField& param);
 
   class GaugeField : public LatticeField {
 
+    friend std::ostream& operator<<(std::ostream& output, const GaugeField& param);
+
   private:
     /**
        @brief Create the field as specified by the param
@@ -290,9 +293,9 @@ namespace quda {
 
     /**
        @brief Returns if the object is empty (not initialized)
-       @return true if the object has been allocated, otherwise false
+       @return true if the object has not been allocated, otherwise false
     */
-    bool empty() const { return init; }
+    bool empty() const { return !init; }
 
     /**
        @brief Create the communication handlers and buffers
@@ -605,6 +608,23 @@ namespace quda {
     */
     void copy_from_buffer(void *buffer);
 
+    /**
+       @brief Check if two instances are compatible
+       @param[in] a Input field
+       @param[in] b Input field
+       @return Return true if two fields are compatible
+     */
+    static bool are_compatible(const GaugeField &a, const GaugeField &b);
+
+    /**
+       @brief Check if two instances are weakly compatible (precision
+       and order can differ)
+       @param[in] a Input field
+       @param[in] b Input field
+       @return Return true if two fields are weakly compatible
+     */
+    static bool are_compatible_weak(const GaugeField &a, const GaugeField &b);
+
     friend struct GaugeFieldParam;
   };
 
diff --git a/include/lattice_field.h b/include/lattice_field.h
index e7c43b7d69..6c13df2fda 100644
--- a/include/lattice_field.h
+++ b/include/lattice_field.h
@@ -68,10 +68,13 @@ namespace quda {
     /** Array storing the length of dimension */
     lat_dim_t x = {};
 
+    /** Padding to be added to the checker-boarded volume (only for native field ordering) */
     int pad = 0;
 
+    /** Whether the field is full or single parity */
     QudaSiteSubset siteSubset = QUDA_INVALID_SITE_SUBSET;
 
+    /** The type of memory allocation to use for the field */
     QudaMemoryType mem_type = QUDA_MEMORY_INVALID;
 
     /** The type of ghost exchange to be done with this field */
@@ -141,15 +144,18 @@ namespace quda {
     }
 
     /**
-       @brief Contructor for creating LatticeFieldParam from a LatticeField
+       @brief Constructor for creating LatticeFieldParam from a LatticeField
     */
     LatticeFieldParam(const LatticeField &field);
   };
 
   std::ostream& operator<<(std::ostream& output, const LatticeFieldParam& param);
+  std::ostream& operator<<(std::ostream& output, const LatticeField& field);
 
   class LatticeField : public Object {
 
+    friend std::ostream& operator<<(std::ostream& output, const LatticeField& param);
+
     /**
        @brief Create the field as specified by the param
        @param[in] Parameter struct
@@ -175,9 +181,13 @@ namespace quda {
     /** Checkerboarded local volume */
     size_t localVolumeCB = 0;
 
+    /** Stride used for native field ordering (stride = volumeCB + pad) */
     size_t stride = 0;
+
+    /** Padding to be added to the checker-boarded volume (only for native field ordering) */
     int pad = 0;
 
+    /** Total size of the allocation */
     size_t total_bytes = 0;
 
     /** Number of field dimensions */
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index a76a29b0eb..b1e7aa6060 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -230,7 +230,7 @@ namespace quda
     pc_type = std::exchange(src.pc_type, QUDA_PC_INVALID);
     suggested_parity = std::exchange(src.suggested_parity, QUDA_INVALID_PARITY);
     length = std::exchange(src.length, 0);
-    v = std::exchange(src.v, {});
+    v.exchange(src.v, {}); // cannot use std::exchange for quda_ptr
     norm_offset = std::exchange(src.norm_offset, 0);
     ghost = std::exchange(src.ghost, {});
     ghostFace = std::exchange(src.ghostFace, {});
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index d1700709fc..f9975bb757 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -33,6 +33,7 @@ namespace quda {
 
   GaugeField &GaugeField::operator=(const GaugeField &src)
   {
+    if (src.empty()) errorQuda("Copying from empty field");
     if (&src != this) {
       if (!init) { // keep current attributes unless unset
         LatticeField::operator=(src);
@@ -51,7 +52,7 @@ namespace quda {
   {
     if (&src != this) {
       // if field not already initialized then move the field
-      if (!init) {
+      if (!init || are_compatible(*this, src) || src.empty()) {
         LatticeField::operator=(std::move(src));
         move(std::move(src));
       } else {
@@ -237,8 +238,10 @@ namespace quda {
 
   void GaugeField::move(GaugeField &&src)
   {
-    gauge = std::exchange(src.gauge, {});
-    gauge_array = std::exchange(src.gauge_array, {});
+    init = std::exchange(src.init, {});
+    if (src.gauge.is_reference()) errorQuda("Cannot move a reference allocation");
+    gauge.exchange(src.gauge, {});
+    for (auto i = 0; i < gauge_array.size(); i++) gauge_array[i].exchange(src.gauge_array[i], {});
     bytes = std::exchange(src.bytes, 0);
     phase_offset = std::exchange(src.phase_offset, 0);
     phase_bytes = std::exchange(src.phase_bytes, 0);
@@ -257,7 +260,7 @@ namespace quda {
     anisotropy = std::exchange(src.anisotropy, 0.0);
     tadpole = std::exchange(src.tadpole, 0.0);
     fat_link_max = std::exchange(src.fat_link_max, 0.0);
-    ghost = std::exchange(src.ghost, {});
+    for (auto i = 0; i < ghost.size(); i++) ghost[i].exchange(src.ghost[i], {});
     ghostFace = std::exchange(src.ghostFace, {});
     staggeredPhaseType = std::exchange(src.staggeredPhaseType, QUDA_STAGGERED_PHASE_INVALID);
     staggeredPhaseApplied = std::exchange(src.staggeredPhaseApplied, false);
@@ -871,6 +874,17 @@ namespace quda {
 
   }
 
+  bool GaugeField::are_compatible_weak(const GaugeField &a, const GaugeField &b)
+  {
+    return (a.LinkType() == b.LinkType() && a.Ncolor() == b.Ncolor() && a.Nface() == b.Nface() && a.GaugeFixed() == b.GaugeFixed()
+            && a.TBoundary() == b.TBoundary() && a.Anisotropy() == b.Anisotropy() && a.Tadpole() == b.Tadpole());
+  }
+
+  bool GaugeField::are_compatible(const GaugeField &a, const GaugeField &b)
+  {
+    return (a.Precision() == b.Precision() && a.Order() == b.Order() && are_compatible_weak(a, b));
+  }
+
   void GaugeField::checkField(const LatticeField &l) const {
     LatticeField::checkField(l);
     try {
@@ -1132,6 +1146,40 @@ namespace quda {
     return output;  // for multiple << operators.
   }
 
+  std::ostream& operator<<(std::ostream& output, const GaugeField& field)
+  {
+    output << static_cast<const LatticeField &>(field); // base-class (LatticeField) members first
+    output << "init = " << field.init << std::endl;
+    output << "gauge = " << field.gauge << std::endl;
+    output << "gauge_array = " << field.gauge_array << std::endl;
+    output << "bytes = " << field.bytes << std::endl;
+    output << "phase_offset = " << field.phase_offset << std::endl;
+    output << "phase_bytes = " << field.phase_bytes << std::endl;
+    output << "length = " << field.length << std::endl;
+    output << "real_length = " << field.real_length << std::endl;
+    output << "nColor = " << field.nColor << std::endl;
+    output << "nFace = " << field.nFace << std::endl;
+    output << "geometry = " << field.geometry << std::endl;
+    output << "site_dim = " << field.site_dim << std::endl; // was printing geometry a second time
+    output << "reconstruct = " << field.reconstruct << std::endl;
+    output << "nInternal = " << field.nInternal << std::endl;
+    output << "order = " << field.order << std::endl;
+    output << "fixed = " << field.fixed << std::endl;
+    output << "link_type = " << field.link_type << std::endl;
+    output << "t_boundary = " << field.t_boundary << std::endl;
+    output << "anisotropy = " << field.anisotropy << std::endl;
+    output << "tadpole = " << field.tadpole << std::endl;
+    output << "fat_link_max = " << field.fat_link_max << std::endl;
+    output << "ghost = " << field.ghost << std::endl;
+    output << "ghostFace = " << field.ghostFace << std::endl;
+    output << "staggeredPhaseType = " << field.staggeredPhaseType << std::endl;
+    output << "staggeredPhaseApplied = " << field.staggeredPhaseApplied << std::endl;
+    output << "i_mu = " << field.i_mu << std::endl;
+    output << "site_offset = " << field.site_offset << std::endl;
+    output << "site_size = " << field.site_size << std::endl; // label now matches the member printed
+    return output;  // for multiple << operators.
+  }
+
   void GaugeField::zero()
   {
     if (order != QUDA_QDP_GAUGE_ORDER) {
@@ -1201,6 +1249,7 @@ namespace quda {
       errorQuda("Cannot create an alias to source with lower precision than the alias");
     GaugeFieldParam param = param_.init ? param_ : GaugeFieldParam(*this);
     param.create = QUDA_REFERENCE_FIELD_CREATE;
+    param.gauge = gauge.data();
     return GaugeField(param);
   }
 
diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp
index b75b1dcff8..d195b7edb3 100644
--- a/lib/lattice_field.cpp
+++ b/lib/lattice_field.cpp
@@ -613,20 +613,69 @@ namespace quda {
   std::ostream& operator<<(std::ostream& output, const LatticeFieldParam& param)
   {
     output << "nDim = " << param.nDim << std::endl;
-    for (int i = 0; i < param.nDim; i++) { output << "x[" << i << "] = " << param.x[i] << std::endl; }
+    output << "x = " << param.x << std::endl;
     output << "pad = " << param.pad << std::endl;
     output << "precision = " << param.Precision() << std::endl;
     output << "ghost_precision = " << param.GhostPrecision() << std::endl;
     output << "scale = " << param.scale << std::endl;
-
     output << "ghostExchange = " << param.ghostExchange << std::endl;
-    for (int i=0; i<param.nDim; i++) {
-      output << "r[" << i << "] = " << param.r[i] << std::endl;
-    }
-
+    output << "r = " << param.r << std::endl;
     return output;  // for multiple << operators.
   }
 
+  std::ostream& operator<<(std::ostream& output, const LatticeField &field)
+  { // dump the LatticeField members to the stream, one "name = value" line per member
+    output << "volume = " << field.volume << std::endl;
+    output << "volumeCB = " << field.volumeCB << std::endl;
+    output << "localVolume = " << field.localVolume << std::endl;
+    output << "localVolumeCB = " << field.localVolumeCB << std::endl;
+    output << "stride = " << field.stride << std::endl;
+    output << "pad = " << field.pad << std::endl;
+    output << "total_bytes = " << field.total_bytes << std::endl;
+    output << "nDim = " << field.nDim << std::endl;
+    output << "x = " << field.x << std::endl;
+    output << "r = " << field.r << std::endl;
+    output << "local_x = " << field.local_x << std::endl;
+    output << "surface = " << field.surface << std::endl;
+    output << "surfaceCB = " << field.surfaceCB << std::endl;
+    output << "local_surface = " << field.local_surface << std::endl;
+    output << "local_surfaceCB = " << field.local_surfaceCB << std::endl;
+    output << "location = " << field.location << std::endl;
+    output << "precision = " << field.precision << std::endl;
+    output << "ghost_precision = " << field.ghost_precision << std::endl << "ghost_precision_reset = " << field.ghost_precision_reset << std::endl;
+    output << "scale = " << field.scale << std::endl;
+    output << "siteSubset = " << field.siteSubset << std::endl;
+    output << "ghostExchange = " << field.ghostExchange << std::endl;
+    output << "nDimComms = " << field.nDimComms << std::endl;
+    output << "ghost_bytes = " << field.ghost_bytes << std::endl;
+    output << "ghost_bytes_old = " << field.ghost_bytes_old << std::endl;
+    output << "ghost_face_bytes = " << field.ghost_face_bytes << std::endl;
+    output << "ghost_face_bytes_aligned = " << field.ghost_face_bytes_aligned << std::endl;
+    output << "ghost_offset = " << field.ghost_offset << std::endl;
+    output << "my_face_h = " << field.my_face_h << std::endl;
+    output << "my_face_hd = " << field.my_face_hd << std::endl;
+    output << "my_face_d = " << field.my_face_d << std::endl;
+    output << "my_face_dim_dir_h = " << field.my_face_dim_dir_h << std::endl;
+    output << "my_face_dim_dir_hd = " << field.my_face_dim_dir_hd << std::endl;
+    output << "my_face_dim_dir_d = " << field.my_face_dim_dir_d << std::endl;
+    output << "from_face_h = " << field.from_face_h << std::endl;
+    output << "from_face_hd = " << field.from_face_hd << std::endl;
+    output << "from_face_d = " << field.from_face_d << std::endl;
+    output << "from_face_dim_dir_h = " << field.from_face_dim_dir_h << std::endl;
+    output << "from_face_dim_dir_hd = " << field.from_face_dim_dir_hd << std::endl;
+    output << "from_face_dim_dir_d = " << field.from_face_dim_dir_d << std::endl;
+    output << "mh_recv = " << field.mh_recv << std::endl;
+    output << "mh_send = " << field.mh_send << std::endl;
+    output << "mh_recv_rdma = " << field.mh_recv_rdma << std::endl;
+    output << "mh_send_rdma = " << field.mh_send_rdma << std::endl;
+    output << "initComms = " << field.initComms << std::endl;
+    output << "vol_string = " << field.vol_string << std::endl;
+    output << "aux_string = " << field.aux_string << std::endl;
+    output << "mem_type = " << field.mem_type << std::endl;
+    for (auto i = 0u; i < field.backup_h.size(); i++) output << "backup_h[" << i << "] = " << field.backup_h[i] << std::endl;
+    return output; // returned so that << calls can be chained
+  }
+
   static QudaFieldLocation reorder_location_ = QUDA_CUDA_FIELD_LOCATION;
 
   QudaFieldLocation reorder_location() { return reorder_location_; }
diff --git a/tests/llfat_test.cpp b/tests/llfat_test.cpp
index 221ab352e3..0342007de3 100644
--- a/tests/llfat_test.cpp
+++ b/tests/llfat_test.cpp
@@ -41,6 +41,7 @@ static void llfat_test()
   qudaGaugeParam.X[3] = tdim;
 
   setDims(qudaGaugeParam.X);
+  setVerbosity(verbosity);
 
   qudaGaugeParam.cpu_prec = cpu_prec;
   qudaGaugeParam.cuda_prec = qudaGaugeParam.cuda_prec_sloppy = prec;

From 97ee4ee81f72c056932791610713ead6de240909 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 21 Sep 2023 13:28:17 -0700
Subject: [PATCH 45/60] Fix some residency issues found while testing MILC, use
 GaugeField::empty to check if the field has been allocated

---
 lib/interface_quda.cpp | 55 +++++++++++++++++++++---------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 1833a0f766..840d6f5e96 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -1335,6 +1335,7 @@ void endQuda(void)
     for (int i = 0; i < QUDA_MAX_CHRONO; i++) flushChronoQuda(i);
 
     solutionResident.clear();
+    momResident = GaugeField();
 
     LatticeField::freeGhostBuffer();
     ColorSpinorField::freeGhostBuffer();
@@ -3876,15 +3877,14 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
 
   if (qudaGaugeParam->return_result_mom) cpuMom.copy(cudaMom);
 
-  if (qudaGaugeParam->make_resident_gauge) {
-    if (gaugePrecise && !qudaGaugeParam->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+  if (qudaGaugeParam->make_resident_gauge && !qudaGaugeParam->use_resident_gauge) {
+    if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
     gaugePrecise = new GaugeField();
     std::exchange(*gaugePrecise, cudaSiteLink);
   }
 
-  if (qudaGaugeParam->make_resident_mom && !qudaGaugeParam->use_resident_mom)
-    std::exchange(momResident, cudaMom);
-  else momResident = GaugeField();
+  if (qudaGaugeParam->make_resident_mom && !qudaGaugeParam->use_resident_mom) std::exchange(momResident, cudaMom);
+  else if (!qudaGaugeParam->make_resident_mom) momResident = GaugeField();
 
   if (qudaGaugeParam->make_resident_gauge) {
     if (extendedGaugeResident) delete extendedGaugeResident;
@@ -3945,10 +3945,13 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
 
   cpuOut.copy(cudaOut);
 
-  if (qudaGaugeParam->make_resident_gauge) {
-    if (gaugePrecise && !qudaGaugeParam->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+  if (qudaGaugeParam->make_resident_gauge && !qudaGaugeParam->use_resident_gauge) {
+    if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
     gaugePrecise = new GaugeField();
     std::exchange(*gaugePrecise, cudaSiteLink);
+  }
+
+  if (qudaGaugeParam->make_resident_gauge) {
     if (extendedGaugeResident) delete extendedGaugeResident;
     extendedGaugeResident = cudaGauge;
   } else {
@@ -3977,7 +3980,7 @@ void momResidentQuda(void *mom, QudaGaugeParam *param)
     gParamMom.create = QUDA_ZERO_FIELD_CREATE;
     momResident = GaugeField(gParamMom);
   } else if (param->return_result_mom && !param->make_resident_mom) {
-    if (!momResident.Volume()) errorQuda("No resident momentum to return");
+    if (momResident.empty()) errorQuda("No resident momentum to return");
   } else {
     errorQuda("Unexpected combination make_resident_mom = %d return_result_mom = %d", param->make_resident_mom,
               param->return_result_mom);
@@ -4082,7 +4085,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   GaugeField cpuMom(gParam);
 
   // create the device momentum field
-  if (gauge_param->use_resident_mom && !momResident.Volume()) errorQuda("Cannot use resident momentum field since none appears resident");
+  if (gauge_param->use_resident_mom && momResident.empty()) errorQuda("Cannot use resident momentum field since none appears resident");
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
   gParam.create = QUDA_COPY_FIELD_CREATE;
@@ -4180,7 +4183,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   if (gauge_param->return_result_mom) cpuMom.copy(cudaMom);
 
   if (gauge_param->make_resident_mom && !gauge_param->use_resident_mom) std::exchange(momResident, cudaMom);
-  else momResident = GaugeField();
+  else if (!gauge_param->make_resident_mom) momResident = GaugeField();
 
   for (int i=0; i<nvector; i++) delete X[i];
 }
@@ -4476,12 +4479,9 @@ void computeHISQForceQuda(void* const milc_momentum,
 
   // Close the paths, make anti-hermitian, and store in compressed format
   if (gParam->return_result_mom) cpuMom.copy(mom);
-  if (!gParam->make_resident_mom) momResident = GaugeField();
 
-  if (gParam->make_resident_mom && !gParam->use_resident_mom)
-    std::exchange(momResident, mom);
-  else
-    momResident = GaugeField();
+  if (gParam->make_resident_mom && !gParam->use_resident_mom) std::exchange(momResident, mom);
+  else if (!gParam->make_resident_mom) momResident = GaugeField();
 }
 
 void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double *coeff, double kappa2, double ck,
@@ -4658,7 +4658,7 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom,
   GaugeField cpuMom = !param->use_resident_mom ? GaugeField(gParamMom) : GaugeField();
 
   // create the device fields
-  if (param->use_resident_mom && !momResident.Volume()) errorQuda("No resident mom field allocated");
+  if (param->use_resident_mom && momResident.empty()) errorQuda("No resident mom field allocated");
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.create = QUDA_COPY_FIELD_CREATE;
   gParam.field = &cpuMom;
@@ -4685,13 +4685,13 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom,
   if (param->return_result_gauge) cpuGauge.copy(u_out);
 
   if (param->make_resident_gauge) {
-    if (gaugePrecise && !param->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
     gaugePrecise = new GaugeField();
     std::exchange(*gaugePrecise, u_out);
   }
 
   if (param->make_resident_mom && !param->use_resident_mom) std::exchange(momResident, cudaMom);
-  else momResident = GaugeField();
+  else if (!param->make_resident_mom) momResident = GaugeField();
 }
 
 void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
@@ -4700,7 +4700,7 @@ void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
   checkGaugeParam(param);
 
   // create the gauge field
-  GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS);
+  GaugeFieldParam gParam(*param, gauge_h, QUDA_SU3_LINKS);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
   GaugeField cpuGauge = need_cpu ? GaugeField(gParam) : GaugeField();
@@ -4725,8 +4725,8 @@ void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
 
   if (param->return_result_gauge) cpuGauge.copy(cudaGauge);
 
-  if (param->make_resident_gauge) {
-    if (gaugePrecise != nullptr && !param->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+  if (param->make_resident_gauge && !param->use_resident_gauge) {
+    if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
     gaugePrecise = new GaugeField();
     std::exchange(*gaugePrecise, cudaGauge);
   }
@@ -4762,8 +4762,8 @@ void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
 
   if (param->return_result_gauge) cpuGauge.copy(cudaGauge);
 
-  if (param->make_resident_gauge) {
-    if (gaugePrecise != nullptr && !param->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+  if (param->make_resident_gauge && !param->use_resident_gauge) {
+    if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
     gaugePrecise = new GaugeField();
     std::exchange(*gaugePrecise, cudaGauge);
   }
@@ -4787,15 +4787,14 @@ double momActionQuda(void* momentum, QudaGaugeParam* param)
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
   gParam.setPrecision(param->cuda_prec, true);
 
-  if (param->use_resident_mom && !momResident.Volume()) errorQuda("No resident mom field allocated");
+  if (param->use_resident_mom && momResident.empty()) errorQuda("No resident mom field allocated");
   GaugeField cudaMom = param->use_resident_mom ? momResident.create_alias() : GaugeField(gParam);
 
   // perform the update
   double action = computeMomAction(cudaMom);
 
-  if (param->make_resident_mom && !param->use_resident_mom)
-    std::exchange(momResident, cudaMom);
-  else momResident = GaugeField();
+  if (param->make_resident_mom && !param->use_resident_mom) std::exchange(momResident, cudaMom);
+  else if (!param->make_resident_mom) momResident = GaugeField();
 
   return action;
 }
@@ -4816,7 +4815,7 @@ void gaussGaugeQuda(unsigned long long seed, double sigma)
 void gaussMomQuda(unsigned long long seed, double sigma)
 {
   auto profile = pushProfile(profileGauss);
-  if (!momResident.Volume()) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field");
+  if (momResident.empty()) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field");
   quda::gaugeGauss(momResident, seed, sigma);
 }
 

From 69f73031f9c3bc39be494496bae75aedc3679998 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 21 Sep 2023 13:36:03 -0700
Subject: [PATCH 46/60] Fix #1406

---
 include/kernels/dslash_gamma_helper.cuh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/kernels/dslash_gamma_helper.cuh b/include/kernels/dslash_gamma_helper.cuh
index 3b5e27492a..5261ea5b32 100644
--- a/include/kernels/dslash_gamma_helper.cuh
+++ b/include/kernels/dslash_gamma_helper.cuh
@@ -78,11 +78,11 @@ namespace quda {
     {
       ColorSpinor<typename Arg::real, Arg::nColor, 4> in = arg.in(x_cb, parity);
       switch(arg.d) {
-      case 0: arg.out(x_cb, parity) = in.gamma(0);
-      case 1: arg.out(x_cb, parity) = in.gamma(1);
-      case 2: arg.out(x_cb, parity) = in.gamma(2);
-      case 3: arg.out(x_cb, parity) = in.gamma(3);
-      case 4: arg.out(x_cb, parity) = in.gamma(4);
+      case 0: arg.out(x_cb, parity) = in.gamma(0); break;
+      case 1: arg.out(x_cb, parity) = in.gamma(1); break;
+      case 2: arg.out(x_cb, parity) = in.gamma(2); break;
+      case 3: arg.out(x_cb, parity) = in.gamma(3); break;
+      case 4: arg.out(x_cb, parity) = in.gamma(4); break;
       }
     }
   };

From 8aac21a3318fa1015ff9794e6615943feaa07e04 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 21 Sep 2023 13:38:00 -0700
Subject: [PATCH 47/60] Fix 32-bit overflow issue when sizing compressed gauge
 fields (Thanks to @stevengottlieb)

---
 lib/gauge_field.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index f9975bb757..af9cc7bf90 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -129,9 +129,9 @@ namespace quda {
     if (isNative()) {
       if (reconstruct == QUDA_RECONSTRUCT_9 || reconstruct == QUDA_RECONSTRUCT_13) {
         // Need to adjust the phase alignment as well.
-        int half_phase_bytes
+        size_t half_phase_bytes
           = (length / (2 * reconstruct)) * precision; // bytes needed to store phases for a single parity
-        int half_gauge_bytes = (length / 2) * precision
+        size_t half_gauge_bytes = (length / 2) * precision
           - half_phase_bytes; // bytes needed to store the gauge field for a single parity excluding the phases
         // Adjust the alignments for the gauge and phase separately
         half_phase_bytes = ALIGNMENT_ADJUST(half_phase_bytes);

From 415a443afae5aea024a3cfc1aa7a389c3c6721b0 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 3 Oct 2023 12:35:16 -0700
Subject: [PATCH 48/60] LatticeFieldParam should set its location from
 QudaGaugeParam::location

---
 include/lattice_field.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/lattice_field.h b/include/lattice_field.h
index 6c13df2fda..a7ca3984ee 100644
--- a/include/lattice_field.h
+++ b/include/lattice_field.h
@@ -126,7 +126,7 @@ namespace quda {
        @param[in] param Contains the metadata for filling out the LatticeFieldParam
     */
     LatticeFieldParam(const QudaGaugeParam &param) :
-      location(QUDA_CPU_FIELD_LOCATION),
+      location(param.location),
       precision(param.cpu_prec),
       ghost_precision(param.cpu_prec),
       init(true),

From b211699a7398bac2e3a1169046b9662d57a7356f Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 3 Oct 2023 13:06:53 -0700
Subject: [PATCH 49/60] Fix for QUDA_CTEST_LAUNCH

---
 tests/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 533de3a8c1..d7532d821c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -265,8 +265,8 @@ if(QUDA_MPI OR QUDA_QMP)
   if(DEFINED ENV{QUDA_TEST_GRID_SIZE})
     get_test_ranks($ENV{QUDA_TEST_GRID_SIZE} QUDA_TEST_NUM_PROCS)
   endif()
-  set(QUDA_CTEST_LAUNCH "${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${QUDA_TEST_NUM_PROCS} ${MPIEXEC_PREFLAGS}"
-      CACHE STRING "CTest Launcher command for QUDA's tests")
+  set(QUDA_CTEST_LAUNCH ${MPIEXEC_EXECUTABLE};${MPIEXEC_NUMPROC_FLAG};${QUDA_TEST_NUM_PROCS};${MPIEXEC_PREFLAGS}
+    CACHE STRING "CTest Launcher command for QUDA's tests")
 endif()
 
 # BLAS tests

From dfef80f6b7d3b7707cae49442919591f0d3d943b Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 4 Oct 2023 10:59:36 -0700
Subject: [PATCH 50/60] Fix for modern Fortran compilers

---
 include/enum_quda_fortran.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h
index e77d5a0e15..8a21cf2660 100644
--- a/include/enum_quda_fortran.h
+++ b/include/enum_quda_fortran.h
@@ -9,7 +9,7 @@
 #   gfortran).
 #*/
 
-#define QUDA_INVALID_ENUM (-Z'7fffffff' - 1)
+#define QUDA_INVALID_ENUM (-int(Z'7FFFFFFF') - 1)
 
 #define QudaLinkType integer(4)
 

From c5410be65a168ff26495e05d6c960afbc2e89955 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 4 Oct 2023 11:01:40 -0700
Subject: [PATCH 51/60] When creating momentum field, always use periodic
 boundary conditions

---
 include/gauge_field.h  | 2 +-
 lib/interface_quda.cpp | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/gauge_field.h b/include/gauge_field.h
index 1c4bdfc852..c85a8bed06 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -94,7 +94,7 @@ namespace quda {
       order(param.gauge_order),
       fixed(param.gauge_fix),
       link_type(link_type_ != QUDA_INVALID_LINKS ? link_type_ : param.type),
-      t_boundary(param.t_boundary),
+      t_boundary(link_type == QUDA_ASQTAD_MOM_LINKS ? QUDA_PERIODIC_T : param.t_boundary),
       // if we have momentum field and not using TIFR field, then we always have recon-10
       reconstruct(link_type == QUDA_ASQTAD_MOM_LINKS && order != QUDA_TIFR_GAUGE_ORDER && order != QUDA_TIFR_PADDED_GAUGE_ORDER ?
                   QUDA_RECONSTRUCT_10 : QUDA_RECONSTRUCT_NO),
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 840d6f5e96..6e4a058e9e 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -4081,7 +4081,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   // create the host momentum field
   gParam.location = QUDA_CPU_FIELD_LOCATION;
   gParam.reconstruct = gauge_param->reconstruct;
-  gParam.t_boundary = QUDA_PERIODIC_T;
   GaugeField cpuMom(gParam);
 
   // create the device momentum field

From 4c308f6f72a2239e97f3423137459e049dc56b03 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Sat, 7 Oct 2023 16:11:42 -0700
Subject: [PATCH 52/60] Don't dereference nullptr when creating reference QDP
 fields

---
 lib/gauge_field.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index af9cc7bf90..6a0b4c6bb9 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -181,7 +181,7 @@ namespace quda {
         if (param.create != QUDA_REFERENCE_FIELD_CREATE) {
           gauge_array[d] = quda_ptr(mem_type, nbytes);
         } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
-          gauge_array[d] = quda_ptr(static_cast<void **>(param.gauge)[d], mem_type);
+          if (param.gauge) gauge_array[d] = quda_ptr(static_cast<void **>(param.gauge)[d], mem_type);
         } else {
           errorQuda("Unsupported creation type %d", param.create);
         }

From a16e51c5cae63fba7b8bfdbb7f217ce2a54faf7c Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 11 Oct 2023 16:54:02 -0700
Subject: [PATCH 53/60] Prevent concurrent timers from running: check if a
 timer is already running, and if so push it to the stack, and restore after
 the newly started timer is stopped.  Fixes timing issues as noted by Jiqun

---
 lib/timer.cpp | 49 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/lib/timer.cpp b/lib/timer.cpp
index 986d7b045f..2c6f9b21c3 100644
--- a/lib/timer.cpp
+++ b/lib/timer.cpp
@@ -136,6 +136,8 @@ namespace quda {
 #define POP_RANGE
 #endif
 
+  static std::stack<QudaProfileType> pt_stack;
+
   void TimeProfile::Start_(const char *func, const char *file, int line, QudaProfileType idx)
   {
     // if total timer isn't running, then start it running
@@ -144,6 +146,17 @@ namespace quda {
       switchOff = true;
     }
 
+    // if a different timer is already running, stop it and push it onto pt_stack so that Stop_ can restart it
+    for (auto i = 0; i < QUDA_PROFILE_COUNT - 1; i++) {
+      if (i == static_cast<int>(idx)) continue;
+      if (profile[i].running) {
+        if (i == QUDA_PROFILE_COMPUTE || i == QUDA_PROFILE_H2D || i == QUDA_PROFILE_D2H) qudaDeviceSynchronize();
+        profile[i].stop(func, file, line); // argument order is (func, file, line), as in the other start/stop calls
+        if (use_global) StopGlobal(func, file, line, static_cast<QudaProfileType>(i));
+        pt_stack.push(static_cast<QudaProfileType>(i));
+      }
+    }
+
     profile[idx].start(func, file, line);
     PUSH_RANGE(fname.c_str(), idx)
     if (use_global) StartGlobal(func, file, line, idx);
@@ -156,12 +169,22 @@ namespace quda {
     profile[idx].stop(func, file, line);
     POP_RANGE
 
-    // switch off total timer if we need to
-    if (switchOff && idx != QUDA_PROFILE_TOTAL) {
-      profile[QUDA_PROFILE_TOTAL].stop(func, file, line);
-      switchOff = false;
+    if (pt_stack.empty()) {
+      // switch off total timer if we need to (only if no timer being popped)
+      if (switchOff && idx != QUDA_PROFILE_TOTAL) {
+        profile[QUDA_PROFILE_TOTAL].stop(func, file, line);
+        switchOff = false;
+      }
+      if (use_global) StopGlobal(func, file, line, idx);
+    }
+
+    // restore any pre-existing timers if needed
+    if (!pt_stack.empty()) {
+      auto i = pt_stack.top();
+      pt_stack.pop();
+      profile[i].start(func, file, line);
+      if (use_global) StartGlobal(func, file, line, i);
     }
-    if (use_global) StopGlobal(func, file, line, idx);
   }
 
 #undef PUSH_RANGE
@@ -198,28 +221,28 @@ namespace quda {
     }
   }
 
-  TimeProfile dummy("dummy");
+  TimeProfile dummy("default", false);
 
-  static std::stack<TimeProfile*> tpstack;
+  static std::stack<TimeProfile*> tp_stack;
 
   pushProfile::pushProfile(TimeProfile &profile) : profile(profile)
   {
     profile.TPSTART(QUDA_PROFILE_TOTAL);
-    tpstack.push(&profile);
+    tp_stack.push(&profile);
   }
 
   pushProfile::~pushProfile()
   {
-    if (tpstack.empty()) errorQuda("popProfile() called with empty stack");
-    auto &profile = *(tpstack.top());
+    if (tp_stack.empty()) errorQuda("popProfile() called with empty stack");
+    auto &profile = *(tp_stack.top());
     if (&(this->profile) != &profile) errorQuda("Popped profile is not the expected one");
-    tpstack.pop();
+    tp_stack.pop();
     profile.TPSTOP(QUDA_PROFILE_TOTAL);
   }
 
   TimeProfile& getProfile()
   {
-    if (tpstack.empty()) return dummy;
-    return *(tpstack.top());
+    if (tp_stack.empty()) return dummy;
+    return *(tp_stack.top());
   }
 }

From 85292b24463bfade92a4451a9b26943209cb9f92 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 13 Oct 2023 16:22:02 -0700
Subject: [PATCH 54/60] Cleanup of solver timing and flops handling: add global
 flops counter which is incremented whenever tuneLaunch is called; for solver
 gflops and timing, we now compute the time and gflops between pushing and
 popping the present interface profile, which ensures we include all operations,
 including upload/download time

---
 include/invert_quda.h      | 19 -----------------
 include/multigrid.h        |  5 -----
 include/quda.h             |  4 +++-
 include/timer.h            |  7 ++++++-
 include/tune_quda.h        | 10 +++++++++
 lib/eigensolve_quda.cpp    |  2 --
 lib/gauge_fix_fft.cu       | 10 ++++-----
 lib/interface_quda.cpp     | 40 ++++++++++++------------------------
 lib/inv_bicgstab_quda.cpp  | 14 -------------
 lib/inv_bicgstabl_quda.cpp | 11 ----------
 lib/inv_ca_cg.cpp          | 27 +-----------------------
 lib/inv_ca_gcr.cpp         | 25 +----------------------
 lib/inv_cg3_quda.cpp       | 13 +-----------
 lib/inv_cg_quda.cpp        | 42 +-------------------------------------
 lib/inv_eigcg_quda.cpp     | 24 ----------------------
 lib/inv_gcr_quda.cpp       | 15 --------------
 lib/inv_gmresdr_quda.cpp   |  9 --------
 lib/inv_mr_quda.cpp        | 15 +-------------
 lib/inv_msrc_cg_quda.cpp   | 10 ---------
 lib/inv_multi_cg_quda.cpp  |  9 --------
 lib/inv_pcg_quda.cpp       | 13 ------------
 lib/multigrid.cpp          | 28 -------------------------
 lib/solver.cpp             |  3 ---
 lib/timer.cpp              | 11 +++++++++-
 lib/tune.cpp               |  6 ++++++
 25 files changed, 58 insertions(+), 314 deletions(-)

diff --git a/include/invert_quda.h b/include/invert_quda.h
index 35a21ce31c..11ac64708e 100644
--- a/include/invert_quda.h
+++ b/include/invert_quda.h
@@ -225,12 +225,6 @@ namespace quda {
     /** The type of accelerator type to use for preconditioner */
     QudaAcceleratorType accelerator_type_precondition;
 
-    /**< The time taken by the solver */
-    double secs;
-
-    /**< The Gflops rate of the solver */
-    double gflops;
-
     // Incremental EigCG solver parameters
     /**< The precision of the Ritz vectors */
     QudaPrecision precision_ritz;//also search space precision
@@ -333,8 +327,6 @@ namespace quda {
       ca_lambda_max_precondition(param.ca_lambda_max_precondition),
       schwarz_type(param.schwarz_type),
       accelerator_type_precondition(param.accelerator_type_precondition),
-      secs(param.secs),
-      gflops(param.gflops),
       precision_ritz(param.cuda_prec_ritz),
       n_ev(param.n_ev),
       m(param.max_search_dim),
@@ -422,8 +414,6 @@ namespace quda {
       ca_lambda_max_precondition(param.ca_lambda_max_precondition),
       schwarz_type(param.schwarz_type),
       accelerator_type_precondition(param.accelerator_type_precondition),
-      secs(param.secs),
-      gflops(param.gflops),
       precision_ritz(param.precision_ritz),
       n_ev(param.n_ev),
       m(param.m),
@@ -466,9 +456,6 @@ namespace quda {
       param.true_res = true_res;
       param.true_res_hq = true_res_hq;
       param.iter += iter;
-      comm_allreduce_sum(gflops);
-      param.gflops += gflops;
-      param.secs += secs;
       if (offset >= 0) {
 	param.true_res_offset[offset] = true_res_offset[offset];
         param.iter_res_offset[offset] = iter_res_offset[offset];
@@ -786,12 +773,6 @@ namespace quda {
     static void computeCAKrylovSpace(const DiracMatrix &diracm, std::vector<ColorSpinorField> &Ap,
                                      std::vector<ColorSpinorField> &p, int n_krylov, QudaCABasis basis, double m_map,
                                      double b_map, Args &&...args);
-
-    /**
-     * @brief Return flops
-     * @return flops expended by this operator
-     */
-    virtual double flops() const { return 0; }
   };
 
   /**
diff --git a/include/multigrid.h b/include/multigrid.h
index 82a46998c4..e5981baac2 100644
--- a/include/multigrid.h
+++ b/include/multigrid.h
@@ -486,11 +486,6 @@ namespace quda {
     */
     void buildFreeVectors(std::vector<ColorSpinorField*> &B);
 
-    /**
-       @brief Return the total flops done on this and all coarser levels.
-     */
-    double flops() const;
-
     /**
       @brief Return if we're on a fine grid right now
     */
diff --git a/include/quda.h b/include/quda.h
index d6e9ee66aa..b2ddefa72c 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -1760,8 +1760,10 @@ extern "C" {
     int delete_2link;
     /** Set if the input spinor is on a time slice **/
     int t0;
+    /** Time taken for the smearing operations **/
+    double secs;
     /** Flops count for the smearing operations **/
-    int gflops;
+    double gflops;
     
   } QudaQuarkSmearParam;
 
diff --git a/include/timer.h b/include/timer.h
index 2de1829c18..8402deb89c 100644
--- a/include/timer.h
+++ b/include/timer.h
@@ -235,8 +235,13 @@ namespace quda {
      the profile stack, and be popped when its destructor is called.
    */
   struct pushProfile {
+    static inline double secs_dummy = 0;
+    static inline double gflops_dummy = 0;
     TimeProfile &profile;
-    pushProfile(TimeProfile &profile);
+    double &secs;
+    double &gflops;
+    uint64_t flops;
+    pushProfile(TimeProfile &profile, double &secs = secs_dummy, double &gflops = gflops_dummy);
     virtual ~pushProfile();
   };
 
diff --git a/include/tune_quda.h b/include/tune_quda.h
index ff99826149..2750e57e9c 100644
--- a/include/tune_quda.h
+++ b/include/tune_quda.h
@@ -45,6 +45,10 @@ namespace quda {
 
   class Tunable {
 
+    friend TuneParam tuneLaunch(Tunable &, QudaTune, QudaVerbosity);
+    static inline uint64_t _flops_global = 0;
+    static inline uint64_t _bytes_global = 0;
+
   protected:
     virtual long long flops() const { return 0; }
     virtual long long bytes() const { return 0; }
@@ -340,6 +344,12 @@ namespace quda {
 
     qudaError_t launchError() const { return launch_error; }
     qudaError_t &launchError() { return launch_error; }
+
+    static void flops_global(uint64_t value) { _flops_global = value; } // overwrite the class-wide flop counter
+    static uint64_t flops_global() { return _flops_global; } // read the class-wide flop counter
+
+    static void bytes_global(uint64_t value) { _bytes_global = value; } // overwrite the class-wide byte counter
+    static uint64_t bytes_global() { return _bytes_global; } // read the class-wide byte counter
   };
 
   /**
diff --git a/lib/eigensolve_quda.cpp b/lib/eigensolve_quda.cpp
index 00c888dd88..710d6ac13a 100644
--- a/lib/eigensolve_quda.cpp
+++ b/lib/eigensolve_quda.cpp
@@ -259,8 +259,6 @@ namespace quda
       io.save(kSpace, save_prec, n_eig);
     }
 
-    mat.flops();
-
     logQuda(QUDA_SUMMARIZE, "********************************\n");
     logQuda(QUDA_SUMMARIZE, "***** END QUDA EIGENSOLVER *****\n");
     logQuda(QUDA_SUMMARIZE, "********************************\n");
diff --git a/lib/gauge_fix_fft.cu b/lib/gauge_fix_fft.cu
index 1de3980332..fea9a92623 100644
--- a/lib/gauge_fix_fft.cu
+++ b/lib/gauge_fix_fft.cu
@@ -217,7 +217,7 @@ namespace quda {
     GaugeFixQuality<decltype(argQ)> gfixquality(argQ, data);
     gfixquality.apply(device::get_default_stream());
     double action0 = argQ.getAction();
-    if(getVerbosity() >= QUDA_SUMMARIZE) printf("Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta());
+    logQuda(QUDA_SUMMARIZE, "Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta());
 
     double diff = 0.0;
     int iter = 0;
@@ -289,7 +289,7 @@ namespace quda {
       if ( autotune && ((action - action0) < -1e-14) ) {
         if ( arg.alpha > 0.01 ) {
           arg.alpha = 0.95 * arg.alpha;
-          if(getVerbosity() >= QUDA_SUMMARIZE) printf(">>>>>>>>>>>>>> Warning: changing alpha down -> %.4e\n", arg.alpha);
+          logQuda(QUDA_SUMMARIZE, ">>>>>>>>>>>>>> Warning: changing alpha down -> %.4e\n", arg.alpha);
         }
       }
       //------------------------------------------------------------------------
@@ -356,7 +356,7 @@ namespace quda {
     
     gflops = (gflops * 1e-9) / (secs);
     gbytes = gbytes / (secs * 1e9);
-    if (getVerbosity() > QUDA_SUMMARIZE) printfQuda("Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
+    logQuda(QUDA_SUMMARIZE, "Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
 
     host_free(num_failures_h);
   }
@@ -366,10 +366,10 @@ namespace quda {
                    double alpha, int autotune, double tolerance, int stopWtheta)
     {
       if (gauge_dir != 3) {
-	if (getVerbosity() > QUDA_SUMMARIZE) printfQuda("Starting Landau gauge fixing with FFTs...\n");
+	logQuda(QUDA_SUMMARIZE, "Starting Landau gauge fixing with FFTs...\n");
         gaugeFixingFFT<Float, recon, 4>(data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
       } else {
-	if (getVerbosity() > QUDA_SUMMARIZE) printfQuda("Starting Coulomb gauge fixing with FFTs...\n");
+	logQuda(QUDA_SUMMARIZE, "Starting Coulomb gauge fixing with FFTs...\n");
         gaugeFixingFFT<Float, recon, 3>(data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
       }
     }
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index f05e633d51..6286bb04a9 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -1773,7 +1773,7 @@ namespace quda {
 
 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
 {
-  auto profile = pushProfile(profileDslash);
+  auto profile = pushProfile(profileDslash, inv_param->secs, inv_param->gflops);
 
   const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
 
@@ -2141,12 +2141,13 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
 void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param)
 {
   if (!initialized) errorQuda("QUDA not initialized");
-  auto profile = pushProfile(profileEigensolve);
 
   // Transfer the inv param structure contained in eig_param.
   // This will define the operator to be eigensolved.
   QudaInvertParam *inv_param = eig_param->invert_param;
 
+  auto profile = pushProfile(profileEigensolve, inv_param->secs, inv_param->gflops);
+
   // QUDA can employ even-odd preconditioning to an operator.
   // For the eigensolver the solution type must match
   // the solve type, i.e., there is no full solution reconstruction
@@ -2179,9 +2180,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
   // Check that the gauge field is valid
   GaugeField *cudaGauge = checkGauge(inv_param);
 
-  // Set all timing statistics to zero
-  inv_param->secs = 0;
-  inv_param->gflops = 0;
+  // Reset the iteration count (secs and gflops are reported through the pushProfile object above)
   inv_param->iter = 0;
 
   // Dump all eigensolver and invert param variables to stdout if requested.
@@ -2331,8 +2330,6 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr
     errorQuda("Outer MG solver can only use QUDA_DIRECT_SOLVE at present");
 
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaMultigridParam(&mg_param);
-  mg_param.secs = 0;
-  mg_param.gflops = 0;
 
   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
     (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
@@ -2395,7 +2392,7 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr
 
 void* newMultigridQuda(QudaMultigridParam *mg_param) {
   profilerStart(__func__);
-  auto profile = pushProfile(profileInvert);
+  auto profile = pushProfile(profileInvert, mg_param->secs, mg_param->gflops);
   pushVerbosity(mg_param->invert_param->verbosity);
 
   auto *mg = new multigrid_solver(*mg_param, profileInvert);
@@ -2414,7 +2411,7 @@ void destroyMultigridQuda(void *mg) {
 void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
 {
   profilerStart(__func__);
-  auto profile = pushProfile(profileInvert);
+  auto profile = pushProfile(profileInvert, mg_param->secs, mg_param->gflops);
   pushVerbosity(mg_param->invert_param->verbosity);
 
   profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE);
@@ -2526,7 +2523,7 @@ void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
 void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
 {
   profilerStart(__func__);
-  auto profile = pushProfile(profileInvert);
+  auto profile = pushProfile(profileInvert, mg_param->secs, mg_param->gflops);
   pushVerbosity(mg_param->invert_param->verbosity);
 
   auto *mg = static_cast<multigrid_solver*>(mg_);
@@ -2547,8 +2544,6 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
   if (param->inv_type != QUDA_EIGCG_INVERTER && param->inv_type != QUDA_INC_EIGCG_INVERTER) return;
 
   GaugeField *cudaGauge = checkGauge(param);
-  eig_param.secs   = 0;
-  eig_param.gflops = 0;
 
   DiracParam diracParam;
   if(eig_param.cuda_prec_ritz == param->cuda_prec)
@@ -2602,7 +2597,7 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
 }
 
 void* newDeflationQuda(QudaEigParam *eig_param) {
-  auto profile = pushProfile(profileInvert);
+  auto profile = pushProfile(profileInvert, eig_param->secs, eig_param->gflops);
   auto *defl = new deflated_solver(*eig_param, profileInvert);
   saveProfile(__func__);
   flushProfile();
@@ -2615,7 +2610,7 @@ void destroyDeflationQuda(void *df) {
 
 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
 {
-  auto profile = pushProfile(profileInvert);
+  auto profile = pushProfile(profileInvert, param->secs, param->gflops);
   profilerStart(__func__);
 
   if (!initialized) errorQuda("QUDA not initialized");
@@ -2643,8 +2638,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
     (param->solve_type == QUDA_NORMERR_PC_SOLVE);
 
-  param->secs = 0;
-  param->gflops = 0;
   param->iter = 0;
 
   Dirac *d = nullptr;
@@ -2933,9 +2926,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   }
   profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
-  if (!param->make_resident_solution) {
-    h_x = x;
-  }
+  if (!param->make_resident_solution) h_x = x;
 
   profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);
 
@@ -3032,7 +3023,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
   */
 
   profilerStart(__func__);
-  auto profile = pushProfile(profileInvertMultiSrc);
+  auto profile = pushProfile(profileInvertMultiSrc, param->secs, param->gflops);
 
   CommKey split_key = {param->split_grid[0], param->split_grid[1], param->split_grid[2], param->split_grid[3]};
   int num_sub_partition = quda::product(split_key);
@@ -3365,7 +3356,7 @@ void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param
  */
 void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
 {
-  auto profile = pushProfile(profileMulti);
+  auto profile = pushProfile(profileMulti, param->secs, param->gflops);
   profilerStart(__func__);
 
   if (!initialized) errorQuda("QUDA not initialized");
@@ -3413,9 +3404,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
     }
   }
 
-  // Timing and FLOP counters
-  param->secs = 0;
-  param->gflops = 0;
   param->iter = 0;
 
   for (int i=0; i<param->num_offset-1; i++) {
@@ -4963,7 +4951,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param,
 void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_param)
 {
   if (smear_param->n_steps == 0) return;
-  auto profile = pushProfile(profileGaussianSmear);
+  auto profile = pushProfile(profileGaussianSmear, smear_param->secs, smear_param->gflops);
 
   QudaInvertParam *inv_param = smear_param->inv_param;
 
@@ -5080,8 +5068,6 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   logQuda(QUDA_VERBOSE, "Finished 2link Gaussian smearing.\n");
   delete d;
 
-  smear_param->gflops = dirac.Flops();
-
   if (smear_param->delete_2link != 0) { freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); }
 
   saveTuneCache();
diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp
index c5d3bf90a1..4fdf08020a 100644
--- a/lib/inv_bicgstab_quda.cpp
+++ b/lib/inv_bicgstab_quda.cpp
@@ -214,10 +214,6 @@ namespace quda {
 
     PrintStats("BiCGstab", k, r2, b2, heavy_quark_res);
 
-    if (!param.is_preconditioner) { // do not do the below if we this is an inner solver
-      blas::flops = 0;
-    }
-
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
 
@@ -344,10 +340,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops())*1e-9;
-
-    param.gflops += gflops;
     param.iter += k;
 
     if (k==param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
@@ -363,12 +355,6 @@ namespace quda {
       PrintSummary("BiCGstab", k, r2, b2, stop, param.tol_hq);
     }
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-    matPrecon.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
     profile.TPSTART(QUDA_PROFILE_FREE);
diff --git a/lib/inv_bicgstabl_quda.cpp b/lib/inv_bicgstabl_quda.cpp
index 0393fe308c..b0e00d9ff5 100644
--- a/lib/inv_bicgstabl_quda.cpp
+++ b/lib/inv_bicgstabl_quda.cpp
@@ -50,7 +50,6 @@ namespace quda {
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
       profile.TPSTART(QUDA_PROFILE_EIGEN);
     }
 
@@ -61,7 +60,6 @@ namespace quda {
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_EIGEN);
-      param.secs += profile.Last(QUDA_PROFILE_EIGEN);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
     }
 
@@ -562,7 +560,6 @@ namespace quda {
     double heavy_quark_res = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x, r_full).z) : 0.0;
     const int heavy_quark_check = param.heavy_quark_check; // how often to check the heavy quark residual
 
-    blas::flops = 0;
     //bool l2_converge = false;
     //double r2_old = r2;
 
@@ -706,9 +703,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matEig.flops()) * 1e-9;
-    param.gflops = gflops;
     param.iter += total_iter;
 
     if (total_iter >= param.maxiter) // >= if n_krylov doesn't divide max iter.
@@ -726,12 +720,7 @@ namespace quda {
       param.true_res_hq = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x, r[0]).z) : 0.0;
     }
 
-    // Reset flops counters.
-    blas::flops = 0;
-    mat.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
-    param.secs += profile.Last(QUDA_PROFILE_EPILOGUE);
 
     PrintSummary(solver_name.c_str(), total_iter, r2, b2, stop, param.tol_hq);
   }
diff --git a/lib/inv_ca_cg.cpp b/lib/inv_ca_cg.cpp
index ec95bf3ffe..445b2acaf3 100644
--- a/lib/inv_ca_cg.cpp
+++ b/lib/inv_ca_cg.cpp
@@ -184,10 +184,7 @@ namespace quda
   {
     Solver::create(x, b);
     if (!init) {
-      if (!param.is_preconditioner) {
-        blas::flops = 0;
-        profile.TPSTART(QUDA_PROFILE_INIT);
-      }
+      if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT);
 
       Q_AQandg.resize(param.Nkrylov * (param.Nkrylov + 1));
       Q_AS.resize(param.Nkrylov * param.Nkrylov);
@@ -248,7 +245,6 @@ namespace quda
   {
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
       profile.TPSTART(QUDA_PROFILE_EIGEN);
     }
 
@@ -290,7 +286,6 @@ namespace quda
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_EIGEN);
-      param.secs += profile.Last(QUDA_PROFILE_EIGEN);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
     }
   }
@@ -318,7 +313,6 @@ namespace quda
   {
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
       profile.TPSTART(QUDA_PROFILE_EIGEN);
     }
 
@@ -357,7 +351,6 @@ namespace quda
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_EIGEN);
-      param.secs += profile.Last(QUDA_PROFILE_EIGEN);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
     }
   }
@@ -522,7 +515,6 @@ namespace quda
     int resIncreaseTotal = 0;
 
     if (!param.is_preconditioner) {
-      blas::flops = 0;
       profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
     }
@@ -675,25 +667,8 @@ namespace quda
     }
 
     if (!param.is_preconditioner) {
-      qudaDeviceSynchronize(); // ensure solver is complete before ending timing
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      profile.TPSTART(QUDA_PROFILE_EPILOGUE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
-
-      // store flops and reset counters
-      double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9;
-
-      param.gflops += gflops;
       param.iter += total_iter;
-
-      // reset the flops counters
-      blas::flops = 0;
-      mat.flops();
-      matSloppy.flops();
-      matPrecon.flops();
-      matEig.flops();
-
-      profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     }
 
     PrintSummary("CA-CG", total_iter, r2, b2, stop, param.tol_hq);
diff --git a/lib/inv_ca_gcr.cpp b/lib/inv_ca_gcr.cpp
index f9e605ea86..5b893bd3fc 100644
--- a/lib/inv_ca_gcr.cpp
+++ b/lib/inv_ca_gcr.cpp
@@ -28,10 +28,7 @@ namespace quda
     Solver::create(x, b);
 
     if (!init) {
-      if (!param.is_preconditioner) {
-        blas::flops = 0;
-        profile.TPSTART(QUDA_PROFILE_INIT);
-      }
+      if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT);
 
       alpha.resize(param.Nkrylov);
 
@@ -103,7 +100,6 @@ namespace quda
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
       profile.TPSTART(QUDA_PROFILE_EIGEN);
     }
 
@@ -115,7 +111,6 @@ namespace quda
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_EIGEN);
-      param.secs += profile.Last(QUDA_PROFILE_EIGEN);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
     }
   }
@@ -268,7 +263,6 @@ namespace quda
     int resIncreaseTotal = 0;
 
     if (!param.is_preconditioner) {
-      blas::flops = 0;
       profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
     }
@@ -375,25 +369,8 @@ namespace quda
     }
 
     if (!param.is_preconditioner) {
-      qudaDeviceSynchronize(); // ensure solver is complete before ending timing
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      profile.TPSTART(QUDA_PROFILE_EPILOGUE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
-
-      // store flops and reset counters
-      double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matMdagM.flops()) * 1e-9;
-
-      param.gflops += gflops;
       param.iter += total_iter;
-
-      // reset the flops counters
-      blas::flops = 0;
-      mat.flops();
-      matSloppy.flops();
-      matPrecon.flops();
-      matMdagM.flops();
-
-      profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     }
 
     PrintSummary("CA-GCR", total_iter, r2, b2, stop, param.tol_hq);
diff --git a/lib/inv_cg3_quda.cpp b/lib/inv_cg3_quda.cpp
index 42ab22fcab..9ac9f85b9f 100644
--- a/lib/inv_cg3_quda.cpp
+++ b/lib/inv_cg3_quda.cpp
@@ -268,8 +268,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_INIT);
     profile.TPSTART(QUDA_PROFILE_PREAMBLE);
 
-    blas::flops = 0;
-
     // compute initial residual depending on whether we have an initial guess or not
     double r2;
     if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) {
@@ -474,13 +472,9 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9;
-    param.gflops = gflops;
     param.iter += k;
 
-    if (k == param.maxiter)
-      warningQuda("Exceeded maximum iterations %d", param.maxiter);
+    if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
 
     // compute the true residuals
     if (!mixed_precision && param.compute_true_res) {
@@ -491,11 +485,6 @@ namespace quda {
 
     PrintSummary("CG3", k, r2, b2, stop, param.tol_hq);
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
   }
 
diff --git a/lib/inv_cg_quda.cpp b/lib/inv_cg_quda.cpp
index 15bdf23002..40cd15ea1c 100644
--- a/lib/inv_cg_quda.cpp
+++ b/lib/inv_cg_quda.cpp
@@ -369,7 +369,6 @@ namespace quda {
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
-      blas::flops = 0;
     }
 
     int k = 0;
@@ -544,9 +543,6 @@ namespace quda {
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
       profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-      param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-      double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9;
-      param.gflops = gflops;
       param.iter += k;
 
       if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
@@ -563,15 +559,7 @@ namespace quda {
 
     PrintSummary("CG", k, r2, b2, stop, 0.0);
 
-    if (!param.is_preconditioner) {
-      // reset the flops counters
-      blas::flops = 0;
-      mat.flops();
-      matSloppy.flops();
-      matPrecon.flops();
-
-      profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
-    }
+    if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
     if (param.is_preconditioner) commGlobalReductionPop();
   }
@@ -692,7 +680,6 @@ namespace quda {
 
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
-    blas::flops = 0;
 
     int k = 0;
 
@@ -988,9 +975,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9;
-    param.gflops = gflops;
     param.iter += k;
 
     if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
@@ -1006,12 +990,6 @@ namespace quda {
 
     PrintSummary("CG", k, r2, b2, stop, param.tol_hq);
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-    matPrecon.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
   }
 
@@ -1163,7 +1141,6 @@ namespace quda {
 
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
-    blas::flops = 0;
 
     int k = 0;
 
@@ -1311,9 +1288,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops()) * 1e-9;
-    param.gflops = gflops;
     param.iter += k;
 
     if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
@@ -1332,11 +1306,6 @@ namespace quda {
       PrintSummary("CG", k, r2(i, i).real(), b2[i], stop[i], 0.0);
     }
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     profile.TPSTART(QUDA_PROFILE_FREE);
 
@@ -1533,7 +1502,6 @@ void CG::solve(ColorSpinorField& x, ColorSpinorField& b) {
 
   profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
   profile.TPSTART(QUDA_PROFILE_COMPUTE);
-  blas::flops = 0;
 
   int k = 0;
 
@@ -1879,9 +1847,6 @@ void CG::solve(ColorSpinorField& x, ColorSpinorField& b) {
   profile.TPSTOP(QUDA_PROFILE_COMPUTE);
   profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-  param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-  double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9;
-  param.gflops = gflops;
   param.iter += k;
 
   if (k == param.maxiter)
@@ -1901,11 +1866,6 @@ void CG::solve(ColorSpinorField& x, ColorSpinorField& b) {
     PrintSummary("CG", k, r2(i,i).real(), b2[i], stop[i], 0.0);
   }
 
-  // reset the flops counters
-  blas::flops = 0;
-  mat.flops();
-  matSloppy.flops();
-
   profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
   profile.TPSTART(QUDA_PROFILE_FREE);
 
diff --git a/lib/inv_eigcg_quda.cpp b/lib/inv_eigcg_quda.cpp
index 57a963a20e..decb31af7e 100644
--- a/lib/inv_eigcg_quda.cpp
+++ b/lib/inv_eigcg_quda.cpp
@@ -179,11 +179,7 @@ namespace quda {
     inner.delta = 1e-20; // no reliable updates within the inner solver
     inner.precision = outer.precision_precondition; // preconditioners are uni-precision solvers
     inner.precision_sloppy = outer.precision_precondition;
-
     inner.iter   = 0;
-    inner.gflops = 0;
-    inner.secs   = 0;
-
     inner.inv_type_precondition = QUDA_INVALID_INVERTER;
     inner.is_preconditioner = true; // used to tell the inner solver it is an inner solver
 
@@ -193,9 +189,6 @@ namespace quda {
   // set the required parameters for the initCG solver
   static void fillInitCGSolverParam(SolverParam &inner, const SolverParam &outer) {
     inner.iter   = 0;
-    inner.gflops = 0;
-    inner.secs   = 0;
-
     inner.tol              = outer.tol;
     inner.tol_restart      = outer.tol_restart;
     inner.maxiter          = outer.maxiter;
@@ -460,7 +453,6 @@ namespace quda {
 
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
-    blas::flops = 0;
 
     double rMinvr = blas::reDotProduct(r,*z);
     //Begin EigCG iterations:
@@ -517,9 +509,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + matSloppy.flops())*1e-9;
-    param.gflops = gflops;
     param.iter += k;
 
     if (k == param.maxiter)
@@ -532,10 +521,6 @@ namespace quda {
 
     PrintSummary("eigCG", k, r2, b2, args.global_stop, param.tol_hq);
 
-    // reset the flops counters
-    blas::flops = 0;
-    matSloppy.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     profile.TPSTART(QUDA_PROFILE_FREE);
 
@@ -588,20 +573,11 @@ namespace quda {
       xProj = x;
       rProj = r; 
 
-      if(getVerbosity() >= QUDA_VERBOSE) printfQuda("\ninitCG stat: %i iter / %g secs = %g Gflops. \n", Kparam.iter, Kparam.secs, Kparam.gflops);
-
       Kparam.tol *= param.inc_tol;
 
       if(restart_idx == (param.max_restart_num-1)) Kparam.tol = full_tol;//do the last solve in the next cycle to full tolerance
-
-      param.secs   += Kparam.secs;
     }
 
-    if(getVerbosity() >= QUDA_VERBOSE) printfQuda("\ninitCG stat: %i iter / %g secs = %g Gflops. \n", Kparam.iter, Kparam.secs, Kparam.gflops);
-    //
-    param.secs   += Kparam.secs;
-    param.gflops += Kparam.gflops;
-
     k   += Kparam.iter;
 
     delete rp;
diff --git a/lib/inv_gcr_quda.cpp b/lib/inv_gcr_quda.cpp
index 9227f573b3..3caf952929 100644
--- a/lib/inv_gcr_quda.cpp
+++ b/lib/inv_gcr_quda.cpp
@@ -276,8 +276,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_INIT);
     profile.TPSTART(QUDA_PROFILE_PREAMBLE);
 
-    blas::flops = 0;
-
     blas::copy(r_sloppy, r);
 
     int total_iter = 0;
@@ -386,11 +384,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
-
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matMdagM.flops()) * 1e-9;
-    if (K) gflops += K->flops()*1e-9;
-
     if (k >= param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
 
     logQuda(QUDA_VERBOSE, "GCR: number of restarts = %d\n", restart);
@@ -410,16 +403,8 @@ namespace quda {
       if (0) blas::copy(b, K ? r_sloppy : p[k_break]);
     }
 
-    param.gflops += gflops;
     param.iter += total_iter;
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-    matPrecon.flops();
-    matMdagM.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     profile.TPSTART(QUDA_PROFILE_FREE);
 
diff --git a/lib/inv_gmresdr_quda.cpp b/lib/inv_gmresdr_quda.cpp
index 389206853e..ab56176a2a 100644
--- a/lib/inv_gmresdr_quda.cpp
+++ b/lib/inv_gmresdr_quda.cpp
@@ -143,8 +143,6 @@ namespace quda {
     inner.precision_sloppy = outer.precision_precondition;
 
     inner.iter = 0;
-    inner.gflops = 0;
-    inner.secs = 0;
 
     inner.inv_type_precondition = QUDA_INVALID_INVERTER;
     inner.is_preconditioner = true;
@@ -469,7 +467,6 @@ namespace quda {
 
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
-    blas::flops = 0;
 
     const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
 
@@ -549,9 +546,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops()) * 1e-9;
-    param.gflops = gflops;
     param.iter += tot_iters;
 
     mat(r, x);
@@ -560,9 +554,6 @@ namespace quda {
 
     PrintSummary("FGMResDR:", tot_iters, r2, b2, stop, param.tol_hq);
 
-    blas::flops = 0;
-    mat.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
     param.rhs_idx += 1;
diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp
index 4f636bf279..cc69c3cd14 100644
--- a/lib/inv_mr_quda.cpp
+++ b/lib/inv_mr_quda.cpp
@@ -62,10 +62,7 @@ namespace quda
 
     create(x, b); // allocate fields
 
-    if (!param.is_preconditioner) {
-      blas::flops = 0;
-      profile.TPSTART(QUDA_PROFILE_COMPUTE);
-    }
+    if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_COMPUTE);
 
     double b2 = blas::norm2(b); // Save norm of b
     double r2 = 0.0;            // if zero source then we will exit immediately doing no work
@@ -160,17 +157,7 @@ namespace quda
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      profile.TPSTART(QUDA_PROFILE_EPILOGUE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
-
-      // store flops and reset counters
-      double gflops = (blas::flops + mat.flops() + matSloppy.flops()) * 1e-9;
-
-      param.gflops += gflops;
       param.iter += iter;
-      blas::flops = 0;
-
-      profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     }
   }
 
diff --git a/lib/inv_msrc_cg_quda.cpp b/lib/inv_msrc_cg_quda.cpp
index 9a64386095..70bcb9a089 100644
--- a/lib/inv_msrc_cg_quda.cpp
+++ b/lib/inv_msrc_cg_quda.cpp
@@ -146,7 +146,6 @@ namespace quda {
 
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
-    blas::flops = 0;
 
     int k=0;
 
@@ -315,10 +314,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9;
-    reduceDouble(gflops);
-    param.gflops = gflops;
     param.iter += k;
 
     if (k==param.maxiter)
@@ -334,11 +329,6 @@ namespace quda {
 
     PrintSummary("CG", k, r2, b2, stop, inv.tol_hq);
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     profile.TPSTART(QUDA_PROFILE_FREE);
 
diff --git a/lib/inv_multi_cg_quda.cpp b/lib/inv_multi_cg_quda.cpp
index b6757440f4..ada5a3326a 100644
--- a/lib/inv_multi_cg_quda.cpp
+++ b/lib/inv_multi_cg_quda.cpp
@@ -262,7 +262,6 @@ namespace quda {
 
     int k = 0;
     int rUpdate = 0;
-    blas::flops = 0;
 
     // now create the worker class for updating the shifted solutions and gradient vectors
     bool aux_update = false;
@@ -443,9 +442,6 @@ namespace quda {
     logQuda(QUDA_VERBOSE, "Reliable updates = %d\n", rUpdate);
     if (k==param.maxiter) warningQuda("Exceeded maximum iterations %d\n", param.maxiter);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9;
-    param.gflops = gflops;
     param.iter += k;
 
     if (param.compute_true_res) {
@@ -490,11 +486,6 @@ namespace quda {
       }
     }
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     popOutputPrefix();
   }
diff --git a/lib/inv_pcg_quda.cpp b/lib/inv_pcg_quda.cpp
index 24d9259ef8..2fe62692de 100644
--- a/lib/inv_pcg_quda.cpp
+++ b/lib/inv_pcg_quda.cpp
@@ -203,8 +203,6 @@ namespace quda
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
 
-    blas::flops = 0;
-
     int k = 0;
     PrintStats("PCG", k, r2, b2, heavy_quark_res);
 
@@ -378,10 +376,6 @@ namespace quda
     if (mixed()) copy(x, x_sloppy);
     xpy(y, x); // x += y
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9;
-    if (K) gflops += K->flops() * 1e-9;
-    param.gflops = gflops;
     param.iter += k;
 
     if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
@@ -393,13 +387,6 @@ namespace quda
     double true_res = xmyNorm(b, r);
     param.true_res = sqrt(true_res / b2);
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-    matPrecon.flops();
-    matEig.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
   }
 
diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp
index 4defe3878f..475a1d9b3b 100644
--- a/lib/multigrid.cpp
+++ b/lib/multigrid.cpp
@@ -804,34 +804,6 @@ namespace quda
     popLevel();
   }
 
-  // FIXME need to make this more robust (implement Solver::flops() for all solvers)
-  double MG::flops() const {
-    double flops = 0;
-
-    if (param_coarse_solver) {
-      flops += param_coarse_solver->gflops * 1e9;
-      param_coarse_solver->gflops = 0;
-    } else if (param.level < param.Nlevel-1) {
-      flops += coarse->flops();
-    }
-
-    if (param_presmooth) {
-      flops += param_presmooth->gflops * 1e9;
-      param_presmooth->gflops = 0;
-    }
-
-    if (param_postsmooth) {
-      flops += param_postsmooth->gflops * 1e9;
-      param_postsmooth->gflops = 0;
-    }
-
-    if (transfer) {
-      flops += transfer->flops();
-    }
-
-    return flops;
-  }
-
   bool check_deviation(double deviation, double tol)
   {
     return (deviation > tol || std::isnan(deviation) || std::isinf(deviation));
diff --git a/lib/solver.cpp b/lib/solver.cpp
index 8b734e8bc5..12cce8f532 100644
--- a/lib/solver.cpp
+++ b/lib/solver.cpp
@@ -223,9 +223,6 @@ namespace quda {
       = (outer.inv_type_precondition == QUDA_MR_INVERTER) ? QUDA_INVALID_RESIDUAL : QUDA_L2_RELATIVE_RESIDUAL;
 
     inner.iter = 0;
-    inner.gflops = 0;
-    inner.secs = 0;
-
     inner.inv_type_precondition = QUDA_INVALID_INVERTER;
     inner.is_preconditioner = true; // tell inner solver it is a preconditioner
     inner.pipeline = true;
diff --git a/lib/timer.cpp b/lib/timer.cpp
index 2c6f9b21c3..125b9242d4 100644
--- a/lib/timer.cpp
+++ b/lib/timer.cpp
@@ -1,6 +1,7 @@
 #include <stack>
 #include <quda_internal.h>
 #include <timer.h>
+#include <tune_quda.h>
 
 #ifdef INTERFACE_NVTX
 #include "nvtx3/nvToolsExt.h"
@@ -225,7 +226,12 @@ namespace quda {
 
   static std::stack<TimeProfile*> tp_stack;
 
-  pushProfile::pushProfile(TimeProfile &profile) : profile(profile)
+  pushProfile::pushProfile(TimeProfile &profile, double &secs, double &gflops) :
+    profile(profile),
+    secs(secs),
+    gflops(gflops),
+    flops(Tunable::flops_global())
+
   {
     profile.TPSTART(QUDA_PROFILE_TOTAL);
     tp_stack.push(&profile);
@@ -238,6 +244,9 @@ namespace quda {
     if (&(this->profile) != &profile) errorQuda("Popped profile is not the expected one");
     tp_stack.pop();
     profile.TPSTOP(QUDA_PROFILE_TOTAL);
+    secs = profile.Last(QUDA_PROFILE_TOTAL);
+    gflops = (Tunable::flops_global() - flops) * 1e-9;
+    if (&gflops != &gflops_dummy) comm_allreduce_sum(gflops);
   }
 
   TimeProfile& getProfile()
diff --git a/lib/tune.cpp b/lib/tune.cpp
index fea2a7b509..608a77e3c9 100644
--- a/lib/tune.cpp
+++ b/lib/tune.cpp
@@ -890,6 +890,8 @@ namespace quda
         trace_list.push_back(trace_entry);
       }
 
+      Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter
+      Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter
       return param_tuned;
     }
 
@@ -908,6 +910,8 @@ namespace quda
       logQuda(QUDA_DEBUG_VERBOSE, "Launching %s with %s at vol=%s with %s (untuned)\n", key.name, key.aux, key.volume,
               tunable.paramString(param_default).c_str());
 
+      Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter
+      Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter
       return param_default;
     } else if (!tuning) {
 
@@ -1121,6 +1125,8 @@ namespace quda
 
     param.n_calls = profile_count ? 1 : 0;
 
+    Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter
+    Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter
     return param;
   }
 

From fe2979807f5d2fda0f10fcda6c1313cc722c72a8 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 13 Oct 2023 16:22:31 -0700
Subject: [PATCH 55/60] Report MG setup time and performance in invert_test and
 staggered_invert_test

---
 tests/invert_test.cpp           | 2 ++
 tests/staggered_invert_test.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp
index 9de2ecfe83..20af8bd390 100644
--- a/tests/invert_test.cpp
+++ b/tests/invert_test.cpp
@@ -214,6 +214,8 @@ std::vector<std::array<double, 2>> solve(test_t param)
     if (use_split_grid) { errorQuda("Split grid does not work with MG yet."); }
     mg_preconditioner = newMultigridQuda(&mg_param);
     inv_param.preconditioner = mg_preconditioner;
+
+    printfQuda("MG Setup Done: %g secs, %g Gflops\n", mg_param.secs, mg_param.gflops / mg_param.secs);
   }
 
   // Vector construct START
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index ea5aab17fd..0d9f2c3e5e 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -217,6 +217,8 @@ void test(int argc, char **argv)
     if (use_split_grid) { errorQuda("Split grid does not work with MG yet."); }
     mg_preconditioner = newMultigridQuda(&mg_param);
     inv_param.preconditioner = mg_preconditioner;
+
+    printfQuda("MG Setup Done: %g secs, %g Gflops\n", mg_param.secs, mg_param.gflops / mg_param.secs);
   }
 
   // Staggered vector construct START

From d3649dd16304446c90da5bfa7f8de842ea08f64f Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 16 Oct 2023 10:46:13 -0700
Subject: [PATCH 56/60] Remove legacy blas flop and byte counting

---
 include/blas_quda.h                |  3 ---
 lib/blas_quda.cu                   |  6 ------
 lib/coarse_op_preconditioned.in.cu |  4 ++--
 lib/multi_blas_quda.cu             |  3 ---
 lib/multi_reduce_quda.cu           |  3 ---
 lib/reduce_quda.cu                 |  3 ---
 lib/staggered_kd_build_xinv.cu     | 19 ++++++++-----------
 tests/blas_test.cpp                | 11 ++++++-----
 8 files changed, 16 insertions(+), 36 deletions(-)

diff --git a/include/blas_quda.h b/include/blas_quda.h
index 8df40df452..07b09f1209 100644
--- a/include/blas_quda.h
+++ b/include/blas_quda.h
@@ -23,9 +23,6 @@ namespace quda {
 
     void setParam(int kernel, int prec, int threads, int blocks);
 
-    extern unsigned long long flops;
-    extern unsigned long long bytes;
-
     inline void zero(cvector_ref<ColorSpinorField> &x)
     {
       for (auto i = 0u; i < x.size(); i++) x[i].zero();
diff --git a/lib/blas_quda.cu b/lib/blas_quda.cu
index 4c8719f309..f84f2eeb59 100644
--- a/lib/blas_quda.cu
+++ b/lib/blas_quda.cu
@@ -7,9 +7,6 @@ namespace quda {
 
   namespace blas {
 
-    unsigned long long flops;
-    unsigned long long bytes;
-
     template <template <typename real> class Functor, typename store_t, typename y_store_t,
               int nSpin, typename coeff_t>
     class Blas : public TunableGridStrideKernel2D
@@ -56,9 +53,6 @@ namespace quda {
         }
 
         apply(device::get_default_stream());
-
-        blas::bytes += bytes();
-        blas::flops += flops();
       }
 
       TuneKey tuneKey() const override { return TuneKey(vol, typeid(f).name(), aux); }
diff --git a/lib/coarse_op_preconditioned.in.cu b/lib/coarse_op_preconditioned.in.cu
index 833b3f55ad..7f487d0b9b 100644
--- a/lib/coarse_op_preconditioned.in.cu
+++ b/lib/coarse_op_preconditioned.in.cu
@@ -175,7 +175,7 @@ namespace quda
       GaugeField *X_aos = create_gauge_copy(X, true);
       Xinv_aos = create_gauge_copy(Xinv, false);
 
-      blas::flops += invert(Xinv_aos->data(), X_aos->data(), n, X_aos->Volume(), X_aos->Precision(), X.Location());
+      Tunable::flops_global(invert(Xinv_aos->data(), X_aos->data(), n, X_aos->Volume(), X_aos->Precision(), X.Location()) + Tunable::flops_global());
 
       if (&Xinv != Xinv_aos) {
         if (Xinv.Precision() < QUDA_SINGLE_PRECISION) Xinv.Scale(Xinv_aos->abs_max());
@@ -186,7 +186,7 @@ namespace quda
       if (!use_mma) { delete Xinv_aos; }
 
     } else if (X.Location() == QUDA_CPU_FIELD_LOCATION && X.Order() == QUDA_QDP_GAUGE_ORDER) {
-      blas::flops += invert(Xinv.data<void *>(0), X.data<void *>(0), n, X.Volume(), X.Precision(), X.Location());
+      Tunable::flops_global(invert(Xinv.data<void *>(0), X.data<void *>(0), n, X.Volume(), X.Precision(), X.Location()) + Tunable::flops_global());
     } else {
       errorQuda("Unsupported location=%d and order=%d", X.Location(), X.Order());
     }
diff --git a/lib/multi_blas_quda.cu b/lib/multi_blas_quda.cu
index 5020f7b109..002bb48235 100644
--- a/lib/multi_blas_quda.cu
+++ b/lib/multi_blas_quda.cu
@@ -81,9 +81,6 @@ namespace quda {
 #endif
 
         apply(device::get_default_stream());
-
-        blas::bytes += bytes();
-        blas::flops += flops();
       }
 
       TuneKey tuneKey() const override { return TuneKey(vol, typeid(f).name(), aux); }
diff --git a/lib/multi_reduce_quda.cu b/lib/multi_reduce_quda.cu
index f93ab431e4..8fcf9d080b 100644
--- a/lib/multi_reduce_quda.cu
+++ b/lib/multi_reduce_quda.cu
@@ -97,9 +97,6 @@ namespace quda {
         if (is_norm) strcat(aux, ",norm");
 
         apply(device::get_default_stream());
-
-        blas::bytes += bytes();
-        blas::flops += flops();
       }
 
       TuneKey tuneKey() const override { return TuneKey(vol, typeid(r).name(), aux); }
diff --git a/lib/reduce_quda.cu b/lib/reduce_quda.cu
index 7b61c78b6e..4a77ec79ee 100644
--- a/lib/reduce_quda.cu
+++ b/lib/reduce_quda.cu
@@ -62,9 +62,6 @@ namespace quda {
         }
 
         apply(device::get_default_stream());
-
-        blas::bytes += bytes();
-        blas::flops += flops();
       }
 
       TuneKey tuneKey() const override { return TuneKey(vol, typeid(r).name(), aux); }
diff --git a/lib/staggered_kd_build_xinv.cu b/lib/staggered_kd_build_xinv.cu
index b1195d9f4e..1d93de4389 100644
--- a/lib/staggered_kd_build_xinv.cu
+++ b/lib/staggered_kd_build_xinv.cu
@@ -58,7 +58,7 @@ namespace quda {
       // reset scales as appropriate
       if constexpr (sizeof(Float) < QUDA_SINGLE_PRECISION) {
         double max_scale = g.abs_max();
-        if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Global U_max = %e\n", max_scale);
+        logQuda(QUDA_VERBOSE, "Global U_max = %e\n", max_scale);
         X.Scale(max_scale > 2.0*mass ? max_scale : 2.0*mass);
       }
 
@@ -215,11 +215,11 @@ namespace quda {
     GaugeField& X = *tmp_X;
 
     // Step 4: Calculate X from U
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing the KD block on the %s\n", location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");
+    logQuda(QUDA_VERBOSE, "Computing the KD block on the %s\n", location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");
 
     calculateStaggeredKDBlock(X, U, mass);
 
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("X2 = %e\n", X.norm2(0));
+    logQuda(QUDA_VERBOSE, "X2 = %e\n", X.norm2(0));
 
     // Step 5: Calculate Xinv
     if (dagger_approximation) {
@@ -237,14 +237,13 @@ namespace quda {
         
         X_.copy(X);
 
-        blas::flops += invert(xInvMilcOrder->data(), X_.data(), n, X_.Volume(), X_.Precision(), X.Location());
+        Tunable::flops_global(invert(xInvMilcOrder->data(), X_.data(), n, X_.Volume(), X_.Precision(), X.Location()) + Tunable::flops_global());
 
       } else if (location == QUDA_CPU_FIELD_LOCATION) {
-
-        blas::flops += invert(xInvMilcOrder->data(), X.data(), n, X.Volume(), X.Precision(), X.Location());
+        Tunable::flops_global(invert(xInvMilcOrder->data(), X.data(), n, X.Volume(), X.Precision(), X.Location()) + Tunable::flops_global());
       }
 
-      if (getVerbosity() >= QUDA_VERBOSE) printfQuda("xInvMilcOrder = %e\n", xInvMilcOrder->norm2(0));
+      logQuda(QUDA_VERBOSE, "xInvMilcOrder = %e\n", xInvMilcOrder->norm2(0));
 
     }
 
@@ -252,10 +251,8 @@ namespace quda {
     // last two parameters: dagger approximation, mass (which becomes a scale in the dagger approx)
     ReorderStaggeredKahlerDiracInverse(Xinv, *xInvMilcOrder, dagger_approximation, mass);
 
-    if (getVerbosity() >= QUDA_VERBOSE) {
-      if (dagger_approximation) printfQuda("Using the dagger approximation to Xinv\n");
-      printfQuda("xInvKdGeometry = %e\n", Xinv.norm2());
-    }
+    if (dagger_approximation) logQuda(QUDA_VERBOSE, "Using the dagger approximation to Xinv\n");
+    logQuda(QUDA_VERBOSE, "xInvKdGeometry = %e\n", Xinv.norm2());
   }
 
 
diff --git a/tests/blas_test.cpp b/tests/blas_test.cpp
index 713ffb5d0a..b2315c3f70 100644
--- a/tests/blas_test.cpp
+++ b/tests/blas_test.cpp
@@ -9,6 +9,8 @@
 #include <host_utils.h>
 #include <command_line_params.h>
 
+#include <tune_quda.h>
+
 // include because of nasty globals used in the tests
 #include <dslash_reference.h>
 
@@ -1152,14 +1154,13 @@ TEST_P(BlasTest, benchmark)
   // do the initial tune
   benchmark(kernel, 1);
 
-  // now rerun with more iterations to get accurate speed measurements
-  quda::blas::flops = 0;
-  quda::blas::bytes = 0;
+  auto flops0 = quda::Tunable::flops_global();
+  auto bytes0 = quda::Tunable::bytes_global();
 
   double secs = benchmark(kernel, niter);
 
-  double gflops = (quda::blas::flops * 1e-9) / (secs);
-  double gbytes = quda::blas::bytes / (secs * 1e9);
+  double gflops = (quda::Tunable::flops_global() - flops0) * 1e-9 / secs;
+  double gbytes = (quda::Tunable::bytes_global() - bytes0) / (secs * 1e9);
   RecordProperty("Gflops", std::to_string(gflops));
   RecordProperty("GBs", std::to_string(gbytes));
   printfQuda("%-31s: Gflop/s = %6.1f, GB/s = %6.1f\n", kernel_map.at(kernel).c_str(), gflops, gbytes);

From 0a413c7279aade95932e29dda68665ad1c6da06d Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 16 Oct 2023 15:26:44 -0700
Subject: [PATCH 57/60] Remove legacy Dirac flop counter and switch to using
 QUDA's global flops counter

---
 include/dirac_quda.h                  |  13 --
 lib/dirac.cpp                         |   3 -
 lib/dirac_clover.cpp                  |   6 -
 lib/dirac_clover_hasenbusch_twist.cpp |  14 +-
 lib/dirac_coarse.cpp                  |  18 ---
 lib/dirac_domain_wall.cpp             |  15 --
 lib/dirac_domain_wall_4d.cpp          |  25 ----
 lib/dirac_improved_staggered.cpp      |   7 -
 lib/dirac_improved_staggered_kd.cpp   |   6 -
 lib/dirac_mobius.cpp                  | 196 ++------------------------
 lib/dirac_staggered.cpp               |   6 -
 lib/dirac_staggered_kd.cpp            |   7 +-
 lib/dirac_twisted_clover.cpp          |  18 ---
 lib/dirac_twisted_mass.cpp            |  12 --
 lib/dirac_wilson.cpp                  |   3 -
 lib/gauge_covdev.cpp                  |   1 -
 lib/gauge_laplace.cpp                 |   2 -
 tests/dslash_test_utils.h             |  26 +++-
 tests/multigrid_benchmark_test.cpp    |  10 +-
 tests/staggered_dslash_test_utils.h   |  17 ++-
 20 files changed, 54 insertions(+), 351 deletions(-)

diff --git a/include/dirac_quda.h b/include/dirac_quda.h
index cfbba175d5..6ce01ea578 100644
--- a/include/dirac_quda.h
+++ b/include/dirac_quda.h
@@ -174,7 +174,6 @@ namespace quda {
     int laplace3D;
     QudaMatPCType matpcType;
     mutable QudaDagType dagger; // mutable to simplify implementation of Mdag
-    mutable unsigned long long flops;
     QudaDiracType type;
     mutable QudaPrecision halo_precision; // only does something for DiracCoarse at present
 
@@ -404,16 +403,6 @@ namespace quda {
     */
     virtual bool AllowTruncation() const { return false; }
 
-    /**
-       @brief  returns and then zeroes flopcount
-    */
-    unsigned long long Flops() const
-    {
-      unsigned long long rtn = flops;
-      flops = 0;
-      return rtn;
-    }
-
     /**
        @brief returns preconditioning type
     */
@@ -2243,8 +2232,6 @@ namespace quda {
      */
     virtual void operator()(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const = 0;
 
-    unsigned long long flops() const { return dirac->Flops(); }
-
     QudaMatPCType getMatPCType() const { return dirac->getMatPCType(); }
 
     virtual int getStencilSteps() const = 0;
diff --git a/lib/dirac.cpp b/lib/dirac.cpp
index e7be7cdc6d..35411c1841 100644
--- a/lib/dirac.cpp
+++ b/lib/dirac.cpp
@@ -14,7 +14,6 @@ namespace quda {
     laplace3D(param.laplace3D),
     matpcType(param.matpcType),
     dagger(param.dagger),
-    flops(0),
     type(param.type),
     halo_precision(param.halo_precision),
     use_mobius_fused_kernel(param.use_mobius_fused_kernel),
@@ -29,7 +28,6 @@ namespace quda {
     laplace3D(dirac.laplace3D),
     matpcType(dirac.matpcType),
     dagger(dirac.dagger),
-    flops(0),
     type(dirac.type),
     halo_precision(dirac.halo_precision),
     profile("Dirac", false)
@@ -51,7 +49,6 @@ namespace quda {
       laplace3D = dirac.laplace3D;
       matpcType = dirac.matpcType;
       dagger = dirac.dagger;
-      flops = 0;
 
       for (int i=0; i<4; i++) commDim[i] = dirac.commDim[i];
 
diff --git a/lib/dirac_clover.cpp b/lib/dirac_clover.cpp
index 6bb8e56df5..242fc23e24 100644
--- a/lib/dirac_clover.cpp
+++ b/lib/dirac_clover.cpp
@@ -38,7 +38,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyWilsonClover(out, in, *gauge, *clover, k, x, parity, dagger, commDim, profile);
-    flops += 1872ll*in.Volume();
   }
 
   // Public method to apply the clover term only
@@ -47,13 +46,11 @@ namespace quda {
     checkParitySpinor(in, out);
 
     ApplyClover(out, in, *clover, false, parity);
-    flops += 504ll*in.Volume();
   }
 
   void DiracClover::M(ColorSpinorField &out, const ColorSpinorField &in) const
   {
     ApplyWilsonClover(out, in, *gauge, *clover, -kappa, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-    flops += 1872ll * in.Volume();
   }
 
   void DiracClover::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -127,7 +124,6 @@ namespace quda {
     checkParitySpinor(in, out);
 
     ApplyClover(out, in, *clover, true, parity);
-    flops += 504ll*in.Volume();
   }
 
   // apply hopping term, then clover: (A_ee^-1 D_eo) or (A_oo^-1 D_oe),
@@ -140,7 +136,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyWilsonCloverPreconditioned(out, in, *gauge, *clover, 0.0, in, parity, dagger, commDim, profile);
-    flops += 1824ll*in.Volume();
   }
 
   // xpay version of the above
@@ -152,7 +147,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyWilsonCloverPreconditioned(out, in, *gauge, *clover, k, x, parity, dagger, commDim, profile);
-    flops += 1872ll*in.Volume();
   }
 
   // Apply the even-odd preconditioned clover-improved Dirac operator
diff --git a/lib/dirac_clover_hasenbusch_twist.cpp b/lib/dirac_clover_hasenbusch_twist.cpp
index 93600c8299..8c82d7bbd8 100644
--- a/lib/dirac_clover_hasenbusch_twist.cpp
+++ b/lib/dirac_clover_hasenbusch_twist.cpp
@@ -42,9 +42,6 @@ namespace quda
         ApplyWilsonCloverHasenbuschTwist(out.Odd(), in.Even(), *gauge, *clover, -kappa, mu, in.Odd(), QUDA_ODD_PARITY,
                                          dagger, commDim, profile);
       }
-
-      // 2 c/b applies of DiracClover + (1-imu gamma_5 A)psi_{!p}
-      flops += 2 * 1872ll * in.VolumeCB() + (48ll + 504ll) * in.VolumeCB();
     } else {
       if (matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
         ApplyWilsonClover(out.Even(), in.Odd(), *gauge, *clover, -kappa, in.Even(), QUDA_EVEN_PARITY, dagger, commDim,
@@ -57,8 +54,6 @@ namespace quda
         ApplyWilsonClover(out.Odd(), in.Even(), *gauge, *clover, -kappa, in.Odd(), QUDA_ODD_PARITY, dagger, commDim,
                           profile);
       }
-      // 2 c/b applies of DiracClover + (1-imu gamma_5)psi_{!p}
-      flops += 2 * 1872ll * in.VolumeCB() + 48ll * in.VolumeCB();
     }
   }
 
@@ -115,9 +110,6 @@ namespace quda
     checkSpinorAlias(in, out);
 
     ApplyWilsonCloverHasenbuschTwistPCClovInv(out, in, *gauge, *clover, k, b, x, parity, dagger, commDim, profile);
-
-    // DiracCloverPC.DslashXPay -/+ mu ( i gamma_5 ) A
-    flops += (1872ll + 48ll + 504ll) * in.Volume();
   }
 
   // xpay version of the above
@@ -129,9 +121,6 @@ namespace quda
     checkSpinorAlias(in, out);
 
     ApplyWilsonCloverHasenbuschTwistPCNoClovInv(out, in, *gauge, *clover, k, b, x, parity, dagger, commDim, profile);
-
-    //    DiracCloverPC.DslashXPay -/+ mu ( i gamma_5 )
-    flops += (1872ll + 48) * in.Volume();
   }
 
   // Apply the even-odd preconditioned clover-improved Dirac operator
@@ -155,7 +144,6 @@ namespace quda
 
       // applies (A + imu*g5 - kappa^2 D)-
       ApplyTwistedClover(out, tmp, *gauge, *clover, kappa2, mu, in, parity[1], dagger, commDim, profile);
-      flops += 1872ll * in.Volume();
     } else if (!dagger) { // symmetric preconditioning
       // We need two cases because M = 1-ADAD and M^\dag = 1-D^\dag A D^dag A
       // where A is actually a clover inverse.
@@ -188,7 +176,7 @@ namespace quda
   {
     // double a = - 2.0 * kappa * mu * T.Vectors().TwistFlavor();
     // CoarseOp(Y, X, T, *gauge, &clover, kappa, a, -mu_factor,QUDA_CLOVERPC_DIRAC, matpcType);
-    errorQuda("Not yet implemented\n");
+    errorQuda("Not yet implemented");
   }
 
 } // namespace quda
diff --git a/lib/dirac_coarse.cpp b/lib/dirac_coarse.cpp
index 3681257170..058e81cab5 100644
--- a/lib/dirac_coarse.cpp
+++ b/lib/dirac_coarse.cpp
@@ -374,8 +374,6 @@ namespace quda {
       ApplyCoarse(out, in, in, *Y_h, *X_h, kappa, parity, false, true, dagger, commDim, QUDA_INVALID_PRECISION,
                   dslash_use_mma);
     }
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (8 * n * n - 2 * n) * (long long)in[0].VolumeCB() * in.size();
   }
 
   void DiracCoarse::CloverInv(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
@@ -392,8 +390,6 @@ namespace quda {
       ApplyCoarse(out, in, in, *Y_h, *Xinv_h, kappa, parity, false, true, dagger, commDim, QUDA_INVALID_PRECISION,
                   dslash_use_mma);
     }
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (8 * n * n - 2 * n) * (long long)in[0].VolumeCB() * in.size();
   }
 
   void DiracCoarse::Dslash(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
@@ -409,9 +405,6 @@ namespace quda {
     } else if ( location == QUDA_CPU_FIELD_LOCATION ) {
       ApplyCoarse(out, in, in, *Y_h, *X_h, kappa, parity, true, false, dagger, commDim, halo_precision, dslash_use_mma);
     }
-
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (8 * (8 * n * n) - 2 * n) * (long long)in[0].VolumeCB() * in[0].SiteSubset() * in.size();
   }
 
   void DiracCoarse::DslashXpay(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
@@ -428,8 +421,6 @@ namespace quda {
     } else if ( location == QUDA_CPU_FIELD_LOCATION ) {
       ApplyCoarse(out, in, x, *Y_h, *X_h, kappa, parity, true, true, dagger, commDim, halo_precision, dslash_use_mma);
     }
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (9 * (8 * n * n) - 2 * n) * (long long)in[0].VolumeCB() * in[0].SiteSubset() * in.size();
   }
 
   void DiracCoarse::M(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const
@@ -445,8 +436,6 @@ namespace quda {
       ApplyCoarse(out, in, in, *Y_h, *X_h, kappa, QUDA_INVALID_PARITY, true, true, dagger, commDim, halo_precision,
                   dslash_use_mma);
     }
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (9 * (8 * n * n) - 2 * n) * (long long)in[0].VolumeCB() * in[0].SiteSubset() * in.size();
   }
 
   void DiracCoarse::MdagM(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const
@@ -536,9 +525,6 @@ namespace quda {
       ApplyCoarse(out, in, in, *Yhat_h, *X_h, kappa, parity, true, false, dagger, commDim, halo_precision,
                   dslash_use_mma);
     }
-
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (8 * (8 * n * n) - 2 * n) * in[0].VolumeCB() * in[0].SiteSubset() * in.size();
   }
 
   void DiracCoarsePC::DslashXpay(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
@@ -547,10 +533,6 @@ namespace quda {
     // FIXME emulated for now
     Dslash(out, in, parity);
     for (auto i = 0u; i < x.size(); i++) blas::xpay(x[i], k, out[i]);
-
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (8 * (8 * n * n) - 2 * n) * in[0].VolumeCB()
-      * in.size(); // blas flops counted separately so only need to count dslash flops
   }
 
   void DiracCoarsePC::M(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const
diff --git a/lib/dirac_domain_wall.cpp b/lib/dirac_domain_wall.cpp
index cc29f2927f..85b9a4f76b 100644
--- a/lib/dirac_domain_wall.cpp
+++ b/lib/dirac_domain_wall.cpp
@@ -49,11 +49,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall5D(out, in, *gauge, 0.0, mass, in, parity, dagger, commDim, profile);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls-2)*(in.Volume()/Ls);
-    long long wall = 2*in.Volume()/Ls;
-    flops += 1320LL*(long long)in.Volume() + 96LL*bulk + 120LL*wall;
   }
 
   void DiracDomainWall::DslashXpay(ColorSpinorField &out, const ColorSpinorField &in, 
@@ -65,11 +60,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall5D(out, in, *gauge, k, mass, x, parity, dagger, commDim, profile);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls-2)*(in.Volume()/Ls);
-    long long wall = 2*in.Volume()/Ls;
-    flops += (1320LL+48LL)*(long long)in.Volume() + 96LL*bulk + 120LL*wall;
   }
 
   void DiracDomainWall::M(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -77,11 +67,6 @@ namespace quda {
     checkFullSpinor(out, in);
 
     ApplyDomainWall5D(out, in, *gauge, -kappa5, mass, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += (1320LL + 48LL) * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   void DiracDomainWall::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
diff --git a/lib/dirac_domain_wall_4d.cpp b/lib/dirac_domain_wall_4d.cpp
index 7c69233f70..043101e2b5 100644
--- a/lib/dirac_domain_wall_4d.cpp
+++ b/lib/dirac_domain_wall_4d.cpp
@@ -26,7 +26,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4D(out, in, *gauge, 0.0, 0.0, nullptr, nullptr, in, parity, dagger, commDim, profile);
-    flops += 1320LL*(long long)in.Volume();
   }
 
   void DiracDomainWall4D::Dslash5(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -36,11 +35,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, in, mass, 0.0, nullptr, nullptr, 0.0, dagger, Dslash5Type::DSLASH5_DWF);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls-2)*(in.Volume()/Ls);
-    long long wall = 2*in.Volume()/Ls;
-    flops += 96LL*bulk + 120LL*wall;
   }
 
   // Modification for the 4D preconditioned domain wall operator
@@ -52,8 +46,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4D(out, in, *gauge, k, 0.0, nullptr, nullptr, x, parity, dagger, commDim, profile);
-
-    flops += (1320LL+48LL)*(long long)in.Volume();
   }
 
   void DiracDomainWall4D::Dslash5Xpay(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x,
@@ -64,11 +56,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, x, mass, 0.0, nullptr, nullptr, k, dagger, Dslash5Type::DSLASH5_DWF);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls-2)*(in.Volume()/Ls);
-    long long wall = 2*in.Volume()/Ls;
-    flops += (48LL)*(long long)in.Volume() + 96LL*bulk + 120LL*wall;
   }
 
   void DiracDomainWall4D::M(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -76,13 +63,7 @@ namespace quda {
     checkFullSpinor(out, in);
 
     ApplyDomainWall4D(out, in, *gauge, 0.0, 0.0, nullptr, nullptr, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-    flops += 1320LL * (long long)in.Volume();
     ApplyDslash5(out, in, out, mass, 0.0, nullptr, nullptr, 1.0, dagger, Dslash5Type::DSLASH5_DWF);
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += (48LL) * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
-
     blas::xpay(in, -kappa5, out);
   }
 
@@ -132,9 +113,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, in, mass, m5, nullptr, nullptr, 0.0, dagger, Dslash5Type::M5_INV_DWF);
-
-    long long Ls = in.X(4);
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
   }
 
   void DiracDomainWall4DPC::M5invXpay(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x,
@@ -145,9 +123,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, x, mass, m5, nullptr, nullptr, b, dagger, Dslash5Type::M5_INV_DWF);
-
-    long long Ls = in.X(4);
-    flops += (144LL * Ls + 48LL) * (long long)in.Volume() + 3LL * Ls * (Ls - 1LL);
   }
 
   // Apply the 4D even-odd preconditioned domain-wall Dirac operator
diff --git a/lib/dirac_improved_staggered.cpp b/lib/dirac_improved_staggered.cpp
index b6423f9a02..e700a88cce 100644
--- a/lib/dirac_improved_staggered.cpp
+++ b/lib/dirac_improved_staggered.cpp
@@ -32,7 +32,6 @@ namespace quda {
     checkParitySpinor(in, out);
 
     ApplyImprovedStaggered(out, in, *fatGauge, *longGauge, 0., in, parity, dagger, commDim, profile);
-    flops += 1146ll*in.Volume();
   }
 
   void DiracImprovedStaggered::DslashXpay(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -49,10 +48,8 @@ namespace quda {
       } else {
         ApplyImprovedStaggered(out, in, *fatGauge, *longGauge, 0., x, parity, QUDA_DAG_YES, commDim, profile);
       }
-      flops += 1146ll * in.Volume();
     } else {
       ApplyImprovedStaggered(out, in, *fatGauge, *longGauge, k, x, parity, dagger, commDim, profile);
-      flops += 1158ll * in.Volume();
     }
   }
 
@@ -69,11 +66,9 @@ namespace quda {
         ApplyImprovedStaggered(out, in, *fatGauge, *longGauge, 0., in, QUDA_INVALID_PARITY, QUDA_DAG_YES, commDim,
                                profile);
       }
-      flops += 1146ll * in.Volume();
     } else {
       ApplyImprovedStaggered(out, in, *fatGauge, *longGauge, 2. * mass, in, QUDA_INVALID_PARITY, dagger, commDim,
                              profile);
-      flops += 1158ll * in.Volume();
     }
   }
 
@@ -134,8 +129,6 @@ namespace quda {
     } else {
       ApplyStaggeredQSmear(out, in, *gauge, t0_local, is_time_slice, parity, laplace3D, dagger, comm_dim, profile);
     }
-
-    flops += ( laplace3D > 3 ? 570ll : 426ll ) * ( in.Volume() / ( is_time_slice ? in.X(3) : 1 ) );
   }  
 
   void DiracImprovedStaggered::createCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, double, double mass,
diff --git a/lib/dirac_improved_staggered_kd.cpp b/lib/dirac_improved_staggered_kd.cpp
index 39e6080cd6..fc9e9ce756 100644
--- a/lib/dirac_improved_staggered_kd.cpp
+++ b/lib/dirac_improved_staggered_kd.cpp
@@ -59,29 +59,23 @@ namespace quda
       if (mass == 0.) {
         ApplyImprovedStaggered(tmp, in, *fatGauge, *longGauge, 0., in, QUDA_INVALID_PARITY, QUDA_DAG_YES, commDim,
                                profile);
-        flops += 1146ll * in.Volume();
       } else {
         ApplyImprovedStaggered(tmp, in, *fatGauge, *longGauge, 2. * mass, in, QUDA_INVALID_PARITY, dagger, commDim,
                                profile);
-        flops += 1158ll * in.Volume();
       }
 
       ApplyStaggeredKahlerDiracInverse(out, tmp, *Xinv, false);
-      flops += (8ll * 48 - 2ll) * 48 * in.Volume() / 16; // for 2^4 block
 
     } else { // QUDA_DAG_YES
 
       ApplyStaggeredKahlerDiracInverse(tmp, in, *Xinv, true);
-      flops += (8ll * 48 - 2ll) * 48 * in.Volume() / 16; // for 2^4 block
 
       if (mass == 0.) {
         ApplyImprovedStaggered(out, tmp, *fatGauge, *longGauge, 0., tmp, QUDA_INVALID_PARITY, QUDA_DAG_NO, commDim,
                                profile);
-        flops += 1146ll * in.Volume();
       } else {
         ApplyImprovedStaggered(out, tmp, *fatGauge, *longGauge, 2. * mass, tmp, QUDA_INVALID_PARITY, dagger, commDim,
                                profile);
-        flops += 1158ll * in.Volume();
       }
     }
   }
diff --git a/lib/dirac_mobius.cpp b/lib/dirac_mobius.cpp
index d60d8060fc..9e3648e838 100644
--- a/lib/dirac_mobius.cpp
+++ b/lib/dirac_mobius.cpp
@@ -58,8 +58,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4D(out, in, *gauge, 0.0, 0.0, nullptr, nullptr, in, parity, dagger, commDim, profile);
-
-    flops += 1320LL * (long long)in.Volume();
   }
 
   void DiracMobius::Dslash4pre(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -69,11 +67,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, in, mass, m5, b_5, c_5, 0.0, dagger, Dslash5Type::DSLASH5_MOBIUS_PRE);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   // Unlike DWF-4d, the Mobius variant here applies the full M5 operator and not just D5
@@ -84,11 +77,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, in, mass, m5, b_5, c_5, 0.0, dagger, Dslash5Type::DSLASH5_MOBIUS);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 48LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   // Modification for the 4D preconditioned Mobius domain wall operator
@@ -100,8 +88,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4D(out, in, *gauge, k, m5, b_5, c_5, x, parity, dagger, commDim, profile);
-
-    flops += 1320LL * (long long)in.Volume();
   }
 
   void DiracMobius::Dslash4preXpay(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x,
@@ -112,12 +98,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, x, mass, m5, b_5, c_5, k, dagger, Dslash5Type::DSLASH5_MOBIUS_PRE);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-
-    flops += (72LL + 48LL) * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   // The xpay operator bakes in a factor of kappa_b^2
@@ -129,11 +109,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, x, mass, m5, b_5, c_5, k, dagger, Dslash5Type::DSLASH5_MOBIUS);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 96LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   void DiracMobius::M(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -156,13 +131,6 @@ namespace quda {
       ApplyDslash5(out, in, in, mass, m5, b_5, c_5, 0.0, dagger, Dslash5Type::DSLASH5_MOBIUS);
     }
     blas::axpy(-mobius_kappa_b, tmp, out);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall; // pre
-    flops += 1320LL * (long long)in.Volume();                            // dslash4
-    flops += 48LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall; // dslash5
   }
 
   void DiracMobius::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -223,9 +191,6 @@ namespace quda {
 
     ApplyDslash5(out, in, in, mass, m5, b_5, c_5, 0.0, dagger,
                  zMobius ? Dslash5Type::M5_INV_ZMOBIUS : Dslash5Type::M5_INV_MOBIUS);
-
-    long long Ls = in.X(4);
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
   }
 
   // The xpay operator bakes in a factor of kappa_b^2
@@ -238,9 +203,6 @@ namespace quda {
 
     ApplyDslash5(out, in, x, mass, m5, b_5, c_5, k, dagger,
                  zMobius ? Dslash5Type::M5_INV_ZMOBIUS : Dslash5Type::M5_INV_MOBIUS);
-
-    long long Ls = in.X(4);
-    flops += (144LL * Ls + 48LL) * (long long)in.Volume() + 3LL * Ls * (Ls - 1LL);
   }
 
   void DiracMobiusPC::Dslash4M5invM5pre(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
@@ -250,16 +212,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5invM5pre(out, in, *gauge, 0.0, m5, b_5, c_5, in, out, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5inv
-    long long Ls = in.X(4);
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
-    // M5pre
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   void DiracMobiusPC::Dslash4M5preM5inv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
@@ -269,16 +221,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5preM5inv(out, in, *gauge, 0.0, m5, b_5, c_5, in, out, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5inv
-    long long Ls = in.X(4);
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
-    // M5pre
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   void DiracMobiusPC::Dslash4M5invXpay(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -289,14 +231,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5inv(out, in, *gauge, a, m5, b_5, c_5, x, out, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5inv
-    long long Ls = in.X(4);
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
-    // xpay
-    flops += 48LL * (long long)in.Volume();
   }
 
   void DiracMobiusPC::Dslash4M5preXpay(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -307,16 +241,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5pre(out, in, *gauge, a, m5, b_5, c_5, x, out, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5pre
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
-    // xpay
-    flops += 48LL * (long long)in.Volume();
   }
 
   void DiracMobiusPC::Dslash4XpayM5mob(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -327,16 +251,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5mob(out, in, *gauge, a, m5, b_5, c_5, x, out, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5mob
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 48LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
-    // xpay
-    flops += 48LL * (long long)in.Volume();
   }
 
   void DiracMobiusPC::Dslash4M5preXpayM5mob(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -347,18 +261,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5preM5mob(out, in, *gauge, a, m5, b_5, c_5, x, out, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5pre
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
-    // M5mob
-    flops += 48LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
-    // xpay
-    flops += 48LL * (long long)in.Volume();
   }
 
   void DiracMobiusPC::Dslash4M5invXpayM5inv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -369,16 +271,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5invM5inv(out, in, *gauge, a, m5, b_5, c_5, x, y, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5inv
-    long long Ls = in.X(4);
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
-    // M5inv
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
-    // xpay
-    flops += 48LL * (long long)in.Volume();
   }
 
   // Apply the even-odd preconditioned mobius DWF operator
@@ -554,7 +446,7 @@ namespace quda {
 
   void DiracMobiusPC::MdagMLocal(ColorSpinorField &out, const ColorSpinorField &in) const
   {
-    if (zMobius) { errorQuda("DiracMobiusPC::MdagMLocal doesn't currently support zMobius.\n"); }
+    if (zMobius) errorQuda("DiracMobiusPC::MdagMLocal doesn't currently support zMobius");
 
     lat_dim_t shift0 = {0, 0, 0, 0};
     lat_dim_t shift1;
@@ -565,7 +457,7 @@ namespace quda {
       shift2[d] = comm_dim_partitioned(d) ? 2 : 0;
     }
 
-    if (extended_gauge == nullptr) { extended_gauge = createExtendedGauge(*gauge, shift2, profile, true); }
+    if (extended_gauge == nullptr) extended_gauge = createExtendedGauge(*gauge, shift2, profile, true);
 
     checkDWF(in, out);
     checkSpinorAlias(in, out);
@@ -573,67 +465,36 @@ namespace quda {
     ColorSpinorParam csParam(out);
     csParam.create = QUDA_NULL_FIELD_CREATE;
 
-    ColorSpinorField *unextended_tmp1 = ColorSpinorField::Create(csParam);
-    ColorSpinorField *unextended_tmp2 = ColorSpinorField::Create(csParam);
+    ColorSpinorField unextended_tmp1(csParam);
+    ColorSpinorField unextended_tmp2(csParam);
 
     csParam.x[0] += shift2[0]; // x direction is checkerboarded
     for (int d = 1; d < 4; ++d) { csParam.x[d] += shift2[d] * 2; }
-    ColorSpinorField *extended_tmp1 = ColorSpinorField::Create(csParam);
-    ColorSpinorField *extended_tmp2 = ColorSpinorField::Create(csParam);
+    ColorSpinorField extended_tmp1(csParam);
+    ColorSpinorField extended_tmp2(csParam);
 
     int odd_bit = (getMatPCType() == QUDA_MATPC_ODD_ODD) ? 1 : 0;
     QudaParity parity[2] = {static_cast<QudaParity>((1 + odd_bit) % 2), static_cast<QudaParity>((0 + odd_bit) % 2)};
     if (out.Precision() == QUDA_HALF_PRECISION || out.Precision() == QUDA_QUARTER_PRECISION) {
-      mobius_tensor_core::apply_fused_dslash(*unextended_tmp2, in, *extended_gauge, *unextended_tmp2, in, mass, m5, b_5,
+      mobius_tensor_core::apply_fused_dslash(unextended_tmp2, in, *extended_gauge, unextended_tmp2, in, mass, m5, b_5,
                                              c_5, dagger, parity[1], shift0.data, shift0.data,
                                              MdwfFusedDslashType::D5PRE);
 
-      mobius_tensor_core::apply_fused_dslash(*extended_tmp2, *unextended_tmp2, *extended_gauge, *extended_tmp2,
-                                             *unextended_tmp2, mass, m5, b_5, c_5, dagger, parity[0], shift1.data,
+      mobius_tensor_core::apply_fused_dslash(extended_tmp2, unextended_tmp2, *extended_gauge, extended_tmp2,
+                                             unextended_tmp2, mass, m5, b_5, c_5, dagger, parity[0], shift1.data,
                                              shift2.data, MdwfFusedDslashType::D4_D5INV_D5PRE);
 
-      mobius_tensor_core::apply_fused_dslash(*extended_tmp1, *extended_tmp2, *extended_gauge, *unextended_tmp1, in,
+      mobius_tensor_core::apply_fused_dslash(extended_tmp1, extended_tmp2, *extended_gauge, unextended_tmp1, in,
                                              mass, m5, b_5, c_5, dagger, parity[1], shift0.data, shift1.data,
                                              MdwfFusedDslashType::D4_D5INV_D5INVDAG);
 
-      mobius_tensor_core::apply_fused_dslash(*extended_tmp2, *extended_tmp1, *extended_gauge, *extended_tmp2,
-                                             *extended_tmp1, mass, m5, b_5, c_5, dagger, parity[0], shift1.data,
+      mobius_tensor_core::apply_fused_dslash(extended_tmp2, extended_tmp1, *extended_gauge, extended_tmp2,
+                                             extended_tmp1, mass, m5, b_5, c_5, dagger, parity[0], shift1.data,
                                              shift1.data, MdwfFusedDslashType::D4DAG_D5PREDAG_D5INVDAG);
 
-      mobius_tensor_core::apply_fused_dslash(out, *extended_tmp2, *extended_gauge, out, *unextended_tmp1, mass, m5, b_5,
+      mobius_tensor_core::apply_fused_dslash(out, extended_tmp2, *extended_gauge, out, unextended_tmp1, mass, m5, b_5,
                                              c_5, dagger, parity[1], shift2.data, shift2.data,
                                              MdwfFusedDslashType::D4DAG_D5PREDAG);
-
-      const long long Ls = in.X(4);
-      const long long mat = 2ll * 4ll * Ls - 1ll; // (multiplicaiton-add) * (spin) * Ls - 1
-      const long long hop = 7ll * 8ll;            // 8 for eight directions
-
-      long long vol;
-      long long halo_vol;
-
-      vol = (2 * in.X(0)) * in.X(1) * in.X(2) * in.X(3) * Ls / 2ll;
-      flops += vol * 24ll * mat;
-
-      vol = (2 * in.X(0) + 2 * 1) * (in.X(1) + 2 * 1) * (in.X(2) + 2 * 1) * (in.X(3) + 2 * 1) * Ls / 2ll;
-      halo_vol = (2 * in.X(0)) * in.X(1) * in.X(2) * in.X(3) * Ls / 2ll;
-      flops += halo_vol * 24ll * hop + vol * 24ll * mat;
-
-      vol = (2 * in.X(0) + 2 * 2) * (in.X(1) + 2 * 2) * (in.X(2) + 2 * 2) * (in.X(3) + 2 * 2) * Ls / 2ll;
-      halo_vol = (2 * in.X(0) + 2 * 1) * (in.X(1) + 2 * 1) * (in.X(2) + 2 * 1) * (in.X(3) + 2 * 1) * Ls / 2ll;
-      flops += halo_vol * 24ll * hop + vol * 24ll * mat * 2ll;
-
-      vol = (2 * in.X(0) + 2 * 1) * (in.X(1) + 2 * 1) * (in.X(2) + 2 * 1) * (in.X(3) + 2 * 1) * Ls / 2ll;
-      flops += vol * 24ll * (hop + mat);
-
-      vol = (2 * in.X(0)) * in.X(1) * in.X(2) * in.X(3) * Ls / 2ll;
-      flops += vol * 24ll * (hop + mat);
-
-      delete extended_tmp2;
-      delete extended_tmp1;
-
-      delete unextended_tmp1;
-      delete unextended_tmp2;
-
     } else {
       errorQuda("DiracMobiusPC::MdagMLocal(...) only supports half and quarter precision");
     }
@@ -710,20 +571,13 @@ namespace quda {
 
   void DiracMobiusEofa::m5_eofa(ColorSpinorField &out, const ColorSpinorField &in) const
   {
-    if (in.Ndim() != 5 || out.Ndim() != 5) errorQuda("Wrong number of dimensions\n");
+    if (in.Ndim() != 5 || out.Ndim() != 5) errorQuda("Wrong number of dimensions");
 
     checkDWF(in, out);
     checkSpinorAlias(in, out);
 
     mobius_eofa::apply_dslash5(out, in, in, mass, m5, b_5, c_5, 0., eofa_pm, m5inv_fac, mobius_kappa, eofa_u, eofa_x,
                                eofa_y, sherman_morrison_fac, dagger, Dslash5Type::M5_EOFA);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-
-    // 96 = 48 + 48, the second 48 from EOFA
-    flops += 96LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   void DiracMobiusEofa::m5_eofa_xpay(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x,
@@ -738,13 +592,6 @@ namespace quda {
     // The kernel will actually do (m5 * in - kappa_b^2 * x)
     mobius_eofa::apply_dslash5(out, in, x, mass, m5, b_5, c_5, a, eofa_pm, m5inv_fac, mobius_kappa, eofa_u, eofa_x,
                                eofa_y, sherman_morrison_fac, dagger, Dslash5Type::M5_EOFA);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-
-    // 144 = 96 + 48, the 48 from EOFA
-    flops += 144LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   void DiracMobiusEofa::M(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -768,15 +615,6 @@ namespace quda {
                                  eofa_y, sherman_morrison_fac, dagger, Dslash5Type::M5_EOFA);
     }
     blas::axpy(-mobius_kappa_b, tmp, out);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall; // pre
-    flops += 1320LL * (long long)in.Volume();                            // dslash4
-
-    // 96 = 48 + 48, the second 48 from EOFA
-    flops += 96LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall; // dslash5
   }
 
   void DiracMobiusEofa::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -816,9 +654,6 @@ namespace quda {
 
     mobius_eofa::apply_dslash5(out, in, in, mass, m5, b_5, c_5, 0., eofa_pm, m5inv_fac, mobius_kappa, eofa_u, eofa_x,
                                eofa_y, sherman_morrison_fac, dagger, Dslash5Type::M5INV_EOFA);
-
-    long long Ls = in.X(4);
-    flops += (192LL * Ls + 96LL) * (long long)in.Volume() + 3LL * Ls * (Ls - 1LL);
   }
 
   void DiracMobiusEofaPC::m5inv_eofa_xpay(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x,
@@ -834,9 +669,6 @@ namespace quda {
     // The kernel will actually do (x - kappa_b^2 * m5inv * in)
     mobius_eofa::apply_dslash5(out, in, x, mass, m5, b_5, c_5, a, eofa_pm, m5inv_fac, mobius_kappa, eofa_u, eofa_x,
                                eofa_y, sherman_morrison_fac, dagger, Dslash5Type::M5INV_EOFA);
-
-    long long Ls = in.X(4);
-    flops += (192LL * Ls + 48LL + 96LL) * (long long)in.Volume() + 3LL * Ls * (Ls - 1LL);
   }
 
   // Apply the even-odd preconditioned mobius DWF EOFA operator
diff --git a/lib/dirac_staggered.cpp b/lib/dirac_staggered.cpp
index eb04c249b1..fcb7641a3f 100644
--- a/lib/dirac_staggered.cpp
+++ b/lib/dirac_staggered.cpp
@@ -25,7 +25,6 @@ namespace quda {
     checkParitySpinor(in, out);
 
     ApplyStaggered(out, in, *gauge, 0., in, parity, dagger, commDim, profile);
-    flops += 570ll*in.Volume();
   }
 
   void DiracStaggered::DslashXpay(ColorSpinorField &out, const ColorSpinorField &in, 
@@ -43,10 +42,8 @@ namespace quda {
       } else {
         ApplyStaggered(out, in, *gauge, 0., x, parity, QUDA_DAG_YES, commDim, profile);
       }
-      flops += 570ll * in.Volume();
     } else {
       ApplyStaggered(out, in, *gauge, k, x, parity, dagger, commDim, profile);
-      flops += 582ll * in.Volume();
     }
   }
 
@@ -66,10 +63,8 @@ namespace quda {
       } else {
         ApplyStaggered(out, in, *gauge, 0., in, QUDA_INVALID_PARITY, QUDA_DAG_YES, commDim, profile);
       }
-      flops += 570ll * in.Volume();
     } else {
       ApplyStaggered(out, in, *gauge, 2. * mass, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-      flops += 582ll * in.Volume();
     }
   }
 
@@ -142,7 +137,6 @@ namespace quda {
     } else {
       ApplyStaggeredQSmear(out, in, *gauge, t0_local, is_time_slice, parity, laplace3D, dagger, comm_dim, profile);
     }
-    flops += ( laplace3D > 3 ? 570ll : 426ll ) * ( in.Volume() / ( is_time_slice ? in.X(3) : 1 ) );
   }  
   
 
diff --git a/lib/dirac_staggered_kd.cpp b/lib/dirac_staggered_kd.cpp
index db339402da..ffb1bda9e3 100644
--- a/lib/dirac_staggered_kd.cpp
+++ b/lib/dirac_staggered_kd.cpp
@@ -58,25 +58,20 @@ namespace quda
 
       if (mass == 0.) {
         ApplyStaggered(tmp, in, *gauge, 0., in, QUDA_INVALID_PARITY, QUDA_DAG_YES, commDim, profile);
-        flops += 570ll * in.Volume();
       } else {
         ApplyStaggered(tmp, in, *gauge, 2. * mass, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-        flops += 582ll * in.Volume();
       }
+
       ApplyStaggeredKahlerDiracInverse(out, tmp, *Xinv, false);
-      flops += (8ll * 48 - 2ll) * 48 * in.Volume() / 16; // for 2^4 block
 
     } else { // QUDA_DAG_YES
 
       ApplyStaggeredKahlerDiracInverse(tmp, in, *Xinv, true);
-      flops += (8ll * 48 - 2ll) * 48 * in.Volume() / 16; // for 2^4 block
 
       if (mass == 0.) {
         ApplyStaggered(out, tmp, *gauge, 0., tmp, QUDA_INVALID_PARITY, QUDA_DAG_NO, commDim, profile);
-        flops += 570ll * in.Volume();
       } else {
         ApplyStaggered(out, tmp, *gauge, 2. * mass, tmp, QUDA_INVALID_PARITY, dagger, commDim, profile);
-        flops += 582ll * in.Volume();
       }
     }
   }
diff --git a/lib/dirac_twisted_clover.cpp b/lib/dirac_twisted_clover.cpp
index 25d91776a7..e6806e5d50 100644
--- a/lib/dirac_twisted_clover.cpp
+++ b/lib/dirac_twisted_clover.cpp
@@ -50,11 +50,6 @@ namespace quda {
   {
     checkParitySpinor(out, in);
     ApplyTwistClover(out, in, *clover, kappa, mu, epsilon, parity, dagger, twistType);
-
-    if (twistType == QUDA_TWIST_GAMMA5_INVERSE)
-      flops += (504ll + 504ll + 48ll) * in.Volume();
-    else
-      flops += (504ll + 48ll) * in.Volume();
   }
 
 
@@ -79,15 +74,10 @@ namespace quda {
       // tm_rho is a Hasenbusch mass preconditioning parameter applied just like a twisted mass
       // but *not* the inverse of M_ee or M_oo
       ApplyTwistedClover(out, in, *gauge, *clover, k, 2 * (mu + tm_rho) * kappa, x, parity, dagger, commDim, profile);
-      // wilson + chiral twist + clover
-      flops += (1320ll + 48ll + 504ll) * in.Volume();
-
     } else {
       // k * D * in + (A + i*2*mu*kappa*gamma_5 * tau_3 - 2 * kappa * epsilon * tau_1 ) * x
       ApplyNdegTwistedClover(out, in, *gauge, *clover, k, 2 * mu * kappa, -2 * kappa * epsilon, x, parity, dagger,
                              commDim, profile);
-      // wilson + chiral twist + flavour twist + clover
-      flops += (1320ll + 48ll + 48ll + 504ll) * in.Volume();
     }
   }
 
@@ -106,14 +96,10 @@ namespace quda {
       // (-kappa * D + A + i*2*mu*kappa*gamma_5 ) * in
       ApplyTwistedClover(out, in, *gauge, *clover, -kappa, 2.0 * kappa * mu, in, QUDA_INVALID_PARITY, dagger, commDim,
                          profile);
-      // wilson + chiral twist + clover
-      flops += (1320ll + 48ll + 504ll) * in.Volume();
     } else {
       // (-kappa * D + A + i*2*mu*kappa*gamma_5*tau_3 - 2*epsilon*kappa*tau_1) * in
       ApplyNdegTwistedClover(out, in, *gauge, *clover, -kappa, 2 * kappa * mu, -2 * kappa * epsilon, in,
                              QUDA_INVALID_PARITY, dagger, commDim, profile);
-      // wilson + chiral twist + flavor twist + clover
-      flops += (1320ll + 48ll + 48ll + 504ll) * in.Volume();
     }
   }
 
@@ -231,11 +217,9 @@ namespace quda {
       if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
         ApplyTwistedCloverPreconditioned(out, in, *gauge, *clover, 1.0, -2.0 * kappa * mu, false, in, parity, dagger,
                                          commDim, profile);
-        flops += (1320ll + 48ll + 504ll) * in.Volume();
       } else {
         ApplyNdegTwistedCloverPreconditioned(out, in, *gauge, *clover, 1.0, -2.0 * kappa * mu, 2.0 * kappa * epsilon,
                                              false, in, parity, dagger, commDim, profile);
-        flops += (1320ll + 48ll + 48ll + 504ll) * in.Volume();
       }
     }
   }
@@ -260,11 +244,9 @@ namespace quda {
       if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
         ApplyTwistedCloverPreconditioned(out, in, *gauge, *clover, k, -2.0 * kappa * mu, true, x, parity, dagger,
                                          commDim, profile);
-        flops += (1320ll + 48ll + 504ll) * in.Volume();
       } else {
         ApplyNdegTwistedCloverPreconditioned(out, in, *gauge, *clover, k, -2.0 * kappa * mu, 2.0 * kappa * epsilon,
                                              true, x, parity, dagger, commDim, profile);
-        flops += (1320ll + 48ll + 48ll + 504ll) * in.Volume();
       }
     }
   }
diff --git a/lib/dirac_twisted_mass.cpp b/lib/dirac_twisted_mass.cpp
index 0c2911c851..964c1191cc 100644
--- a/lib/dirac_twisted_mass.cpp
+++ b/lib/dirac_twisted_mass.cpp
@@ -36,7 +36,6 @@ namespace quda {
   {
     checkParitySpinor(out, in);
     ApplyTwistGamma(out, in, 4, kappa, mu, epsilon, dagger, twistType);
-    flops += 24ll*in.Volume();
   }
 
   // Public method to apply the twist
@@ -51,12 +50,10 @@ namespace quda {
     if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
       // this would really just be a Wilson dslash (not actually instantiated at present)
       ApplyTwistedMass(out, in, *gauge, 0.0, 2 * mu * kappa, in, parity, dagger, commDim, profile);
-      flops += 1392ll * in.Volume();
     } else {
       // this would really just be a 2-way vectorized Wilson dslash (not actually instantiated at present)
       ApplyNdegTwistedMass(
           out, in, *gauge, 0.0, 2 * mu * kappa, -2 * kappa * epsilon, in, parity, dagger, commDim, profile);
-      flops += (1440ll) * in.Volume();
     }
   }
 
@@ -67,11 +64,9 @@ namespace quda {
     if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
       // k * D * in + (1 + i*2*mu*kappa*gamma_5) *x
       ApplyTwistedMass(out, in, *gauge, k, 2 * mu * kappa, x, parity, dagger, commDim, profile);
-      flops += 1416ll * in.Volume();
     } else {
       // k * D * in + (1 + i*2*mu*kappa*gamma_5*tau_3 - 2*epsilon*kappa*tau_1) * x
       ApplyNdegTwistedMass(out, in, *gauge, k, 2 * mu * kappa, -2 * kappa * epsilon, x, parity, dagger, commDim, profile);
-      flops += (1464ll) * in.Volume();
     }
   }
 
@@ -88,11 +83,9 @@ namespace quda {
 
     if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
       ApplyTwistedMass(out, in, *gauge, -kappa, 2 * mu * kappa, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-      flops += 1416ll * in.Volume();
     } else {
       ApplyNdegTwistedMass(out, in, *gauge, -kappa, 2 * mu * kappa, -2 * kappa * epsilon, in, QUDA_INVALID_PARITY,
           dagger, commDim, profile);
-      flops += (1464ll) * in.Volume();
     }
   }
 
@@ -174,7 +167,6 @@ namespace quda {
       bool asymmetric
           = (matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) && dagger;
       ApplyTwistedMassPreconditioned(out, in, *gauge, b, a, false, in, parity, dagger, asymmetric, commDim, profile);
-      flops += 1392ll * in.Volume(); // flops numbers are approximate since they will vary depending on the dagger or not
     } else {//TWIST doublet :
       double a = 2.0 * kappa * mu;
       double b = 2.0 * kappa * epsilon;
@@ -184,7 +176,6 @@ namespace quda {
           = (matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) && dagger;
       ApplyNdegTwistedMassPreconditioned(out, in, *gauge, c, -2.0 * mu * kappa, 2.0 * kappa * epsilon, false, in,
           parity, dagger, asymmetric, commDim, profile);
-      flops += (1440ll) * in.Volume(); // flops are approx. since they will vary depending on the dagger or not
     }
   }
 
@@ -206,7 +197,6 @@ namespace quda {
       bool asymmetric
           = (matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) && dagger;
       ApplyTwistedMassPreconditioned(out, in, *gauge, b, a, true, x, parity, dagger, asymmetric, commDim, profile);
-      flops += 1416ll * in.Volume(); // flops numbers are approximate since they will vary depending on the dagger or not
     } else {//TWIST_DOUBLET:
       double a = 2.0 * kappa * mu;
       double b = 2.0 * kappa * epsilon;
@@ -216,8 +206,6 @@ namespace quda {
           = (matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) && dagger;
       ApplyNdegTwistedMassPreconditioned(out, in, *gauge, k * c, -2 * mu * kappa, 2 * kappa * epsilon, true, x, parity,
           dagger, asymmetric, commDim, profile);
-      flops += (1464ll)
-          * in.Volume(); // flops numbers are approximate since they will vary depending on the dagger or not
     }
   }
 
diff --git a/lib/dirac_wilson.cpp b/lib/dirac_wilson.cpp
index 336d7d38be..a73fbe7080 100644
--- a/lib/dirac_wilson.cpp
+++ b/lib/dirac_wilson.cpp
@@ -28,7 +28,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyWilson(out, in, *gauge, 0.0, in, parity, dagger, commDim, profile);
-    flops += 1320ll*in.Volume();
   }
 
   void DiracWilson::DslashXpay(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -38,7 +37,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyWilson(out, in, *gauge, k, x, parity, dagger, commDim, profile);
-    flops += 1368ll*in.Volume();
   }
 
   void DiracWilson::M(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -46,7 +44,6 @@ namespace quda {
     checkFullSpinor(out, in);
 
     ApplyWilson(out, in, *gauge, -kappa, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-    flops += 1368ll * in.Volume();
   }
 
   void DiracWilson::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
diff --git a/lib/gauge_covdev.cpp b/lib/gauge_covdev.cpp
index e3a9e751e4..0d458c14f3 100644
--- a/lib/gauge_covdev.cpp
+++ b/lib/gauge_covdev.cpp
@@ -26,7 +26,6 @@ namespace quda {
     // only switch on comms needed for mu derivative (FIXME - only communicate in the given direction)
     comm_dim[mu % 4] = comm_dim_partitioned(mu % 4);
     ApplyCovDev(out, in, *gauge, mu, parity, dagger, comm_dim, profile);
-    flops += 1320ll*in.Volume(); // FIXME
   }
 
   void GaugeCovDev::MCD(ColorSpinorField &out, const ColorSpinorField &in, const int mu) const
diff --git a/lib/gauge_laplace.cpp b/lib/gauge_laplace.cpp
index 18ab9e3802..1f7d8608e8 100644
--- a/lib/gauge_laplace.cpp
+++ b/lib/gauge_laplace.cpp
@@ -29,7 +29,6 @@ namespace quda {
       if (laplace3D == i) comm_dim[i] = 0;
     }
     ApplyLaplace(out, in, *gauge, laplace3D, 1.0, 1.0, in, parity, dagger, comm_dim, profile);
-    flops += 1320ll*in.Volume(); // FIXME
   }
 
   void GaugeLaplace::DslashXpay(ColorSpinorField &out, const ColorSpinorField &in, 
@@ -45,7 +44,6 @@ namespace quda {
       if (laplace3D == i) comm_dim[i] = 0;
     }
     ApplyLaplace(out, in, *gauge, laplace3D, k, 1.0, x, parity, dagger, comm_dim, profile);
-    flops += 1368ll*in.Volume(); // FIXME
   }
 
   void GaugeLaplace::M(ColorSpinorField &out, const ColorSpinorField &in) const
diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h
index 8f01594579..77f0fe839d 100644
--- a/tests/dslash_test_utils.h
+++ b/tests/dslash_test_utils.h
@@ -25,6 +25,7 @@
 #include <gtest/gtest.h>
 
 #include <color_spinor_field.h>
+#include <tune_quda.h>
 
 using namespace quda;
 
@@ -995,22 +996,33 @@ struct DslashTestWrapper {
       printfQuda("Tuning...\n");
       dslashCUDA(1); // warm-up run
     }
+
+    auto flops0 = quda::Tunable::flops_global();
+    auto bytes0 = quda::Tunable::bytes_global();
+
     printfQuda("Executing %d kernel loops...\n", niter);
-    if (!transfer) dirac->Flops();
     DslashTime dslash_time = dslashCUDA(niter);
     printfQuda("done.\n\n");
 
+    unsigned long long flops = (quda::Tunable::flops_global() - flops0);
+    unsigned long long bytes = (quda::Tunable::bytes_global() - bytes0);
+
     if (!test_split_grid) {
       if (!transfer) spinorOut = cudaSpinorOut;
 
       // print timing information
       printfQuda("%fus per kernel call\n", 1e6 * dslash_time.event_time / niter);
-      // FIXME No flops count for twisted-clover yet
-      unsigned long long flops = 0;
-      if (!transfer) flops = dirac->Flops();
-      printfQuda("%llu flops per kernel call, %llu flops per site\n", flops / niter,
-                 (flops / niter) / cudaSpinor.Volume());
-      printfQuda("GFLOPS = %f\n", 1.0e-9 * flops / dslash_time.event_time);
+
+      printfQuda("%llu flops per kernel call, %llu flops per site %llu bytes per site\n", flops / niter,
+                 (flops / niter) / cudaSpinor.Volume(), (bytes / niter) / cudaSpinor.Volume());
+
+      double gflops = 1.0e-9 * flops / dslash_time.event_time;
+      printfQuda("GFLOPS = %f\n", gflops);
+      ::testing::Test::RecordProperty("Gflops", std::to_string(gflops));
+
+      double gbytes = 1.0e-9 * bytes / dslash_time.event_time;
+      printfQuda("GBYTES = %f\n", gbytes);
+      ::testing::Test::RecordProperty("Gbytes", std::to_string(gbytes));
 
       size_t ghost_bytes = cudaSpinor.GhostBytes();
 
diff --git a/tests/multigrid_benchmark_test.cpp b/tests/multigrid_benchmark_test.cpp
index 3b3f1e5beb..02c66e1644 100644
--- a/tests/multigrid_benchmark_test.cpp
+++ b/tests/multigrid_benchmark_test.cpp
@@ -12,6 +12,7 @@
 // include because of nasty globals used in the tests
 #include <dslash_reference.h>
 #include <dirac_quda.h>
+#include <tune_quda.h>
 #include <gauge_tools.h>
 #include <gtest/gtest.h>
 
@@ -278,12 +279,11 @@ int main(int argc, char **argv)
     if (test_rc != 0) warningQuda("Tests failed");
   }
 
-  // now rerun with more iterations to get accurate speed measurements
-  dirac->Flops();    // reset flops counter
-  dirac_pc->Flops(); // reset flops counter
-
+  auto flops0 = quda::Tunable::flops_global();
   double secs = benchmark(test_type, niter);
-  double gflops = ((test_type < 5 ? dirac->Flops() : dirac_pc->Flops()) * 1e-9) / (secs);
+  auto flops1 = quda::Tunable::flops_global();
+
+  double gflops = (flops1 - flops0) * 1e-9 / secs;
 
   printfQuda("Ncolor = %2d, %-31s: Gflop/s = %6.1f\n", Ncolor, names[test_type], gflops);
 
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 5cae0d80c2..6c5d40dd30 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -19,6 +19,7 @@
 #include "dslash_test_helpers.h"
 #include <assert.h>
 #include <gtest/gtest.h>
+#include <tune_quda.h>
 
 using namespace quda;
 
@@ -405,20 +406,30 @@ struct StaggeredDslashTestWrapper {
     printfQuda("Tuning...\n");
     dslashCUDA(1);
 
-    // reset flop counter
-    dirac->Flops();
+    auto flops0 = quda::Tunable::flops_global();
+    auto bytes0 = quda::Tunable::bytes_global();
 
     DslashTime dslash_time = dslashCUDA(niter);
+
+    unsigned long long flops = (quda::Tunable::flops_global() - flops0);
+    unsigned long long bytes = (quda::Tunable::bytes_global() - bytes0);
+
     spinorOut = cudaSpinorOut;
 
     if (print_metrics) {
       printfQuda("%fus per kernel call\n", 1e6 * dslash_time.event_time / niter);
 
-      unsigned long long flops = dirac->Flops();
+      printfQuda("%llu flops per kernel call, %llu flops per site %llu bytes per site\n", flops / niter,
+                 (flops / niter) / cudaSpinor.Volume(), (bytes / niter) / cudaSpinor.Volume());
+
       double gflops = 1.0e-9 * flops / dslash_time.event_time;
       printfQuda("GFLOPS = %f\n", gflops);
       ::testing::Test::RecordProperty("Gflops", std::to_string(gflops));
 
+      double gbytes = 1.0e-9 * bytes / dslash_time.event_time;
+      printfQuda("GBYTES = %f\n", gbytes);
+      ::testing::Test::RecordProperty("Gbytes", std::to_string(gbytes));
+
       size_t ghost_bytes = cudaSpinor.GhostBytes();
 
       ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_GPU",

From 14b36bf7180e22e2fdf702b651ca2895c9571c1f Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 16 Oct 2023 15:28:58 -0700
Subject: [PATCH 58/60] Don't count policy flops / bytes in the global counters
 to avoid double counting

---
 lib/tune.cpp | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/lib/tune.cpp b/lib/tune.cpp
index 608a77e3c9..7a7b551be5 100644
--- a/lib/tune.cpp
+++ b/lib/tune.cpp
@@ -848,6 +848,7 @@ namespace quda
     TuneKey key = tunable.tuneKey();
     if (use_managed_memory()) strcat(key.aux, ",managed");
     last_key = key;
+    bool is_policy = strncmp(key.aux, "policy,", 7) == 0 ? true : false;
 
 #ifdef LAUNCH_TIMER
     launchTimer.TPSTOP(QUDA_PROFILE_INIT);
@@ -890,8 +891,10 @@ namespace quda
         trace_list.push_back(trace_entry);
       }
 
-      Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter
-      Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter
+      if (!is_policy) {
+        Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter
+        Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter
+      }
       return param_tuned;
     }
 
@@ -910,8 +913,10 @@ namespace quda
       logQuda(QUDA_DEBUG_VERBOSE, "Launching %s with %s at vol=%s with %s (untuned)\n", key.name, key.aux, key.volume,
               tunable.paramString(param_default).c_str());
 
-      Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter
-      Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter
+      if (!is_policy) {
+        Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter
+        Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter
+      }
       return param_default;
     } else if (!tuning) {
 
@@ -1125,8 +1130,10 @@ namespace quda
 
     param.n_calls = profile_count ? 1 : 0;
 
-    Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter
-    Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter
+    if (!is_policy) {
+      Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter
+      Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter
+    }
     return param;
   }
 

From 4e8fb5d3420d4fe1fae5631598066e5868985420 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 16 Oct 2023 15:57:45 -0700
Subject: [PATCH 59/60] Apply clang format

---
 include/blas_helper.cuh                       |   4 +-
 include/dirac_quda.h                          |   5 +-
 include/gauge_field.h                         |  34 +++---
 include/gauge_field_order.h                   |  78 ++++++------
 include/lattice_field.h                       |   4 +-
 include/quda.h                                |   7 +-
 include/quda_api.h                            |   6 +-
 include/quda_ptr.h                            |  12 +-
 include/timer.h                               |   4 +-
 include/tune_quda.h                           |   2 +-
 lib/coarse_op.cuh                             |   8 +-
 lib/color_spinor_field.cpp                    |  10 +-
 lib/copy_color_spinor_mg.in.hpp               |   4 +-
 lib/dirac_coarse.cpp                          |  13 +-
 lib/dirac_improved_staggered_kd.cpp           |   4 +-
 lib/dirac_mobius.cpp                          |   4 +-
 lib/gauge_field.cpp                           | 106 ++++++++--------
 lib/interface_quda.cpp                        | 113 +++++++++---------
 lib/lattice_field.cpp                         |  10 +-
 lib/milc_interface.cpp                        |   2 +-
 lib/quda_ptr.cpp                              |  60 ++++------
 lib/targets/cuda/quda_api.cpp                 |  16 +--
 lib/targets/hip/quda_api.cpp                  |  16 +--
 lib/timer.cpp                                 |   9 +-
 tests/dslash_test_utils.h                     | 111 +++++++++--------
 tests/gauge_path_test.cpp                     |  13 +-
 tests/heatbath_test.cpp                       |   4 +-
 tests/hisq_paths_force_test.cpp               |   6 +-
 tests/hisq_unitarize_force_test.cpp           |   2 +-
 tests/host_reference/covdev_reference.cpp     |  29 +++--
 tests/host_reference/covdev_reference.h       |   5 +-
 tests/host_reference/dslash_reference.cpp     |  11 +-
 tests/host_reference/dslash_reference.h       |   5 +-
 .../host_reference/gauge_force_reference.cpp  |  12 +-
 tests/host_reference/gauge_force_reference.h  |   8 +-
 tests/host_reference/hisq_force_reference.cpp |  31 ++---
 tests/host_reference/hisq_force_reference.h   |   3 +-
 tests/staggered_gsmear_test_utils.h           |   8 +-
 tests/staggered_invert_test.cpp               |  13 +-
 tests/utils/host_utils.h                      |   7 +-
 tests/utils/staggered_host_utils.cpp          |   3 +-
 41 files changed, 405 insertions(+), 397 deletions(-)

diff --git a/include/blas_helper.cuh b/include/blas_helper.cuh
index 80e974c0c9..1c55c5c2e3 100644
--- a/include/blas_helper.cuh
+++ b/include/blas_helper.cuh
@@ -111,9 +111,7 @@ namespace quda
       {}
 
       data_t(const ColorSpinorField &x) :
-        spinor(x.data<store_t *>()),
-        stride(x.VolumeCB()),
-        cb_offset(x.Bytes() / (2 * sizeof(store_t) * N))
+        spinor(x.data<store_t *>()), stride(x.VolumeCB()), cb_offset(x.Bytes() / (2 * sizeof(store_t) * N))
       {}
     };
 
diff --git a/include/dirac_quda.h b/include/dirac_quda.h
index 6ce01ea578..1aa339139f 100644
--- a/include/dirac_quda.h
+++ b/include/dirac_quda.h
@@ -465,10 +465,7 @@ namespace quda {
      *  @param long_gauge_in Updated long links
      *  @param clover_in Updated clover field
      */
-    virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *)
-    {
-      gauge = gauge_in;
-    }
+    virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *) { gauge = gauge_in; }
 
     /**
      * @brief Create the coarse operator (virtual parent)
diff --git a/include/gauge_field.h b/include/gauge_field.h
index c85a8bed06..b1757087cb 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -96,8 +96,10 @@ namespace quda {
       link_type(link_type_ != QUDA_INVALID_LINKS ? link_type_ : param.type),
       t_boundary(link_type == QUDA_ASQTAD_MOM_LINKS ? QUDA_PERIODIC_T : param.t_boundary),
       // if we have momentum field and not using TIFR field, then we always have recon-10
-      reconstruct(link_type == QUDA_ASQTAD_MOM_LINKS && order != QUDA_TIFR_GAUGE_ORDER && order != QUDA_TIFR_PADDED_GAUGE_ORDER ?
-                  QUDA_RECONSTRUCT_10 : QUDA_RECONSTRUCT_NO),
+      reconstruct(link_type == QUDA_ASQTAD_MOM_LINKS && order != QUDA_TIFR_GAUGE_ORDER
+                      && order != QUDA_TIFR_PADDED_GAUGE_ORDER ?
+                    QUDA_RECONSTRUCT_10 :
+                    QUDA_RECONSTRUCT_NO),
       anisotropy(param.anisotropy),
       tadpole(param.tadpole_coeff),
       gauge(h_gauge),
@@ -144,11 +146,11 @@ namespace quda {
   };
 
   std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param);
-  std::ostream& operator<<(std::ostream& output, const GaugeField& param);
+  std::ostream &operator<<(std::ostream &output, const GaugeField &param);
 
   class GaugeField : public LatticeField {
 
-    friend std::ostream& operator<<(std::ostream& output, const GaugeField& param);
+    friend std::ostream &operator<<(std::ostream &output, const GaugeField &param);
 
   private:
     /**
@@ -439,10 +441,8 @@ namespace quda {
     {
       switch (order) {
       case QUDA_QDP_GAUGE_ORDER:
-      case QUDA_QDPJIT_GAUGE_ORDER:
-        return true;
-      default:
-        return false;
+      case QUDA_QDPJIT_GAUGE_ORDER: return true;
+      default: return false;
       }
     }
 
@@ -451,11 +451,10 @@ namespace quda {
        @tparam T Optional type to cast the pointer to (default is void*).
        @return Base pointer to the gauge field allocation
      */
-    template <typename T = void*>
+    template <typename T = void *>
     std::enable_if_t<std::is_pointer_v<T> && !std::is_pointer_v<typename std::remove_pointer<T>::type>, T> data() const
     {
-      if (is_pointer_array(order))
-        errorQuda("Non dim-array ordered field requested but order is %d", order);
+      if (is_pointer_array(order)) errorQuda("Non dim-array ordered field requested but order is %d", order);
       return reinterpret_cast<T>(gauge.data());
     }
 
@@ -468,9 +467,10 @@ namespace quda {
        @param[in] d Dimension index when the allocation is an array type
        @return Base pointer to the gauge field allocation
      */
-    template <typename T = void*> auto data(unsigned int d) const
+    template <typename T = void *> auto data(unsigned int d) const
     {
-      static_assert(std::is_pointer_v<T> && !std::is_pointer_v<typename std::remove_pointer<T>::type>, "data() requires a pointer cast type");
+      static_assert(std::is_pointer_v<T> && !std::is_pointer_v<typename std::remove_pointer<T>::type>,
+                    "data() requires a pointer cast type");
       if (d >= (unsigned)geometry) errorQuda("Invalid array index %d for geometry %d field", d, geometry);
       if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order);
       return reinterpret_cast<T>(gauge_array[d].data());
@@ -483,8 +483,9 @@ namespace quda {
        or QDPJIT.
        @return Array of pointers to the gauge field allocations
      */
-    template <typename T = void*>
-    std::enable_if_t<std::is_pointer_v<T> && !std::is_pointer_v<typename std::remove_pointer<T>::type>, array<T, QUDA_MAX_DIM>> data_array() const
+    template <typename T = void *>
+    std::enable_if_t<std::is_pointer_v<T> && !std::is_pointer_v<typename std::remove_pointer<T>::type>, array<T, QUDA_MAX_DIM>>
+    data_array() const
     {
       if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order);
       array<T, QUDA_MAX_DIM> u = {};
@@ -494,7 +495,8 @@ namespace quda {
 
     virtual int full_dim(int d) const { return x[d]; }
 
-    auto& Ghost() const {
+    auto &Ghost() const
+    {
       if ( isNative() ) errorQuda("No ghost zone pointer for quda-native gauge fields");
       return ghost;
     }
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 3a3aabd965..2ce7197a83 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -445,19 +445,21 @@ namespace quda {
       GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr)
       {
         for (int d=0; d<4; d++) {
-          ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
+          ghost[d] = ghost_ ? static_cast<complex<storeFloat> *>(ghost_[d]) :
             U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ?
-            static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d].data())) : nullptr;
-          ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
+                              static_cast<complex<storeFloat> *>(const_cast<void *>(U.Ghost()[d].data())) :
+                              nullptr;
+          ghostOffset[d] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor();
 
-          ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
-            ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
+          ghost[d + 4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
+            ghost_                                              ? static_cast<complex<storeFloat> *>(ghost_[d + 4]) :
             U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ?
-            static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4].data())) : nullptr;
-          ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
+                     static_cast<complex<storeFloat> *>(const_cast<void *>(U.Ghost()[d + 4].data())) :
+                     nullptr;
+          ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor();
         }
 
-	resetScale(U.Scale());
+        resetScale(U.Scale());
       }
 
       void resetScale(Float max)
@@ -567,18 +569,21 @@ namespace quda {
       GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr)
       {
         for (int d=0; d<4; d++) {
-          ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
-            U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d].data())) : nullptr;
-          ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
+          ghost[d] = ghost_ ? static_cast<complex<storeFloat> *>(ghost_[d]) :
+            U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ?
+                              static_cast<complex<storeFloat> *>(const_cast<void *>(U.Ghost()[d].data())) :
+                              nullptr;
+          ghostOffset[d] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor();
 
-          ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
-            ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
+          ghost[d + 4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
+            ghost_                                              ? static_cast<complex<storeFloat> *>(ghost_[d + 4]) :
             U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ?
-            static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4].data())) : nullptr;
-          ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
+                     static_cast<complex<storeFloat> *>(const_cast<void *>(U.Ghost()[d + 4].data())) :
+                     nullptr;
+          ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor();
         }
 
-	resetScale(U.Scale());
+        resetScale(U.Scale());
       }
 
       void resetScale(Float max)
@@ -704,9 +709,11 @@ namespace quda {
         if constexpr (!native_ghost) assert(ghost_ != nullptr);
         for (int d = 0; d < 4; d++) {
           ghost[d] = !native_ghost ? static_cast<complex<storeFloat>*>(ghost_[d]) : nullptr;
-          ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d);
-          ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast<complex<storeFloat>*>(ghost_[d+4]) : nullptr;
-          ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d);
+          ghostVolumeCB[d] = U.Nface() * U.SurfaceCB(d);
+          ghost[d + 4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY ?
+            static_cast<complex<storeFloat> *>(ghost_[d + 4]) :
+            nullptr;
+          ghostVolumeCB[d + 4] = U.Nface() * U.SurfaceCB(d);
         }
         resetScale(U.Scale());
       }
@@ -870,12 +877,12 @@ namespace quda {
 	__device__ __host__ inline int Ncolor() const { return nColor; }
 
 	/** Returns the field volume */
-	__device__ __host__ inline auto Volume() const { return 2*volumeCB; }
+        __device__ __host__ inline auto Volume() const { return 2 * volumeCB; }
 
-	/** Returns the field volume */
-	__device__ __host__ inline auto VolumeCB() const { return volumeCB; }
+        /** Returns the checkerboard field volume (half of Volume()) */
+        __device__ __host__ inline auto VolumeCB() const { return volumeCB; }
 
-	/** Returns the field geometric dimension */
+        /** Returns the field geometric dimension */
 	__device__ __host__ inline int Ndim() const { return nDim; }
 
 	/** Returns the field geometry */
@@ -1788,8 +1795,9 @@ namespace quda {
             errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone");
 
           for (int i = 0; i < 4; i++) {
-            ghost[i] = (ghost_) ? ghost_[i] :
-              u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? (Float *)(u.Ghost()[i].data()) : nullptr;
+            ghost[i] = (ghost_)                            ? ghost_[i] :
+              u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? (Float *)(u.Ghost()[i].data()) :
+                                                             nullptr;
             faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth
           }
         }
@@ -1847,11 +1855,11 @@ namespace quda {
       using complex = complex<real>;
       Float *gauge[QUDA_MAX_DIM];
       const unsigned int volumeCB;
-    QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
-      : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
-    {
-      for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data<Float *>(i);
-    }
+      QDPOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
+        LegacyOrder<Float, length>(u, ghost_), volumeCB(u.VolumeCB())
+      {
+        for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data<Float *>(i);
+      }
 
         __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const
         {
@@ -1893,11 +1901,11 @@ namespace quda {
       using complex = complex<real>;
       Float *gauge[QUDA_MAX_DIM];
       const unsigned int volumeCB;
-    QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
-      : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
-    {
-      for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data<Float *>(i);
-    }
+      QDPJITOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
+        LegacyOrder<Float, length>(u, ghost_), volumeCB(u.VolumeCB())
+      {
+        for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data<Float *>(i);
+      }
 
         __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const
         {
diff --git a/include/lattice_field.h b/include/lattice_field.h
index a7ca3984ee..b92297eabc 100644
--- a/include/lattice_field.h
+++ b/include/lattice_field.h
@@ -150,11 +150,11 @@ namespace quda {
   };
 
   std::ostream& operator<<(std::ostream& output, const LatticeFieldParam& param);
-  std::ostream& operator<<(std::ostream& output, const LatticeField& field);
+  std::ostream &operator<<(std::ostream &output, const LatticeField &field);
 
   class LatticeField : public Object {
 
-    friend std::ostream& operator<<(std::ostream& output, const LatticeField& param);
+    friend std::ostream &operator<<(std::ostream &output, const LatticeField &param);
 
     /**
        @brief Create the field as specified by the param
diff --git a/include/quda.h b/include/quda.h
index b2ddefa72c..c8392b2054 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -62,7 +62,7 @@ extern "C" {
 
     QudaGaugeFixed gauge_fix; /**< Whether the input gauge field is in the axial gauge or not */
 
-    int ga_pad;       /**< The pad size that native GaugeFields will use (default=0) */
+    int ga_pad; /**< The pad size that native GaugeFields will use (default=0) */
 
     int site_ga_pad;  /**< Used by link fattening and the gauge and fermion forces */
 
@@ -1685,8 +1685,7 @@ extern "C" {
    */
   int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps,
                                 const unsigned int verbose_interval, const double relax_boost, const double tolerance,
-                                const unsigned int reunit_interval, const unsigned int stopWtheta,
-                                QudaGaugeParam *param);
+                                const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param);
 
   /**
    * @brief Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
@@ -1764,7 +1763,7 @@ extern "C" {
     double secs;
     /** Flops count for the smearing operations **/
     double gflops;
-    
+
   } QudaQuarkSmearParam;
 
   /**
diff --git a/include/quda_api.h b/include/quda_api.h
index becec68c8b..e1ec69bbe1 100644
--- a/include/quda_api.h
+++ b/include/quda_api.h
@@ -50,8 +50,8 @@ namespace quda
      @param[in] count Size of transfer
      @param[in] kind Type of memory copy
   */
-  void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func, const char *file,
-                   const char *line);
+  void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func,
+                   const char *file, const char *line);
 
   /**
      @brief Wrapper around cudaMemcpyAsync or driver API equivalent
@@ -246,7 +246,7 @@ namespace quda
 #define qudaMemsetAsync(ptr, value, count, stream)                                                                     \
   ::quda::qudaMemsetAsync_(ptr, value, count, stream, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
 
-#define qudaMemset2DAsync(ptr, offset, pitch, value, width, height, stream) \
+#define qudaMemset2DAsync(ptr, offset, pitch, value, width, height, stream)                                            \
   ::quda::qudaMemset2DAsync_(ptr, offset, pitch, value, width, height, stream, __func__, quda::file_name(__FILE__),    \
                              __STRINGIFY__(__LINE__))
 
diff --git a/include/quda_ptr.h b/include/quda_ptr.h
index 185d852d57..aab76f6b89 100644
--- a/include/quda_ptr.h
+++ b/include/quda_ptr.h
@@ -3,7 +3,8 @@
 #include <ostream>
 #include "malloc_quda.h"
 
-namespace quda {
+namespace quda
+{
 
   /**
      Object that stores a memory allocation with different views for
@@ -18,8 +19,9 @@ namespace quda {
      QUDA_MEMORY_MAPPED         both (pinned to host)
      QUDA_MEMORY_MANAGED        both
    */
-  class quda_ptr {
-    friend std::ostream& operator<<(std::ostream& output, const quda_ptr& ptr);
+  class quda_ptr
+  {
+    friend std::ostream &operator<<(std::ostream &output, const quda_ptr &ptr);
     QudaMemoryType type = QUDA_MEMORY_INVALID; /** Memory type of the allocation */
     size_t size = 0;                           /** Size of the allocation */
     bool pool = false;                         /** Is the allocation is pooled */
@@ -99,6 +101,6 @@ namespace quda {
     bool is_reference() const;
   };
 
-  std::ostream& operator<<(std::ostream& output, const quda_ptr& ptr);
+  std::ostream &operator<<(std::ostream &output, const quda_ptr &ptr);
 
-}
+} // namespace quda
diff --git a/include/timer.h b/include/timer.h
index 8402deb89c..0b69d5d466 100644
--- a/include/timer.h
+++ b/include/timer.h
@@ -205,7 +205,7 @@ namespace quda {
   public:
     TimeProfile() = default;
     TimeProfile(const TimeProfile &) = default;
-    TimeProfile& operator=(const TimeProfile &) = default;
+    TimeProfile &operator=(const TimeProfile &) = default;
 
     TimeProfile(std::string fname) : fname(fname), switchOff(false), use_global(true) { ; }
 
@@ -249,7 +249,7 @@ namespace quda {
      @brief Return a reference to the present profile at the top of
      the stack
    */
-  TimeProfile& getProfile();
+  TimeProfile &getProfile();
 
 } // namespace quda
 
diff --git a/include/tune_quda.h b/include/tune_quda.h
index 2750e57e9c..c6a7d0c111 100644
--- a/include/tune_quda.h
+++ b/include/tune_quda.h
@@ -22,7 +22,7 @@ namespace quda {
     dim3 grid;
     unsigned int shared_bytes = 0;
     bool set_max_shared_bytes = false; // whether to opt in to max shared bytes per thread block
-    int4 aux = {1, 1, 1, 1}; // free parameter that can be used as an arbitrary autotuning dimension outside of launch parameters
+    int4 aux = {1, 1, 1, 1}; // free parameter used as an arbitrary autotuning dimension
 
     std::string comment;
     float time = FLT_MAX;
diff --git a/lib/coarse_op.cuh b/lib/coarse_op.cuh
index 40ac842c06..564d1ed55c 100644
--- a/lib/coarse_op.cuh
+++ b/lib/coarse_op.cuh
@@ -887,8 +887,8 @@ namespace quda {
 	X_atomic.backup();
         break;
       case COMPUTE_CONVERT:
-	if (Y_atomic.data() == Y.data()) Y.backup();
-	if (X_atomic.data() == X.data()) X.backup();
+        if (Y_atomic.data() == Y.data()) Y.backup();
+        if (X_atomic.data() == X.data()) X.backup();
         break;
       case COMPUTE_RESCALE:
         Y.backup();
@@ -921,8 +921,8 @@ namespace quda {
 	X_atomic.restore();
         break;
       case COMPUTE_CONVERT:
-	if (Y_atomic.data() == Y.data()) Y.restore();
-	if (X_atomic.data() == X.data()) X.restore();
+        if (Y_atomic.data() == Y.data()) Y.restore();
+        if (X_atomic.data() == X.data()) X.restore();
         break;
       case COMPUTE_RESCALE:
         Y.restore();
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index b1e7aa6060..e949080b79 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -150,7 +150,7 @@ namespace quda
     if (param.create != QUDA_REFERENCE_FIELD_CREATE && param.create != QUDA_GHOST_FIELD_CREATE) {
       v = quda_ptr(mem_type, bytes);
       alloc = true;
-    } else  if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
+    } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
       v = quda_ptr(param.v, mem_type);
       reference = true;
     } else if (param.create == QUDA_GHOST_FIELD_CREATE) {
@@ -213,7 +213,8 @@ namespace quda
     if (bytes != bytes_raw) {
       size_t subset_bytes = bytes / siteSubset;
       size_t subset_bytes_raw = bytes_raw / siteSubset;
-      qudaMemset2DAsync(v, subset_bytes_raw, subset_bytes, 0, subset_bytes - subset_bytes_raw, siteSubset, device::get_default_stream());
+      qudaMemset2DAsync(v, subset_bytes_raw, subset_bytes, 0, subset_bytes - subset_bytes_raw, siteSubset,
+                        device::get_default_stream());
     }
   }
 
@@ -396,10 +397,7 @@ namespace quda
     ghost_precision_allocated = ghost_precision;
   } // createGhostZone
 
-  void ColorSpinorField::zero()
-  {
-    qudaMemsetAsync(v, 0, bytes, device::get_default_stream());
-  }
+  void ColorSpinorField::zero() { qudaMemsetAsync(v, 0, bytes, device::get_default_stream()); }
 
   void ColorSpinorField::copy(const ColorSpinorField &src)
   {
diff --git a/lib/copy_color_spinor_mg.in.hpp b/lib/copy_color_spinor_mg.in.hpp
index d28ffa4e80..92b96b4c1a 100644
--- a/lib/copy_color_spinor_mg.in.hpp
+++ b/lib/copy_color_spinor_mg.in.hpp
@@ -117,14 +117,14 @@ namespace quda {
       }
 
       // set for the source subset ordering
-      srcFloat *srcEven = Src ? Src : src.data<srcFloat*>();
+      srcFloat *srcEven = Src ? Src : src.data<srcFloat *>();
       srcFloat *srcOdd = (srcFloat*)((char*)srcEven + src.Bytes()/2);
       if (src.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER) {
 	std::swap<srcFloat*>(srcEven, srcOdd);
       }
 
       // set for the destination subset ordering
-      dstFloat *dstEven = Dst ? Dst : dst.data<dstFloat*>();
+      dstFloat *dstEven = Dst ? Dst : dst.data<dstFloat *>();
       dstFloat *dstOdd = (dstFloat*)((char*)dstEven + dst.Bytes()/2);
       if (dst.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER) {
 	std::swap<dstFloat*>(dstEven, dstOdd);
diff --git a/lib/dirac_coarse.cpp b/lib/dirac_coarse.cpp
index 058e81cab5..6956df7e69 100644
--- a/lib/dirac_coarse.cpp
+++ b/lib/dirac_coarse.cpp
@@ -29,8 +29,8 @@ namespace quda {
     initializeCoarse();
   }
 
-  DiracCoarse::DiracCoarse(const DiracParam &param, std::shared_ptr<GaugeField> Y_h,
-                           std::shared_ptr<GaugeField> X_h, std::shared_ptr<GaugeField> Xinv_h,
+  DiracCoarse::DiracCoarse(const DiracParam &param, std::shared_ptr<GaugeField> Y_h, std::shared_ptr<GaugeField> X_h,
+                           std::shared_ptr<GaugeField> Xinv_h,
                            std::shared_ptr<GaugeField> Yhat_h, // cpu link fields
                            std::shared_ptr<GaugeField> Y_d, std::shared_ptr<GaugeField> X_d,
                            std::shared_ptr<GaugeField> Xinv_d,
@@ -495,11 +495,10 @@ namespace quda {
     /* do nothing */
   }
 
-  DiracCoarsePC::DiracCoarsePC(const DiracParam &param, std::shared_ptr<GaugeField> Y_h,
-                               std::shared_ptr<GaugeField> X_h, std::shared_ptr<GaugeField> Xinv_h,
-                               std::shared_ptr<GaugeField> Yhat_h, std::shared_ptr<GaugeField> Y_d,
-                               std::shared_ptr<GaugeField> X_d, std::shared_ptr<GaugeField> Xinv_d,
-                               std::shared_ptr<GaugeField> Yhat_d) :
+  DiracCoarsePC::DiracCoarsePC(const DiracParam &param, std::shared_ptr<GaugeField> Y_h, std::shared_ptr<GaugeField> X_h,
+                               std::shared_ptr<GaugeField> Xinv_h, std::shared_ptr<GaugeField> Yhat_h,
+                               std::shared_ptr<GaugeField> Y_d, std::shared_ptr<GaugeField> X_d,
+                               std::shared_ptr<GaugeField> Xinv_d, std::shared_ptr<GaugeField> Yhat_d) :
     DiracCoarse(param, Y_h, X_h, Xinv_h, Yhat_h, Y_d, X_d, Xinv_d, Yhat_d)
   {
   }
diff --git a/lib/dirac_improved_staggered_kd.cpp b/lib/dirac_improved_staggered_kd.cpp
index fc9e9ce756..d7e058d3fd 100644
--- a/lib/dirac_improved_staggered_kd.cpp
+++ b/lib/dirac_improved_staggered_kd.cpp
@@ -148,8 +148,8 @@ namespace quda
     // Should we support "preparing" and "reconstructing"?
   }
 
-  void DiracImprovedStaggeredKD::updateFields(GaugeField *, GaugeField *fat_gauge_in,
-                                              GaugeField *long_gauge_in, CloverField *)
+  void DiracImprovedStaggeredKD::updateFields(GaugeField *, GaugeField *fat_gauge_in, GaugeField *long_gauge_in,
+                                              CloverField *)
   {
     Dirac::updateFields(fat_gauge_in, nullptr, nullptr, nullptr);
     fatGauge = fat_gauge_in;
diff --git a/lib/dirac_mobius.cpp b/lib/dirac_mobius.cpp
index 9e3648e838..aaf2aaf6fc 100644
--- a/lib/dirac_mobius.cpp
+++ b/lib/dirac_mobius.cpp
@@ -484,8 +484,8 @@ namespace quda {
                                              unextended_tmp2, mass, m5, b_5, c_5, dagger, parity[0], shift1.data,
                                              shift2.data, MdwfFusedDslashType::D4_D5INV_D5PRE);
 
-      mobius_tensor_core::apply_fused_dslash(extended_tmp1, extended_tmp2, *extended_gauge, unextended_tmp1, in,
-                                             mass, m5, b_5, c_5, dagger, parity[1], shift0.data, shift1.data,
+      mobius_tensor_core::apply_fused_dslash(extended_tmp1, extended_tmp2, *extended_gauge, unextended_tmp1, in, mass,
+                                             m5, b_5, c_5, dagger, parity[1], shift0.data, shift1.data,
                                              MdwfFusedDslashType::D4_D5INV_D5INVDAG);
 
       mobius_tensor_core::apply_fused_dslash(extended_tmp2, extended_tmp1, *extended_gauge, extended_tmp2,
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 6a0b4c6bb9..9b2584ba26 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -161,10 +161,11 @@ namespace quda {
     if (isNative() && ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
       bool pad_check = true;
       for (int i = 0; i < nDim; i++) {
-	// when we have coarse links we need to double the pad since we're storing forwards and backwards links
-	int minimum_pad = comm_dim_partitioned(i) ? nFace*surfaceCB[i] * (geometry == QUDA_COARSE_GEOMETRY ? 2 : 1) : 0;
-	if (pad < minimum_pad) pad_check = false;
-	if (!pad_check) errorQuda("GaugeField being constructed with insufficient padding in dim %d (%d < %d)", i, pad, minimum_pad);
+        // when we have coarse links we need to double the pad since we're storing forwards and backwards links
+        int minimum_pad = comm_dim_partitioned(i) ? nFace * surfaceCB[i] * (geometry == QUDA_COARSE_GEOMETRY ? 2 : 1) : 0;
+        if (pad < minimum_pad) pad_check = false;
+        if (!pad_check)
+          errorQuda("GaugeField being constructed with insufficient padding in dim %d (%d < %d)", i, pad, minimum_pad);
       }
     }
 
@@ -187,9 +188,9 @@ namespace quda {
         }
       }
 
-    } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER  ||
-	       order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_TIFR_GAUGE_ORDER ||
-	       order == QUDA_TIFR_PADDED_GAUGE_ORDER || order == QUDA_MILC_SITE_GAUGE_ORDER) {
+    } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER || order == QUDA_BQCD_GAUGE_ORDER
+               || order == QUDA_TIFR_GAUGE_ORDER || order == QUDA_TIFR_PADDED_GAUGE_ORDER
+               || order == QUDA_MILC_SITE_GAUGE_ORDER) {
       // does not support device
 
       if (order == QUDA_MILC_SITE_GAUGE_ORDER && param.create != QUDA_REFERENCE_FIELD_CREATE) {
@@ -210,10 +211,10 @@ namespace quda {
 
     if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
       if (!isNative()) {
-        for (int i=0; i<nDim; i++) {
+        for (int i = 0; i < nDim; i++) {
           size_t nbytes = nFace * surface[i] * nInternal * precision;
           ghost[i] = quda_ptr(mem_type, nbytes);
-          if (geometry == QUDA_COARSE_GEOMETRY) ghost[i+4] = quda_ptr(mem_type, nbytes);
+          if (geometry == QUDA_COARSE_GEOMETRY) ghost[i + 4] = quda_ptr(mem_type, nbytes);
 
           qudaMemset(ghost[i], 0, nbytes);
           if (geometry == QUDA_COARSE_GEOMETRY) qudaMemset(ghost[i + 4], 0, nbytes);
@@ -228,7 +229,8 @@ namespace quda {
 
     // exchange the boundaries if a non-trivial field
     if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD)
-      if (param.create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) {
+      if (param.create == QUDA_REFERENCE_FIELD_CREATE
+          && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) {
         exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
       }
 
@@ -312,7 +314,8 @@ namespace quda {
     size_t pitch = stride * order * precision;
     if (pad_bytes) {
       for (int parity = 0; parity < 2; parity++) {
-        qudaMemset2DAsync(gauge, parity * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad, device::get_default_stream());
+        qudaMemset2DAsync(gauge, parity * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad,
+                          device::get_default_stream());
       }
     }
   }
@@ -351,14 +354,14 @@ namespace quda {
 
     if (phase != QUDA_STAGGERED_PHASE_INVALID) staggeredPhaseType = phase;
     applyGaugePhase(*this);
-    if (ghostExchange==QUDA_GHOST_EXCHANGE_PAD) exchangeGhost();
+    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) exchangeGhost();
     staggeredPhaseApplied = true;
   }
 
   void GaugeField::removeStaggeredPhase() {
     if (!staggeredPhaseApplied) errorQuda("No staggered phases to remove");
     applyGaugePhase(*this);
-    if (ghostExchange==QUDA_GHOST_EXCHANGE_PAD) exchangeGhost();
+    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) exchangeGhost();
     staggeredPhaseApplied = false;
   }
 
@@ -871,13 +874,13 @@ namespace quda {
       comm_free(mh_send[i]);
       comm_free(mh_recv[i]);
     }
-
   }
 
   bool GaugeField::are_compatible_weak(const GaugeField &a, const GaugeField &b)
   {
-    return (a.LinkType() == b.LinkType() && a.Ncolor() == b.Ncolor() && a.Nface() == b.Nface() && a.GaugeFixed() == b.GaugeFixed()
-            && a.TBoundary() == b.TBoundary() && a.Anisotropy() == b.Anisotropy() && a.Tadpole() == b.Tadpole());
+    return (a.LinkType() == b.LinkType() && a.Ncolor() == b.Ncolor() && a.Nface() == b.Nface()
+            && a.GaugeFixed() == b.GaugeFixed() && a.TBoundary() == b.TBoundary() && a.Anisotropy() == b.Anisotropy()
+            && a.Tadpole() == b.Tadpole());
   }
 
   bool GaugeField::are_compatible(const GaugeField &a, const GaugeField &b)
@@ -905,9 +908,9 @@ namespace quda {
   void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
   {
     if (order == QUDA_QDP_GAUGE_ORDER) {
-      void **buffer = new void*[geometry];
-      for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes/geometry);
-      return ((void*)buffer);
+      void **buffer = new void *[geometry];
+      for (int d = 0; d < geometry; d++) buffer[d] = pool_device_malloc(bytes / geometry);
+      return ((void *)buffer);
     } else {
       return pool_device_malloc(bytes);
     }
@@ -916,8 +919,8 @@ namespace quda {
   void **create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
   {
     if (order > 4) {
-      void **buffer = new void*[geometry];
-      for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes[d]);
+      void **buffer = new void *[geometry];
+      for (int d = 0; d < geometry; d++) buffer[d] = pool_device_malloc(bytes[d]);
       return buffer;
     } else {
       return 0;
@@ -927,8 +930,8 @@ namespace quda {
   void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
   {
     if (order == QUDA_QDP_GAUGE_ORDER) {
-      for (int d=0; d<geometry; d++) pool_device_free(((void**)buffer)[d]);
-      delete []((void**)buffer);
+      for (int d = 0; d < geometry; d++) pool_device_free(((void **)buffer)[d]);
+      delete[]((void **)buffer);
     } else {
       pool_device_free(buffer);
     }
@@ -937,8 +940,8 @@ namespace quda {
   void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
   {
     if (order > 4) {
-      for (int d=0; d<geometry; d++) pool_device_free(buffer[d]);
-      delete []buffer;
+      for (int d = 0; d < geometry; d++) pool_device_free(buffer[d]);
+      delete[] buffer;
     }
   }
 
@@ -974,7 +977,7 @@ namespace quda {
           copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, nullptr);
           if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
         }
-      } else { // CPU location 
+      } else { // CPU location
         if (reorder_location() == QUDA_CPU_FIELD_LOCATION) {
 
           if (!src.isNative()) errorQuda("Only native order is supported");
@@ -990,9 +993,8 @@ namespace quda {
 
         } else { // else reorder on the GPU
 
-          if (order == QUDA_MILC_SITE_GAUGE_ORDER ||
-              order == QUDA_BQCD_GAUGE_ORDER      ||
-              order == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+          if (order == QUDA_MILC_SITE_GAUGE_ORDER || order == QUDA_BQCD_GAUGE_ORDER
+              || order == QUDA_TIFR_PADDED_GAUGE_ORDER) {
             // special case where we use zero-copy memory to read/write directly from application's array
             void *data_d = get_mapped_device_pointer(data());
             if (GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
@@ -1004,10 +1006,10 @@ namespace quda {
           } else {
             void *buffer = create_gauge_buffer(bytes, order, geometry);
             size_t ghost_bytes[8];
-            int dstNinternal = reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : 2*nColor*nColor;
-            for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * dstNinternal * precision;
+            int dstNinternal = reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : 2 * nColor * nColor;
+            for (int d = 0; d < geometry; d++) ghost_bytes[d] = nFace * surface[d % 4] * dstNinternal * precision;
             void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, order, geometry) : nullptr;
-            
+
             if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) {
               copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr);
               if (geometry == QUDA_COARSE_GEOMETRY)
@@ -1016,24 +1018,24 @@ namespace quda {
             } else {
               copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0);
             }
-          
+
             if (order == QUDA_QDP_GAUGE_ORDER) {
-              for (int d=0; d<geometry; d++) {
+              for (int d = 0; d < geometry; d++) {
                 qudaMemcpy(gauge_array[d].data(), ((void **)buffer)[d], bytes / geometry, qudaMemcpyDeviceToHost);
               }
             } else {
               qudaMemcpy(gauge.data(), buffer, bytes, qudaMemcpyDeviceToHost);
             }
 
-            if (order > 4 && ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
-              for (int d=0; d<geometry; d++)
+            if (order > 4 && ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
+                && nFace)
+              for (int d = 0; d < geometry; d++)
                 qudaMemcpy(Ghost()[d].data(), ghost_buffer[d], ghost_bytes[d], qudaMemcpyDeviceToHost);
 
             free_gauge_buffer(buffer, order, geometry);
             if (nFace > 0) free_ghost_buffer(ghost_buffer, order, geometry);
           } // order
         }
-
       }
 
     } else if (src.Location() == QUDA_CPU_FIELD_LOCATION) {
@@ -1060,9 +1062,8 @@ namespace quda {
           pool_pinned_free(buffer);
         } else { // else on the GPU
 
-          if (src.Order() == QUDA_MILC_SITE_GAUGE_ORDER ||
-              src.Order() == QUDA_BQCD_GAUGE_ORDER      ||
-              src.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+          if (src.Order() == QUDA_MILC_SITE_GAUGE_ORDER || src.Order() == QUDA_BQCD_GAUGE_ORDER
+              || src.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
             // special case where we use zero-copy memory to read/write directly from application's array
             void *src_d = get_mapped_device_pointer(src.data());
 
@@ -1075,12 +1076,12 @@ namespace quda {
           } else {
             void *buffer = create_gauge_buffer(src.Bytes(), src.Order(), src.Geometry());
             size_t ghost_bytes[8];
-            int srcNinternal = src.Reconstruct() != QUDA_RECONSTRUCT_NO ? src.Reconstruct() : 2*nColor*nColor;
-            for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * srcNinternal * src.Precision();
+            int srcNinternal = src.Reconstruct() != QUDA_RECONSTRUCT_NO ? src.Reconstruct() : 2 * nColor * nColor;
+            for (int d = 0; d < geometry; d++) ghost_bytes[d] = nFace * surface[d % 4] * srcNinternal * src.Precision();
             void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, src.Order(), geometry) : nullptr;
 
             if (src.Order() == QUDA_QDP_GAUGE_ORDER) {
-              for (int d=0; d<geometry; d++) {
+              for (int d = 0; d < geometry; d++) {
                 qudaMemcpy(((void **)buffer)[d], src.data(d), src.Bytes() / geometry, qudaMemcpyDefault);
               }
             } else {
@@ -1104,7 +1105,7 @@ namespace quda {
             if (nFace > 0) free_ghost_buffer(ghost_buffer, src.Order(), geometry);
           }
         } // reorder_location
-      } // this location
+      }   // this location
     } else {
       errorQuda("Invalid gauge field type");
     }
@@ -1123,14 +1124,13 @@ namespace quda {
     }
   }
 
-  std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param)
+  std::ostream &operator<<(std::ostream &output, const GaugeFieldParam &param)
   {
     output << static_cast<const LatticeFieldParam &>(param);
     output << "nColor = " << param.nColor << std::endl;
     output << "nFace = " << param.nFace << std::endl;
     output << "reconstruct = " << param.reconstruct << std::endl;
-    int nInternal = (param.reconstruct != QUDA_RECONSTRUCT_NO ?
-		     param.reconstruct : param.nColor * param.nColor * 2);
+    int nInternal = (param.reconstruct != QUDA_RECONSTRUCT_NO ? param.reconstruct : param.nColor * param.nColor * 2);
     output << "nInternal = " << nInternal << std::endl;
     output << "order = " << param.order << std::endl;
     output << "fixed = " << param.fixed << std::endl;
@@ -1146,7 +1146,7 @@ namespace quda {
     return output;  // for multiple << operators.
   }
 
-  std::ostream& operator<<(std::ostream& output, const GaugeField& field)
+  std::ostream &operator<<(std::ostream &output, const GaugeField &field)
   {
     output << static_cast<const LatticeField &>(field);
     output << "init = " << field.init << std::endl;
@@ -1177,7 +1177,7 @@ namespace quda {
     output << "i_mu = " << field.i_mu << std::endl;
     output << "site_offset = " << field.site_offset << std::endl;
     output << "size_size = " << field.site_size << std::endl;
-    return output;  // for multiple << operators.
+    return output; // for multiple << operators.
   }
 
   void GaugeField::zero()
@@ -1241,7 +1241,7 @@ namespace quda {
     return Checksum(*this, mini);
   }
 
-  GaugeField* GaugeField::Create(const GaugeFieldParam &param) { return new GaugeField(param); }
+  GaugeField *GaugeField::Create(const GaugeFieldParam &param) { return new GaugeField(param); }
 
   GaugeField GaugeField::create_alias(const GaugeFieldParam &param_)
   {
@@ -1254,11 +1254,11 @@ namespace quda {
   }
 
   // helper for creating extended gauge fields
-  GaugeField *createExtendedGauge(GaugeField &in, const lat_dim_t &R, TimeProfile &profile,
-                                  bool redundant_comms, QudaReconstructType recon)
+  GaugeField *createExtendedGauge(GaugeField &in, const lat_dim_t &R, TimeProfile &profile, bool redundant_comms,
+                                  QudaReconstructType recon)
   {
     GaugeFieldParam gParamEx(in);
-    //gParamEx.location = QUDA_CUDA_FIELD_LOCATION;
+    // gParamEx.location = QUDA_CUDA_FIELD_LOCATION;
     gParamEx.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
     gParamEx.pad = 0;
     gParamEx.nFace = 1;
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 6286bb04a9..e13d34ee32 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -71,7 +71,6 @@ static bool redundant_comms = false;
 
 #include <blas_lapack.h>
 
-
 GaugeField *gaugePrecise = nullptr;
 GaugeField *gaugeSloppy = nullptr;
 GaugeField *gaugePrecondition = nullptr;
@@ -433,9 +432,9 @@ void initQudaDevice(int dev)
   profileInit.TPSTART(QUDA_PROFILE_INIT);
 
 #ifdef GITVERSION
-  logQuda(QUDA_SUMMARIZE, "QUDA %s (git %s)\n",quda_version.c_str(),gitversion);
+  logQuda(QUDA_SUMMARIZE, "QUDA %s (git %s)\n", quda_version.c_str(), gitversion);
 #else
-  logQuda(QUDA_SUMMARIZE, "QUDA %s\n",quda_version.c_str());
+  logQuda(QUDA_SUMMARIZE, "QUDA %s\n", quda_version.c_str());
 #endif
 
 #ifdef MULTI_GPU
@@ -548,9 +547,8 @@ void freeUniqueSloppyGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, Gau
  * @param extended[in/out] Reference to the pointer of a given "extended" field.
  * @param preserve_precise[in] Whether (true) or not (false) to preserve the precise field.
  */
-void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition,
-                            GaugeField *&refinement, GaugeField *&eigensolver, GaugeField *&extended,
-                            bool preserve_precise);
+void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition, GaugeField *&refinement,
+                            GaugeField *&eigensolver, GaugeField *&extended, bool preserve_precise);
 
 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 {
@@ -691,7 +689,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
   profileGauge.TPSTOP(QUDA_PROFILE_COMPUTE);
 
   // create an extended preconditioning field
-  GaugeField* extended = nullptr;
+  GaugeField *extended = nullptr;
   if (param->overlap){
     lat_dim_t R; // domain-overlap widths in different directions
     for (int i=0; i<4; ++i) R[i] = param->overlap*commDimPartitioned(i);
@@ -1029,9 +1027,8 @@ void freeUniqueSloppyGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, Gau
   sloppy = nullptr;
 }
 
-void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition,
-                            GaugeField *&refinement, GaugeField *&eigensolver, GaugeField *&extended,
-                            bool preserve_precise)
+void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition, GaugeField *&refinement,
+                            GaugeField *&eigensolver, GaugeField *&extended, bool preserve_precise)
 {
   freeUniqueSloppyGaugeUtility(precise, sloppy, precondition, refinement, eigensolver);
 
@@ -1471,9 +1468,9 @@ namespace quda {
       memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls);
       logQuda(QUDA_DEBUG_VERBOSE, "Printing b_5 and c_5 values\n");
       for (int i = 0; i < diracParam.Ls; i++) {
-        logQuda(QUDA_DEBUG_VERBOSE, "fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n",
-                i, diracParam.b_5[i].real(),
-                diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag());
+        logQuda(QUDA_DEBUG_VERBOSE, "fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", i,
+                diracParam.b_5[i].real(), diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(),
+                diracParam.c_5[i].imag());
       }
       break;
     case QUDA_STAGGERED_DSLASH:
@@ -3109,10 +3106,8 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     }
 
     // Make the gauge param dimensions larger
-    logQuda(QUDA_DEBUG_VERBOSE,
-            "Spliting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d)\n",
-            comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3),
-            split_key[0], split_key[1], split_key[2], split_key[3]);
+    logQuda(QUDA_DEBUG_VERBOSE, "Spliting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d)\n",
+            comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3), split_key[0], split_key[1], split_key[2], split_key[3]);
 
     for (int d = 0; d < CommKey::n_dim; d++) {
       if (comm_dim(d) % split_key[d] != 0) {
@@ -3562,9 +3557,8 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
       const double refine_tol = (param->tol_offset[i] == 0.0 ? iter_tol : param->tol_offset[i]);
       // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0
       if (param->true_res_offset[i] > refine_tol || rsd_hq > tol_hq) {
-	logQuda(QUDA_SUMMARIZE,
-                "Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
-                i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
+        logQuda(QUDA_SUMMARIZE, "Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n", i,
+                param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
 
         // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)
         if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
@@ -3612,7 +3606,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
           bool orthogonal = false;
           bool apply_mat = true;
           bool hermitian = true;
-	  MinResExt mre(*m, orthogonal, apply_mat, hermitian);
+          MinResExt mre(*m, orthogonal, apply_mat, hermitian);
           mre(x[i], b, z, q);
         }
 
@@ -3691,14 +3685,14 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
 
   GaugeFieldParam gParam(*param, fatlink, QUDA_GENERAL_LINKS);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  GaugeField cpuFatLink(gParam);   // create the host fatlink
+  GaugeField cpuFatLink(gParam); // create the host fatlink
   gParam.gauge = longlink;
-  GaugeField cpuLongLink(gParam);  // create the host longlink
+  GaugeField cpuLongLink(gParam); // create the host longlink
   gParam.gauge = ulink;
   GaugeField cpuUnitarizedLink(gParam);
   gParam.link_type = param->type;
   gParam.gauge = inlink;
-  GaugeField cpuInLink(gParam);    // create the host sitelink
+  GaugeField cpuInLink(gParam); // create the host sitelink
 
   // create the device fields
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
@@ -3751,7 +3745,8 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
       const double tol = unitarizedLink.Precision() == QUDA_DOUBLE_PRECISION ? 1e-15 : 2e-6;
       if (unitarizedLink.StaggeredPhaseApplied()) unitarizedLink.removeStaggeredPhase();
       projectSU3(unitarizedLink, tol, num_failures_d);
-      if (unitarizedLink.StaggeredPhaseApplied() && param->staggered_phase_applied) unitarizedLink.applyStaggeredPhase();
+      if (unitarizedLink.StaggeredPhaseApplied() && param->staggered_phase_applied)
+        unitarizedLink.applyStaggeredPhase();
       if (*num_failures_h > 0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
     }
 
@@ -3768,14 +3763,14 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
 
   GaugeFieldParam gParam(*param, inlink, QUDA_ASQTAD_LONG_LINKS);
   gParam.gauge = twolink;
-  GaugeField cpuTwoLink(gParam);  // create the host twolink
+  GaugeField cpuTwoLink(gParam); // create the host twolink
 
   GaugeField *cudaInLinkEx = nullptr;
 
   if (inlink) {
     gParam.link_type = param->type;
     gParam.gauge     = inlink;
-    GaugeField cpuInLink(gParam);    // create the host sitelink
+    GaugeField cpuInLink(gParam); // create the host sitelink
 
     // create the device fields
     gParam.reconstruct = param->reconstruct;
@@ -3881,8 +3876,10 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
     std::exchange(*gaugePrecise, cudaSiteLink);
   }
 
-  if (qudaGaugeParam->make_resident_mom && !qudaGaugeParam->use_resident_mom) std::exchange(momResident, cudaMom);
-  else if (!qudaGaugeParam->make_resident_mom) momResident = GaugeField();
+  if (qudaGaugeParam->make_resident_mom && !qudaGaugeParam->use_resident_mom)
+    std::exchange(momResident, cudaMom);
+  else if (!qudaGaugeParam->make_resident_mom)
+    momResident = GaugeField();
 
   if (qudaGaugeParam->make_resident_gauge) {
     if (extendedGaugeResident) delete extendedGaugeResident;
@@ -3907,7 +3904,7 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
   if (qudaGaugeParam->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use");
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.create = QUDA_COPY_FIELD_CREATE;
-  gParam.field  = &cpuSiteLink;
+  gParam.field = &cpuSiteLink;
   gParam.reconstruct = qudaGaugeParam->reconstruct;
   gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
   GaugeField cudaSiteLink = qudaGaugeParam->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
@@ -4003,7 +4000,8 @@ void createCloverQuda(QudaInvertParam* invertParam)
   // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general)
   lat_dim_t R;
   for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d));
-  GaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon);
+  GaugeField *gauge = extendedGaugeResident ? extendedGaugeResident :
+                                              createExtendedGauge(*gaugePrecise, R, profileClover, false, recon);
 
   GaugeField *ex = gauge;
   if (gauge->Precision() < cloverPrecise->Precision()) {
@@ -4042,7 +4040,7 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
 
   gParam.setPrecision(gParam.Precision(), true);
   gParam.create = QUDA_ZERO_FIELD_CREATE;
-  auto* cudaGauge = new GaugeField(gParam);
+  auto *cudaGauge = new GaugeField(gParam);
 
   if (gauge) {
     cudaGauge->copy(*cpuGauge);
@@ -4054,7 +4052,7 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
 
 void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
 {
-  auto* cudaGauge = reinterpret_cast<GaugeField*>(inGauge);
+  auto *cudaGauge = reinterpret_cast<GaugeField *>(inGauge);
 
   GaugeFieldParam gParam(*param, gauge);
   gParam.geometry = cudaGauge->Geometry();
@@ -4065,7 +4063,7 @@ void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
 
 void destroyGaugeFieldQuda(void *gauge)
 {
-  auto* g = reinterpret_cast<GaugeField*>(gauge);
+  auto *g = reinterpret_cast<GaugeField *>(gauge);
   delete g;
 }
 
@@ -4082,7 +4080,8 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   GaugeField cpuMom(gParam);
 
   // create the device momentum field
-  if (gauge_param->use_resident_mom && momResident.empty()) errorQuda("Cannot use resident momentum field since none appears resident");
+  if (gauge_param->use_resident_mom && momResident.empty())
+    errorQuda("Cannot use resident momentum field since none appears resident");
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
   gParam.create = QUDA_COPY_FIELD_CREATE;
@@ -4115,7 +4114,8 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
 
   // resident gauge field is required
   if (!gauge_param->use_resident_gauge || !gaugePrecise) errorQuda("Resident gauge field is required");
-  if (!gaugePrecise->StaggeredPhaseApplied()) errorQuda("Gauge field requires the staggered phase factors to be applied");
+  if (!gaugePrecise->StaggeredPhaseApplied())
+    errorQuda("Gauge field requires the staggered phase factors to be applied");
 
   // check if staggered phase is the desired one
   if (gauge_param->staggered_phase_type != gaugePrecise->StaggeredPhase()) {
@@ -4125,7 +4125,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
 
   const int nvector = inv_param->num_offset;
   std::vector<ColorSpinorField*> X(nvector);
-  for (int i=0; i<nvector; i++) X[i] = ColorSpinorField::Create(qParam);
+  for (int i = 0; i < nvector; i++) X[i] = ColorSpinorField::Create(qParam);
 
   if (inv_param->use_resident_solution) {
     if (solutionResident.size() < (unsigned int)nvector)
@@ -4179,8 +4179,10 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   // copy the momentum field back to the host
   if (gauge_param->return_result_mom) cpuMom.copy(cudaMom);
 
-  if (gauge_param->make_resident_mom && !gauge_param->use_resident_mom) std::exchange(momResident, cudaMom);
-  else if (!gauge_param->make_resident_mom) momResident = GaugeField();
+  if (gauge_param->make_resident_mom && !gauge_param->use_resident_mom)
+    std::exchange(momResident, cudaMom);
+  else if (!gauge_param->make_resident_mom)
+    momResident = GaugeField();
 
   for (int i=0; i<nvector; i++) delete X[i];
 }
@@ -4477,8 +4479,10 @@ void computeHISQForceQuda(void* const milc_momentum,
   // Close the paths, make anti-hermitian, and store in compressed format
   if (gParam->return_result_mom) cpuMom.copy(mom);
 
-  if (gParam->make_resident_mom && !gParam->use_resident_mom) std::exchange(momResident, mom);
-  else if (!gParam->make_resident_mom) momResident = GaugeField();
+  if (gParam->make_resident_mom && !gParam->use_resident_mom)
+    std::exchange(momResident, mom);
+  else if (!gParam->make_resident_mom)
+    momResident = GaugeField();
 }
 
 void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double *coeff, double kappa2, double ck,
@@ -4640,7 +4644,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   delete dirac;
 }
 
-void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, int exact, QudaGaugeParam* param)
+void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
 {
   auto profile = pushProfile(profileGaugeUpdate);
   checkGaugeParam(param);
@@ -4687,8 +4691,10 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom,
     std::exchange(*gaugePrecise, u_out);
   }
 
-  if (param->make_resident_mom && !param->use_resident_mom) std::exchange(momResident, cudaMom);
-  else if (!param->make_resident_mom) momResident = GaugeField();
+  if (param->make_resident_mom && !param->use_resident_mom)
+    std::exchange(momResident, cudaMom);
+  else if (!param->make_resident_mom)
+    momResident = GaugeField();
 }
 
 void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
@@ -4790,8 +4796,10 @@ double momActionQuda(void* momentum, QudaGaugeParam* param)
   // perform the update
   double action = computeMomAction(cudaMom);
 
-  if (param->make_resident_mom && !param->use_resident_mom) std::exchange(momResident, cudaMom);
-  else if (!param->make_resident_mom) momResident = GaugeField();
+  if (param->make_resident_mom && !param->use_resident_mom)
+    std::exchange(momResident, cudaMom);
+  else if (!param->make_resident_mom)
+    momResident = GaugeField();
 
   return action;
 }
@@ -4947,7 +4955,6 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param,
   popVerbosity();
 }
 
-
 void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_param)
 {
   if (smear_param->n_steps == 0) return;
@@ -4976,7 +4983,7 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
     //
     gaugeSmeared = new GaugeField(gParam);
 
-    GaugeField *two_link_ext = createExtendedGauge(*gaugePrecise, R, profileGauge);//aux field
+    GaugeField *two_link_ext = createExtendedGauge(*gaugePrecise, R, profileGauge); // aux field
 
     computeTwoLink(*gaugeSmeared, *two_link_ext);
 
@@ -5046,7 +5053,7 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   // Scale up the source to prevent underflow
   profileGaussianSmear.TPSTART(QUDA_PROFILE_COMPUTE);
 
-  const double msq     = 1. / ftmp;
+  const double msq = 1. / ftmp;
   const double a       = inv_param->laplace3D * 2.0 + msq;
   const QudaParity  parity   = QUDA_INVALID_PARITY;
   for (int i = 0; i < smear_param->n_steps; i++) {
@@ -5137,9 +5144,8 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam
   gaugeObservables(in, obs_param[measurement_n]);
 
   logQuda(QUDA_SUMMARIZE, "flow t, plaquette, E_tot, E_spatial, E_temporal, Q charge\n");
-  logQuda(QUDA_SUMMARIZE, "%le %.16e %+.16e %+.16e %+.16e %+.16e\n", 0.0,
-          obs_param[0].plaquette[0], obs_param[0].energy[0],
-          obs_param[0].energy[1], obs_param[0].energy[2], obs_param[0].qcharge);
+  logQuda(QUDA_SUMMARIZE, "%le %.16e %+.16e %+.16e %+.16e %+.16e\n", 0.0, obs_param[0].plaquette[0],
+          obs_param[0].energy[0], obs_param[0].energy[1], obs_param[0].energy[2], obs_param[0].qcharge);
 
   for (unsigned int i = 0; i < smear_param->n_steps; i++) {
     // Perform W1, W2, and Vt Wilson Flow steps as defined in
@@ -5152,8 +5158,7 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam
       gaugeObservables(out, obs_param[measurement_n]);
       logQuda(QUDA_SUMMARIZE, "%le %.16e %+.16e %+.16e %+.16e %+.16e\n", smear_param->epsilon * (i + 1),
               obs_param[measurement_n].plaquette[0], obs_param[measurement_n].energy[0],
-              obs_param[measurement_n].energy[1], obs_param[measurement_n].energy[2],
-              obs_param[measurement_n].qcharge);
+              obs_param[measurement_n].energy[1], obs_param[measurement_n].energy[2], obs_param[measurement_n].qcharge);
     }
   }
 
diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp
index d195b7edb3..29528e0829 100644
--- a/lib/lattice_field.cpp
+++ b/lib/lattice_field.cpp
@@ -549,7 +549,8 @@ namespace quda {
     for (int d = 1; d < nDim; d++) vol_ss << "x" << x[d];
     vol_string = vol_ss.str();
     if (vol_string.size() >= TuneKey::volume_n)
-      errorQuda("Vol string %s (size = %lu) larger than maximum %d", vol_string.c_str(), vol_string.size(), TuneKey::volume_n);
+      errorQuda("Vol string %s (size = %lu) larger than maximum %d", vol_string.c_str(), vol_string.size(),
+                TuneKey::volume_n);
   }
 
   void LatticeField::checkField(const LatticeField &a) const {
@@ -623,7 +624,7 @@ namespace quda {
     return output;  // for multiple << operators.
   }
 
-  std::ostream& operator<<(std::ostream& output, const LatticeField &field)
+  std::ostream &operator<<(std::ostream &output, const LatticeField &field)
   {
     output << "volume = " << field.volume << std::endl;
     output << "volumeCB = " << field.volumeCB << std::endl;
@@ -645,7 +646,7 @@ namespace quda {
     output << "ghost_precision = " << field.ghost_precision_reset << std::endl;
     output << "scale = " << field.scale << std::endl;
     output << "siteSubset = " << field.siteSubset << std::endl;
-    output << "ghostExchange = " << field.ghostExchange<< std::endl;
+    output << "ghostExchange = " << field.ghostExchange << std::endl;
     output << "nDimComms = " << field.nDimComms << std::endl;
     output << "ghost_bytes = " << field.ghost_bytes_old << std::endl;
     output << "ghost_bytes_old = " << field.ghost_bytes_old << std::endl;
@@ -672,7 +673,8 @@ namespace quda {
     output << "vol_string = " << field.vol_string << std::endl;
     output << "aux_string = " << field.aux_string << std::endl;
     output << "mem_type = " << field.mem_type << std::endl;
-    for (auto i = 0u; i < field.backup_h.size(); i++) output << "backup_h[" << i << "] = " << field.backup_h[i] << std::endl;
+    for (auto i = 0u; i < field.backup_h.size(); i++)
+      output << "backup_h[" << i << "] = " << field.backup_h[i] << std::endl;
     return output;
   }
 
diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp
index 7bef9f4496..fed48a17f3 100644
--- a/lib/milc_interface.cpp
+++ b/lib/milc_interface.cpp
@@ -2610,7 +2610,7 @@ void* qudaCreateGaugeField(void* gauge, int geometry, int precision)
 void qudaSaveGaugeField(void* gauge, void* inGauge)
 {
   qudamilc_called<true>(__func__);
-  auto cudaGauge = reinterpret_cast<GaugeField*>(inGauge);
+  auto cudaGauge = reinterpret_cast<GaugeField *>(inGauge);
   QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim, cudaGauge->Precision(), QUDA_GENERAL_LINKS);
   saveGaugeFieldQuda(gauge, inGauge, &qudaGaugeParam);
   qudamilc_called<false>(__func__);
diff --git a/lib/quda_ptr.cpp b/lib/quda_ptr.cpp
index bbb8d88457..c4e6197850 100644
--- a/lib/quda_ptr.cpp
+++ b/lib/quda_ptr.cpp
@@ -3,12 +3,10 @@
 #include "util_quda.h"
 #include "timer.h"
 
-namespace quda {
+namespace quda
+{
 
-  quda_ptr::quda_ptr(QudaMemoryType type, size_t size, bool pool) :
-    type(type),
-    size(size),
-    pool(pool)
+  quda_ptr::quda_ptr(QudaMemoryType type, size_t size, bool pool) : type(type), size(size), pool(pool)
   {
     getProfile().TPSTART(QUDA_PROFILE_INIT);
     if (pool && (type != QUDA_MEMORY_DEVICE && type != QUDA_MEMORY_HOST_PINNED && type != QUDA_MEMORY_HOST))
@@ -16,18 +14,10 @@ namespace quda {
 
     if (size > 0) {
       switch (type) {
-      case QUDA_MEMORY_DEVICE:
-        device = pool ? pool_device_malloc(size) : device_malloc(size);
-        break;
-      case QUDA_MEMORY_DEVICE_PINNED:
-        device = device_pinned_malloc(size);
-        break;
-      case QUDA_MEMORY_HOST:
-        host = safe_malloc(size);
-        break;
-      case QUDA_MEMORY_HOST_PINNED:
-        host = pool ? pool_pinned_malloc(size) : pinned_malloc(size);
-        break;
+      case QUDA_MEMORY_DEVICE: device = pool ? pool_device_malloc(size) : device_malloc(size); break;
+      case QUDA_MEMORY_DEVICE_PINNED: device = device_pinned_malloc(size); break;
+      case QUDA_MEMORY_HOST: host = safe_malloc(size); break;
+      case QUDA_MEMORY_HOST_PINNED: host = pool ? pool_pinned_malloc(size) : pinned_malloc(size); break;
       case QUDA_MEMORY_MAPPED:
         host = mapped_malloc(size);
         device = get_mapped_device_pointer(host);
@@ -42,9 +32,7 @@ namespace quda {
     getProfile().TPSTOP(QUDA_PROFILE_INIT);
   }
 
-  quda_ptr::quda_ptr(void *ptr, QudaMemoryType type) :
-    type(type),
-    reference(true)
+  quda_ptr::quda_ptr(void *ptr, QudaMemoryType type) : type(type), reference(true)
   {
     getProfile().TPSTART(QUDA_PROFILE_INIT);
     switch (type) {
@@ -67,7 +55,7 @@ namespace quda {
     getProfile().TPSTOP(QUDA_PROFILE_INIT);
   }
 
-  quda_ptr& quda_ptr::operator=(quda_ptr &&other)
+  quda_ptr &quda_ptr::operator=(quda_ptr &&other)
   {
     if (&other != this) {
       if (size > 0) errorQuda("Cannot move to already initialized quda_ptr");
@@ -84,11 +72,11 @@ namespace quda {
   {
     if (size > 0) {
       switch (type) {
-      case QUDA_MEMORY_DEVICE:        pool ? pool_device_free(device) : device_free(device); break;
+      case QUDA_MEMORY_DEVICE: pool ? pool_device_free(device) : device_free(device); break;
       case QUDA_MEMORY_DEVICE_PINNED: device_pinned_free(device); break;
-      case QUDA_MEMORY_HOST:          host_free(host); break;
-      case QUDA_MEMORY_HOST_PINNED:   pool ? pool_pinned_free(host) : host_free(host); break;
-      case QUDA_MEMORY_MAPPED:        host_free(host); break;
+      case QUDA_MEMORY_HOST: host_free(host); break;
+      case QUDA_MEMORY_HOST_PINNED: pool ? pool_pinned_free(host) : host_free(host); break;
+      case QUDA_MEMORY_MAPPED: host_free(host); break;
       default: errorQuda("Unknown memory type %d", type);
       }
     }
@@ -118,8 +106,7 @@ namespace quda {
     case QUDA_MEMORY_DEVICE:
     case QUDA_MEMORY_DEVICE_PINNED:
     case QUDA_MEMORY_MAPPED:
-    case QUDA_MEMORY_MANAGED:
-      return true;
+    case QUDA_MEMORY_MANAGED: return true;
     default: return false;
     }
   }
@@ -129,8 +116,7 @@ namespace quda {
     switch (type) {
     case QUDA_MEMORY_HOST:
     case QUDA_MEMORY_HOST_PINNED:
-    case QUDA_MEMORY_MANAGED:
-      return true;
+    case QUDA_MEMORY_MANAGED: return true;
     default: return false;
     }
   }
@@ -143,13 +129,9 @@ namespace quda {
     case QUDA_MEMORY_DEVICE:
     case QUDA_MEMORY_DEVICE_PINNED:
     case QUDA_MEMORY_MAPPED:
-    case QUDA_MEMORY_MANAGED:
-      ptr = device;
-      break;
+    case QUDA_MEMORY_MANAGED: ptr = device; break;
     case QUDA_MEMORY_HOST:
-    case QUDA_MEMORY_HOST_PINNED:
-      ptr = host;
-      break;
+    case QUDA_MEMORY_HOST_PINNED: ptr = host; break;
     default: errorQuda("Unknown memory type %d", type);
     }
 
@@ -170,11 +152,11 @@ namespace quda {
 
   bool quda_ptr::is_reference() const { return reference; }
 
-  std::ostream& operator<<(std::ostream& output, const quda_ptr& ptr)
+  std::ostream &operator<<(std::ostream &output, const quda_ptr &ptr)
   {
-    output << "{type = " << ptr.type << ", size = " << ptr.size << ", pool = " << ptr.pool << ", device = " << ptr.device
-           << ", host = " << ptr.host << ", reference = " << ptr.reference << "}";
+    output << "{type = " << ptr.type << ", size = " << ptr.size << ", pool = " << ptr.pool
+           << ", device = " << ptr.device << ", host = " << ptr.host << ", reference = " << ptr.reference << "}";
     return output;
   }
 
-}
+} // namespace quda
diff --git a/lib/targets/cuda/quda_api.cpp b/lib/targets/cuda/quda_api.cpp
index 00f829b7a4..b0a4b7cc4f 100644
--- a/lib/targets/cuda/quda_api.cpp
+++ b/lib/targets/cuda/quda_api.cpp
@@ -325,11 +325,12 @@ namespace quda
     QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line);
   }
 
-  void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func, const char *file,
-                   const char *line)
+  void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func,
+                   const char *file, const char *line)
   {
     if (count == 0) return;
-    QudaMem copy(dst.data(), src.data(), count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line);
+    QudaMem copy(dst.data(), src.data(), count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func,
+                 file, line);
   }
 
   void qudaMemcpyAsync_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const qudaStream_t &stream,
@@ -396,8 +397,8 @@ namespace quda
     QudaMem copy(ptr, value, count, stream, true, func, file, line);
   }
 
-  void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream,
-                        const char *func, const char *file, const char *line)
+  void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
+                        const char *file, const char *line)
   {
     if (count == 0) return;
     if (ptr.is_device()) {
@@ -411,10 +412,11 @@ namespace quda
                           const qudaStream_t &stream, const char *func, const char *file, const char *line)
   {
     if (ptr.is_device()) {
-      cudaError_t error = cudaMemset2DAsync(static_cast<char*>(ptr.data()) + offset, pitch, value, width, height, get_stream(stream));
+      cudaError_t error
+        = cudaMemset2DAsync(static_cast<char *>(ptr.data()) + offset, pitch, value, width, height, get_stream(stream));
       set_runtime_error(error, __func__, func, file, line);
     } else {
-      for (auto i = 0u; i < height; i++) memset(static_cast<char*>(ptr.data()) + offset + i * pitch, value, width);
+      for (auto i = 0u; i < height; i++) memset(static_cast<char *>(ptr.data()) + offset + i * pitch, value, width);
     }
   }
 
diff --git a/lib/targets/hip/quda_api.cpp b/lib/targets/hip/quda_api.cpp
index 6d9345a884..8fff9a75a5 100644
--- a/lib/targets/hip/quda_api.cpp
+++ b/lib/targets/hip/quda_api.cpp
@@ -261,11 +261,12 @@ namespace quda
     QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line);
   }
 
-  void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func, const char *file,
-                   const char *line)
+  void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func,
+                   const char *file, const char *line)
   {
     if (count == 0) return;
-    QudaMem copy(dst.data(), src.data(), count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line);
+    QudaMem copy(dst.data(), src.data(), count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func,
+                 file, line);
   }
 
   void qudaMemcpyAsync_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const qudaStream_t &stream,
@@ -312,8 +313,8 @@ namespace quda
     QudaMem copy(ptr, value, count, stream, true, func, file, line);
   }
 
-  void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream,
-                        const char *func, const char *file, const char *line)
+  void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
+                        const char *file, const char *line)
   {
     if (count == 0) return;
     if (ptr.is_device()) {
@@ -327,10 +328,11 @@ namespace quda
                           const qudaStream_t &stream, const char *func, const char *file, const char *line)
   {
     if (ptr.is_device()) {
-      hipError_t error = hipMemset2DAsync(static_cast<char*>(ptr.data()) + offset, pitch, value, width, height, get_stream(stream));
+      hipError_t error
+        = hipMemset2DAsync(static_cast<char *>(ptr.data()) + offset, pitch, value, width, height, get_stream(stream));
       set_runtime_error(error, __func__, func, file, line);
     } else {
-      for (auto i = 0u; i < height; i++) memset(static_cast<char*>(ptr.data()) + offset + i * pitch, value, width);
+      for (auto i = 0u; i < height; i++) memset(static_cast<char *>(ptr.data()) + offset + i * pitch, value, width);
     }
   }
 
diff --git a/lib/timer.cpp b/lib/timer.cpp
index 125b9242d4..d77365ec34 100644
--- a/lib/timer.cpp
+++ b/lib/timer.cpp
@@ -224,13 +224,10 @@ namespace quda {
 
   TimeProfile dummy("default", false);
 
-  static std::stack<TimeProfile*> tp_stack;
+  static std::stack<TimeProfile *> tp_stack;
 
   pushProfile::pushProfile(TimeProfile &profile, double &secs, double &gflops) :
-    profile(profile),
-    secs(secs),
-    gflops(gflops),
-    flops(Tunable::flops_global())
+    profile(profile), secs(secs), gflops(gflops), flops(Tunable::flops_global())
 
   {
     profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -249,7 +246,7 @@ namespace quda {
     if (&gflops != &gflops_dummy) comm_allreduce_sum(gflops);
   }
 
-  TimeProfile& getProfile()
+  TimeProfile &getProfile()
   {
     if (tp_stack.empty()) return dummy;
     return *(tp_stack.top());
diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h
index 77f0fe839d..a06dff6147 100644
--- a/tests/dslash_test_utils.h
+++ b/tests/dslash_test_utils.h
@@ -355,7 +355,8 @@ struct DslashTestWrapper {
                   inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::Mat:
-        wil_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param);
+        wil_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec,
+                gauge_param);
         break;
       case dslash_test_type::MatPCDagMatPC:
         wil_matpc(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger,
@@ -364,16 +365,18 @@ struct DslashTestWrapper {
                   inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatDagMat:
-        wil_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-        wil_mat(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, not_dagger, inv_param.cpu_prec, gauge_param);
+        wil_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec,
+                gauge_param);
+        wil_mat(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, not_dagger, inv_param.cpu_prec,
+                gauge_param);
         break;
       default: printfQuda("Test type not defined\n"); exit(-1);
       }
     } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        clover_dslash(spinorRef.data(), hostGauge, hostCloverInv, spinor.data(), parity, inv_param.dagger, inv_param.cpu_prec,
-                      gauge_param);
+        clover_dslash(spinorRef.data(), hostGauge, hostCloverInv, spinor.data(), parity, inv_param.dagger,
+                      inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatPC:
         clover_matpc(spinorRef.data(), hostGauge, hostClover, hostCloverInv, spinor.data(), inv_param.kappa,
@@ -392,8 +395,8 @@ struct DslashTestWrapper {
       case dslash_test_type::MatDagMat:
         clover_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.dagger,
                    inv_param.cpu_prec, gauge_param);
-        clover_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, not_dagger, inv_param.cpu_prec,
-                   gauge_param);
+        clover_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, not_dagger,
+                   inv_param.cpu_prec, gauge_param);
         break;
       default: printfQuda("Test type not defined\n"); exit(-1);
       }
@@ -402,14 +405,14 @@ struct DslashTestWrapper {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
         // My dslash should be the same as the clover dslash
-        clover_dslash(spinorRef.data(), hostGauge, hostCloverInv, spinor.data(), parity, inv_param.dagger, inv_param.cpu_prec,
-                      gauge_param);
+        clover_dslash(spinorRef.data(), hostGauge, hostCloverInv, spinor.data(), parity, inv_param.dagger,
+                      inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatPC:
         // my matpc op
-        cloverHasenbuschTwist_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa,
-                                    inv_param.mu, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec,
-                                    gauge_param);
+        cloverHasenbuschTwist_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv,
+                                    inv_param.kappa, inv_param.mu, inv_param.matpc_type, inv_param.dagger,
+                                    inv_param.cpu_prec, gauge_param);
 
         break;
       case dslash_test_type::Mat:
@@ -420,20 +423,21 @@ struct DslashTestWrapper {
       case dslash_test_type::MatPCDagMatPC:
         // matpc^\dagger matpc
         // my matpc op
-        cloverHasenbuschTwist_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa,
-                                    inv_param.mu, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec,
-                                    gauge_param);
+        cloverHasenbuschTwist_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv,
+                                    inv_param.kappa, inv_param.mu, inv_param.matpc_type, inv_param.dagger,
+                                    inv_param.cpu_prec, gauge_param);
 
-        cloverHasenbuschTwist_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv, inv_param.kappa,
-                                    inv_param.mu, inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param);
+        cloverHasenbuschTwist_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv,
+                                    inv_param.kappa, inv_param.mu, inv_param.matpc_type, not_dagger, inv_param.cpu_prec,
+                                    gauge_param);
 
         break;
       case dslash_test_type::MatDagMat:
         // my mat
         cloverHasenbuchTwist_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu,
                                  inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.matpc_type);
-        cloverHasenbuchTwist_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, inv_param.mu,
-                                 not_dagger, inv_param.cpu_prec, gauge_param, inv_param.matpc_type);
+        cloverHasenbuchTwist_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa,
+                                 inv_param.mu, not_dagger, inv_param.cpu_prec, gauge_param, inv_param.matpc_type);
 
         break;
       default: printfQuda("Test type not defined\n"); exit(-1);
@@ -442,11 +446,11 @@ struct DslashTestWrapper {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tm_dslash(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, parity,
-                    inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
+          tm_dslash(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+                    parity, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         else {
-          tm_ndeg_dslash(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon, parity,
-                         inv_param.dagger, inv_param.matpc_type, inv_param.cpu_prec, gauge_param);
+          tm_ndeg_dslash(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+                         parity, inv_param.dagger, inv_param.matpc_type, inv_param.cpu_prec, gauge_param);
         }
         break;
       case dslash_test_type::MatPC:
@@ -499,9 +503,9 @@ struct DslashTestWrapper {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tmc_dslash(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
-                     inv_param.twist_flavor, parity, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec,
-                     gauge_param);
+          tmc_dslash(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa,
+                     inv_param.mu, inv_param.twist_flavor, parity, inv_param.matpc_type, inv_param.dagger,
+                     inv_param.cpu_prec, gauge_param);
         else
           tmc_ndeg_dslash(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa,
                           inv_param.mu, inv_param.epsilon, parity, inv_param.matpc_type, inv_param.dagger,
@@ -512,8 +516,9 @@ struct DslashTestWrapper {
           tmc_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
                     inv_param.twist_flavor, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         else
-          tmc_ndeg_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
-                         inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
+          tmc_ndeg_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa,
+                         inv_param.mu, inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec,
+                         gauge_param);
         break;
       case dslash_test_type::Mat:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
@@ -527,11 +532,13 @@ struct DslashTestWrapper {
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) {
           tmc_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
                     inv_param.twist_flavor, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tmc_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
-                    inv_param.twist_flavor, inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param);
+          tmc_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv, inv_param.kappa,
+                    inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, not_dagger, inv_param.cpu_prec,
+                    gauge_param);
         } else {
-          tmc_ndeg_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
-                         inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
+          tmc_ndeg_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa,
+                         inv_param.mu, inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec,
+                         gauge_param);
           tmc_ndeg_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv, inv_param.kappa,
                          inv_param.mu, inv_param.epsilon, inv_param.matpc_type, not_dagger, inv_param.cpu_prec,
                          gauge_param);
@@ -555,8 +562,8 @@ struct DslashTestWrapper {
     } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        dw_dslash(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                  inv_param.mass);
+        dw_dslash(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
+                  gauge_param, inv_param.mass);
         break;
       case dslash_test_type::MatPC:
         dw_matpc(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger,
@@ -573,8 +580,8 @@ struct DslashTestWrapper {
                  gauge_param.cpu_prec, gauge_param, inv_param.mass);
         break;
       case dslash_test_type::MatDagMat:
-        dw_matdagmat(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                     inv_param.mass);
+        dw_matdagmat(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec,
+                     gauge_param, inv_param.mass);
         break;
       default: printf("Test type not supported for domain wall\n"); exit(-1);
       }
@@ -583,24 +590,24 @@ struct DslashTestWrapper {
       for (int xs = 0; xs < Ls; xs++) kappa_5[xs] = kappa5;
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                    inv_param.mass);
+        dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
+                    gauge_param, inv_param.mass);
         break;
       case dslash_test_type::M5:
         dw_dslash_5_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
                        gauge_param, inv_param.mass, true);
         break;
       case dslash_test_type::M5inv:
-        dslash_5_inv(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                     inv_param.mass, kappa_5);
+        dslash_5_inv(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
+                     gauge_param, inv_param.mass, kappa_5);
         break;
       case dslash_test_type::MatPC:
         dw_4d_matpc(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger,
                     gauge_param.cpu_prec, gauge_param, inv_param.mass);
         break;
       case dslash_test_type::Mat:
-        dw_4d_mat(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                  inv_param.mass);
+        dw_4d_mat(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec,
+                  gauge_param, inv_param.mass);
         break;
       case dslash_test_type::MatPCDagMatPC:
         dw_4d_matpc(spinorTmp.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger,
@@ -609,8 +616,8 @@ struct DslashTestWrapper {
                     gauge_param.cpu_prec, gauge_param, inv_param.mass);
         break;
       case dslash_test_type::MatDagMat:
-        dw_4d_mat(spinorTmp.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                  inv_param.mass);
+        dw_4d_mat(spinorTmp.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec,
+                  gauge_param, inv_param.mass);
         dw_4d_mat(spinorRef.data(), hostGauge, spinorTmp.data(), kappa5, not_dagger, gauge_param.cpu_prec, gauge_param,
                   inv_param.mass);
         break;
@@ -630,12 +637,12 @@ struct DslashTestWrapper {
       }
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                    inv_param.mass);
+        dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
+                    gauge_param, inv_param.mass);
         break;
       case dslash_test_type::M5:
-        mdw_dslash_5(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                     inv_param.mass, kappa_5, true);
+        mdw_dslash_5(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
+                     gauge_param, inv_param.mass, kappa_5, true);
         break;
       case dslash_test_type::Dslash4pre:
         mdw_dslash_4_pre(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
@@ -689,8 +696,8 @@ struct DslashTestWrapper {
       }
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                    inv_param.mass);
+        dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
+                    gauge_param, inv_param.mass);
         break;
       case dslash_test_type::M5:
         mdw_eofa_m5(spinorRef.data(), spinor.data(), parity, inv_param.dagger, inv_param.mass, inv_param.m5,
@@ -730,8 +737,8 @@ struct DslashTestWrapper {
                        gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]),
                        (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm,
                        inv_param.eofa_shift);
-        mdw_eofa_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.matpc_type, not_dagger, gauge_param.cpu_prec,
-                       gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]),
+        mdw_eofa_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.matpc_type, not_dagger,
+                       gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]),
                        (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm,
                        inv_param.eofa_shift);
         break;
diff --git a/tests/gauge_path_test.cpp b/tests/gauge_path_test.cpp
index 7d37c9faad..f15bcdbda5 100644
--- a/tests/gauge_path_test.cpp
+++ b/tests/gauge_path_test.cpp
@@ -165,9 +165,9 @@ void gauge_force_test(bool compute_force = true)
     mom = Mom_milc.data();
   } else if (gauge_order == QUDA_QDP_GAUGE_ORDER) {
     for (int d = 0; d < 4; d++) sitelink_array[d] = U_qdp.data(d);
-    sitelink = reinterpret_cast<void*>(sitelink_array);
+    sitelink = reinterpret_cast<void *>(sitelink_array);
     for (int d = 0; d < 4; d++) mom_array[d] = Mom_qdp.data(d);
-    mom = reinterpret_cast<void*>(mom_array);
+    mom = reinterpret_cast<void *>(mom_array);
   } else {
     errorQuda("Unsupported gauge order %d", gauge_order);
   }
@@ -204,8 +204,7 @@ void gauge_force_test(bool compute_force = true)
   void *refmom = Mom_ref_milc.data();
   int *check_out = compute_force ? &force_check : &path_check;
   if (verify_results) {
-    gauge_force_reference(refmom, eb3, U_qdp, input_path_buf, length,
-                          loop_coeff, num_paths, compute_force);
+    gauge_force_reference(refmom, eb3, U_qdp, input_path_buf, length, loop_coeff, num_paths, compute_force);
     *check_out
       = compare_floats(Mom_milc.data(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec);
     if (compute_force) strong_check_mom(Mom_milc.data(), refmom, 4 * V, gauge_param.cpu_prec);
@@ -276,7 +275,7 @@ void gauge_loop_test()
     sitelink = U_milc.data();
   } else if (gauge_order == QUDA_QDP_GAUGE_ORDER) {
     for (int d = 0; d < 4; d++) sitelink_array[d] = U_qdp.data(d);
-    sitelink = reinterpret_cast<void*>(sitelink_array);
+    sitelink = reinterpret_cast<void *>(sitelink_array);
   } else {
     errorQuda("Unsupported gauge order %d", gauge_order);
   }
@@ -318,8 +317,8 @@ void gauge_loop_test()
   std::vector<quda::Complex> traces_ref(num_paths);
 
   if (verify_results) {
-    gauge_loop_trace_reference(U_qdp, traces_ref, scale_factor, trace_path_p,
-                               trace_loop_length_p, trace_loop_coeff_p, num_paths);
+    gauge_loop_trace_reference(U_qdp, traces_ref, scale_factor, trace_path_p, trace_loop_length_p, trace_loop_coeff_p,
+                               num_paths);
 
     loop_deviation = 0;
     for (int i = 0; i < num_paths; i++) {
diff --git a/tests/heatbath_test.cpp b/tests/heatbath_test.cpp
index 557914b772..771909ffa5 100644
--- a/tests/heatbath_test.cpp
+++ b/tests/heatbath_test.cpp
@@ -75,8 +75,8 @@ void heatbath_test(int argc, char **argv)
   loadGaugeQuda((void *)load_gauge, &gauge_param);
 
   quda::quda_ptr num_failures(QUDA_MEMORY_MAPPED, sizeof(int), false);
-  int &num_failures_h = *static_cast<int*>(num_failures.data_host());
-  int &num_failures_d = *static_cast<int*>(num_failures.data_device());
+  int &num_failures_h = *static_cast<int *>(num_failures.data_host());
+  int &num_failures_d = *static_cast<int *>(num_failures.data_device());
   num_failures_h = 0;
 
   // start the timer
diff --git a/tests/hisq_paths_force_test.cpp b/tests/hisq_paths_force_test.cpp
index 9df6d2ec4c..7560dbc105 100644
--- a/tests/hisq_paths_force_test.cpp
+++ b/tests/hisq_paths_force_test.cpp
@@ -464,8 +464,7 @@ static int hisq_force_test(bool lepage)
 
     int res = 1;
     for (int dir = 0; dir < 4; dir++) {
-      res &= compare_floats(cpuForce->data<void*>(dir),
-                            hostVerifyForce->data<void*>(dir), V * gauge_site_size,
+      res &= compare_floats(cpuForce->data<void *>(dir), hostVerifyForce->data<void *>(dir), V * gauge_site_size,
                             getTolerance(force_prec), force_prec);
     }
 
@@ -499,8 +498,7 @@ static int hisq_force_test(bool lepage)
 
       int res = 1;
       for (int dir = 0; dir < 4; dir++) {
-        res &= compare_floats(cpuForce->data(dir),
-                              hostVerifyForce->data(dir), V * gauge_site_size,
+        res &= compare_floats(cpuForce->data(dir), hostVerifyForce->data(dir), V * gauge_site_size,
                               getTolerance(force_prec), force_prec);
       }
 
diff --git a/tests/hisq_unitarize_force_test.cpp b/tests/hisq_unitarize_force_test.cpp
index 41b977b8e1..01e3c78c18 100644
--- a/tests/hisq_unitarize_force_test.cpp
+++ b/tests/hisq_unitarize_force_test.cpp
@@ -34,7 +34,7 @@ void createNoisyLinkCPU(quda::GaugeField &field, QudaPrecision prec, int seed)
   for (int dir = 0; dir < 4; ++dir) {
     for (int i = 0; i < V * 18; ++i) {
       if (prec == QUDA_DOUBLE_PRECISION) {
-        double *ptr = field.data<double*>(dir) + i;
+        double *ptr = field.data<double *>(dir) + i;
         *ptr += (rand() - RAND_MAX / 2.0) / (20.0 * RAND_MAX);
       } else if (prec == QUDA_SINGLE_PRECISION) {
         float *ptr = field.data<float *>(dir) + i;
diff --git a/tests/host_reference/covdev_reference.cpp b/tests/host_reference/covdev_reference.cpp
index 05a8fe839b..66aaf85fa8 100644
--- a/tests/host_reference/covdev_reference.cpp
+++ b/tests/host_reference/covdev_reference.cpp
@@ -178,8 +178,8 @@ void covdevReference_mg4dir(sFloat *res, gFloat **link, gFloat **ghostLink, cons
     int offset = spinor_site_size * sid;
 
     gFloat *lnk = gaugeLink_mg4dir(sid, mu, oddBit, linkEven, linkOdd, ghostLinkEven, ghostLinkOdd, 1, 1);
-    const sFloat *spinor = spinorNeighbor_mg4dir(sid, mu, oddBit, static_cast<const sFloat *>(in.data()), fwd_nbr_spinor,
-                                                 back_nbr_spinor, 1, 1);
+    const sFloat *spinor = spinorNeighbor_mg4dir(sid, mu, oddBit, static_cast<const sFloat *>(in.data()),
+                                                 fwd_nbr_spinor, back_nbr_spinor, 1, 1);
 
     sFloat gaugedSpinor[spinor_site_size];
 
@@ -212,15 +212,19 @@ void covdev_dslash_mg4dir(ColorSpinorField &out, const GaugeField &link, const C
 
   if (sPrecision == QUDA_DOUBLE_PRECISION) {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      covdevReference_mg4dir((double *)out.data(), reinterpret_cast<double**>(data), (double **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((double *)out.data(), reinterpret_cast<double **>(data), (double **)ghostLink, in, oddBit,
+                             daggerBit, mu);
     } else {
-      covdevReference_mg4dir((double *)out.data(), reinterpret_cast<float**>(data), (float **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((double *)out.data(), reinterpret_cast<float **>(data), (float **)ghostLink, in, oddBit,
+                             daggerBit, mu);
     }
   } else {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      covdevReference_mg4dir((float *)out.data(), reinterpret_cast<double**>(data), (double **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((float *)out.data(), reinterpret_cast<double **>(data), (double **)ghostLink, in, oddBit,
+                             daggerBit, mu);
     } else {
-      covdevReference_mg4dir((float *)out.data(), reinterpret_cast<float**>(data), (float **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((float *)out.data(), reinterpret_cast<float **>(data), (float **)ghostLink, in, oddBit,
+                             daggerBit, mu);
     }
   }
 }
@@ -237,8 +241,8 @@ void Mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinor
     auto &outOdd = out.Odd();
 
     inEven.exchangeGhost(QUDA_EVEN_PARITY, nFace, daggerBit);
-    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outOdd.data()), reinterpret_cast<gFloat**>(data),
-                           reinterpret_cast<gFloat**>(ghostLink), in.Even(), 1, daggerBit, mu);
+    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outOdd.data()), reinterpret_cast<gFloat **>(data),
+                           reinterpret_cast<gFloat **>(ghostLink), in.Even(), 1, daggerBit, mu);
   }
 
   {
@@ -246,8 +250,8 @@ void Mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinor
     auto &outEven = out.Even();
 
     inOdd.exchangeGhost(QUDA_ODD_PARITY, nFace, daggerBit);
-    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outEven.data()), reinterpret_cast<gFloat**>(data),
-                           reinterpret_cast<gFloat**>(ghostLink), in.Odd(), 0, daggerBit, mu);
+    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outEven.data()), reinterpret_cast<gFloat **>(data),
+                           reinterpret_cast<gFloat **>(ghostLink), in.Odd(), 0, daggerBit, mu);
   }
 }
 
@@ -268,9 +272,8 @@ void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinor
   }
 }
 
-void matdagmat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit,
-                      int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp,
-                      QudaParity parity)
+void matdagmat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu,
+                      QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity)
 {
   // assert sPrecision and gPrecision must be the same
   if (sPrecision != gPrecision) errorQuda("Spinor precision and gPrecison is not the same");
diff --git a/tests/host_reference/covdev_reference.h b/tests/host_reference/covdev_reference.h
index 679736109a..3c0c1b18e3 100644
--- a/tests/host_reference/covdev_reference.h
+++ b/tests/host_reference/covdev_reference.h
@@ -17,6 +17,5 @@ void matdagmat(void *out, const GaugeField &link, void *in, int dagger_bit, int
                QudaPrecision gPrecision, void *tmp, QudaParity parity);
 
 void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu);
-void matdagmat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit,
-                      int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp,
-                      QudaParity parity);
+void matdagmat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu,
+                      QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity);
diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index 3ed0c9aa5f..4edc471143 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -744,13 +744,16 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou
 }
 
 double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fatlink, quda::GaugeField &longlink,
-                                QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, int shift)
+                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fatlink,
+                                quda::GaugeField &longlink, QudaGaugeParam &gauge_param, QudaInvertParam &inv_param,
+                                int shift)
 {
   void *qdp_fatlink[] = {fatlink.data(0), fatlink.data(1), fatlink.data(2), fatlink.data(3)};
   void *qdp_longlink[] = {longlink.data(0), longlink.data(1), longlink.data(2), longlink.data(3)};
-  void *ghost_fatlink[] = {fatlink.Ghost()[0].data(), fatlink.Ghost()[1].data(), fatlink.Ghost()[2].data(), fatlink.Ghost()[3].data()};
-  void *ghost_longlink[] = {longlink.Ghost()[0].data(), longlink.Ghost()[1].data(), longlink.Ghost()[2].data(), longlink.Ghost()[3].data()};
+  void *ghost_fatlink[]
+    = {fatlink.Ghost()[0].data(), fatlink.Ghost()[1].data(), fatlink.Ghost()[2].data(), fatlink.Ghost()[3].data()};
+  void *ghost_longlink[]
+    = {longlink.Ghost()[0].data(), longlink.Ghost()[1].data(), longlink.Ghost()[2].data(), longlink.Ghost()[3].data()};
 
   switch (test_type) {
   case 0: // full parity solution, full parity system
diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h
index b6939ee08c..82745008fc 100644
--- a/tests/host_reference/dslash_reference.h
+++ b/tests/host_reference/dslash_reference.h
@@ -110,8 +110,9 @@ std::array<double, 2> verifyWilsonTypeInversion(void *spinorOut, void **spinorOu
                                                 QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv);
 
 double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fatlink, quda::GaugeField &longlink,
-                                QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, int shift);
+                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fatlink,
+                                quda::GaugeField &longlink, QudaGaugeParam &gauge_param, QudaInvertParam &inv_param,
+                                int shift);
 
 // i represents a "half index" into an even or odd "half lattice".
 // when oddBit={0,1} the half lattice is {even,odd}.
diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp
index a575532731..c43a7ca07e 100644
--- a/tests/host_reference/gauge_force_reference.cpp
+++ b/tests/host_reference/gauge_force_reference.cpp
@@ -478,8 +478,8 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *s
   host_free(staple);
 }
 
-void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ***path_dir,
-                           int *length, void *loop_coeff, int num_paths, bool compute_force)
+void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ***path_dir, int *length,
+                           void *loop_coeff, int num_paths, bool compute_force)
 {
   void *sitelink[] = {u.data(0), u.data(1), u.data(2), u.data(3)};
 
@@ -496,15 +496,15 @@ void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int **
 
   void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)};
   for (int dir = 0; dir < 4; dir++) {
-    gauge_force_reference_dir(refMom, dir, eb3, sitelink, sitelink_ex, u.Precision(), path_dir[dir], length,
-                              loop_coeff, num_paths, lat, compute_force);
+    gauge_force_reference_dir(refMom, dir, eb3, sitelink, sitelink_ex, u.Precision(), path_dir[dir], length, loop_coeff,
+                              num_paths, lat, compute_force);
   }
 
   delete qdp_ex;
 }
 
-void gauge_loop_trace_reference(quda::GaugeField &u, std::vector<quda::Complex> &loop_traces,
-                                double factor, int **input_path, int *length, double *path_coeff, int num_paths)
+void gauge_loop_trace_reference(quda::GaugeField &u, std::vector<quda::Complex> &loop_traces, double factor,
+                                int **input_path, int *length, double *path_coeff, int num_paths)
 {
   void *sitelink[] = {u.data(0), u.data(1), u.data(2), u.data(3)};
 
diff --git a/tests/host_reference/gauge_force_reference.h b/tests/host_reference/gauge_force_reference.h
index 5d65ecc82b..9b6d06a555 100644
--- a/tests/host_reference/gauge_force_reference.h
+++ b/tests/host_reference/gauge_force_reference.h
@@ -2,8 +2,8 @@
 
 #include <gauge_field.h>
 
-void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ***path_dir,
-                           int *length,  void *loop_coeff, int num_paths, bool compute_force);
+void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ***path_dir, int *length,
+                           void *loop_coeff, int num_paths, bool compute_force);
 
-void gauge_loop_trace_reference(quda::GaugeField &u, std::vector<quda::Complex> &loop_traces,
-                                double factor, int **input_path, int *length, double *path_coeff, int num_paths);
+void gauge_loop_trace_reference(quda::GaugeField &u, std::vector<quda::Complex> &loop_traces, double factor,
+                                int **input_path, int *length, double *path_coeff, int num_paths);
diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp
index 9cd4ee4d9c..5179f38a66 100644
--- a/tests/host_reference/hisq_force_reference.cpp
+++ b/tests/host_reference/hisq_force_reference.cpp
@@ -86,7 +86,7 @@ typedef struct {
 
 template <typename su3_matrix> su3_matrix *get_su3_matrix(quda::GaugeField &p, int idx, int dir)
 {
-  auto data = static_cast<su3_matrix*>(p.data(dir));
+  auto data = static_cast<su3_matrix *>(p.data(dir));
   return data + idx;
 }
 
@@ -1226,11 +1226,13 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda
   void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)};
   void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)};
   if (precision == QUDA_DOUBLE_PRECISION) {
-    doHisqStaplesForceCPU<double>(X_, act_path_coeff, reinterpret_cast<double*>(oprod_array), reinterpret_cast<double*>(link_array),
-                                  (double **)tempmat, reinterpret_cast<double*>(noprod_array));
+    doHisqStaplesForceCPU<double>(X_, act_path_coeff, reinterpret_cast<double *>(oprod_array),
+                                  reinterpret_cast<double *>(link_array), (double **)tempmat,
+                                  reinterpret_cast<double *>(noprod_array));
   } else if (precision == QUDA_SINGLE_PRECISION) {
-    doHisqStaplesForceCPU<float>(X_, act_path_coeff, reinterpret_cast<float*>(oprod_array), reinterpret_cast<float*>(link_array),
-                                 (float **)tempmat, reinterpret_cast<float*>(noprod_array));
+    doHisqStaplesForceCPU<float>(X_, act_path_coeff, reinterpret_cast<float *>(oprod_array),
+                                 reinterpret_cast<float *>(link_array), (float **)tempmat,
+                                 reinterpret_cast<float *>(noprod_array));
   } else {
     errorQuda("Unsupported precision");
   }
@@ -1304,8 +1306,7 @@ void computeLongLinkField(const int dim[4], const Real *const oprod, const Real
   }
 }
 
-void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeField &link,
-                          quda::GaugeField *newOprod)
+void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *newOprod)
 {
   int X_[4];
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
@@ -1316,11 +1317,11 @@ void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeFiel
   void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)};
   for (int sig = 0; sig < 4; ++sig) {
     if (precision == QUDA_SINGLE_PRECISION) {
-      computeLongLinkField<float>(X_, reinterpret_cast<float*>(oprod_array), reinterpret_cast<float*>(link_array),
-                                  sig, coeff, reinterpret_cast<float*>(noprod_array));
+      computeLongLinkField<float>(X_, reinterpret_cast<float *>(oprod_array), reinterpret_cast<float *>(link_array),
+                                  sig, coeff, reinterpret_cast<float *>(noprod_array));
     } else if (precision == QUDA_DOUBLE_PRECISION) {
-      computeLongLinkField<double>(X_, reinterpret_cast<double*>(oprod_array), reinterpret_cast<double*>(link_array),
-                                   sig, coeff, reinterpret_cast<double*>(noprod_array));
+      computeLongLinkField<double>(X_, reinterpret_cast<double *>(oprod_array), reinterpret_cast<double *>(link_array),
+                                   sig, coeff, reinterpret_cast<double *>(noprod_array));
     } else {
       errorQuda("Unrecognised precision");
     }
@@ -1376,11 +1377,11 @@ void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda:
   void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)};
   for (int sig = 0; sig < 4; ++sig) {
     if (precision == QUDA_SINGLE_PRECISION) {
-      completeForceField<float>(X_, reinterpret_cast<float*>(oprod_array), reinterpret_cast<float*>(link_array),
-                                sig, mom->data<float*>());
+      completeForceField<float>(X_, reinterpret_cast<float *>(oprod_array), reinterpret_cast<float *>(link_array), sig,
+                                mom->data<float *>());
     } else if (precision == QUDA_DOUBLE_PRECISION) {
-      completeForceField<double>(X_, reinterpret_cast<double*>(oprod_array), reinterpret_cast<double*>(link_array),
-                                 sig, mom->data<double*>());
+      completeForceField<double>(X_, reinterpret_cast<double *>(oprod_array), reinterpret_cast<double *>(link_array),
+                                 sig, mom->data<double *>());
     } else {
       errorQuda("Unrecognised precision");
     }
diff --git a/tests/host_reference/hisq_force_reference.h b/tests/host_reference/hisq_force_reference.h
index da6a8b770e..bdf78c4750 100644
--- a/tests/host_reference/hisq_force_reference.h
+++ b/tests/host_reference/hisq_force_reference.h
@@ -31,8 +31,7 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda
    @param[in] link Gauge field links
    @param[out] newOprod Force accumulated with fat link contributions
 */
-void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeField &link,
-                          quda::GaugeField *newOprod);
+void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *newOprod);
 
 /**
    @brief Accumulate the force contributions into the momentum field, CPU version
diff --git a/tests/staggered_gsmear_test_utils.h b/tests/staggered_gsmear_test_utils.h
index b9adfe4361..06c252292d 100644
--- a/tests/staggered_gsmear_test_utils.h
+++ b/tests/staggered_gsmear_test_utils.h
@@ -128,10 +128,10 @@ struct StaggeredGSmearTestWrapper { //
         quda::blas::ax(ftmp, tmp);
         quda::blas::axpy(a, tmp, tmp2);
 
-        staggeredTwoLinkGaussianSmear(spinorRef.Even(), qdp_twolnk, *cpuTwoLink, tmp.Even(),
-                                      &gauge_param, &inv_param, 0, smear_coeff, smear_t0, gauge_param.cpu_prec);
-        staggeredTwoLinkGaussianSmear(spinorRef.Odd(), qdp_twolnk, *cpuTwoLink, tmp.Odd(),
-                                      &gauge_param, &inv_param, 1, smear_coeff, smear_t0, gauge_param.cpu_prec);
+        staggeredTwoLinkGaussianSmear(spinorRef.Even(), qdp_twolnk, *cpuTwoLink, tmp.Even(), &gauge_param, &inv_param,
+                                      0, smear_coeff, smear_t0, gauge_param.cpu_prec);
+        staggeredTwoLinkGaussianSmear(spinorRef.Odd(), qdp_twolnk, *cpuTwoLink, tmp.Odd(), &gauge_param, &inv_param, 1,
+                                      smear_coeff, smear_t0, gauge_param.cpu_prec);
 
         // blas::xpay(*tmp2, -1.0, *spinorRef);
         xpay(tmp2.Even().data(), -1.0, spinorRef.Even().data(), spinor.Even().Length(), gauge_param.cpu_prec);
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 0d9f2c3e5e..5d4a96540c 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -178,9 +178,9 @@ void test(int argc, char **argv)
   cpuParam.order = QUDA_MILC_GAUGE_ORDER;
   GaugeField cpuLongMILC = GaugeField(cpuParam);
 
-  void* qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)};
-  void* qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)};
-  void* qdp_longlink[4] = {cpuLongQDP.data(0), cpuLongQDP.data(1), cpuLongQDP.data(2), cpuLongQDP.data(3)};
+  void *qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)};
+  void *qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)};
+  void *qdp_longlink[4] = {cpuLongQDP.data(0), cpuLongQDP.data(1), cpuLongQDP.data(2), cpuLongQDP.data(3)};
   constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv);
   // Reorder gauge fields to MILC order
   cpuFatMILC = cpuFatQDP;
@@ -277,8 +277,8 @@ void test(int argc, char **argv)
       }
       inv_param.num_src = Nsrc;
       inv_param.num_src_per_sub_partition = Nsrc / num_sub_partition;
-      invertMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, cpuFatMILC.data(),
-                                  cpuLongMILC.data(), &gauge_param);
+      invertMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, cpuFatMILC.data(), cpuLongMILC.data(),
+                                  &gauge_param);
       quda::comm_allreduce_int(inv_param.iter);
       inv_param.iter /= comm_size() / num_sub_partition;
       quda::comm_allreduce_sum(inv_param.gflops);
@@ -345,7 +345,8 @@ void test(int argc, char **argv)
 
       for (int i = 0; i < multishift; i++) {
         printfQuda("%dth solution: mass=%f, ", i, masses[i]);
-        verifyStaggeredInversion(tmp, ref, in[k], qudaOutArray[i], masses[i], cpuFatQDP, cpuLongQDP, gauge_param, inv_param, i);
+        verifyStaggeredInversion(tmp, ref, in[k], qudaOutArray[i], masses[i], cpuFatQDP, cpuLongQDP, gauge_param,
+                                 inv_param, i);
       }
     }
   } else {
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index d6eb26304f..75e0d2ad1f 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -59,7 +59,9 @@ void computeLongLinkCPU(void **longlink, void **sitelink, QudaPrecision prec, vo
 void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink,
                          void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik);
 void computeTwoLinkCPU(void **twolink, void **sitelink, QudaGaugeParam *gauge_param);
-void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], const quda::GaugeField &twolnk,  quda::ColorSpinorField &in, QudaGaugeParam *qudaGaugeParam, QudaInvertParam *inv_param, const int oddBit, const double width, const int t0, QudaPrecision prec);
+void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], const quda::GaugeField &twolnk,
+                                   quda::ColorSpinorField &in, QudaGaugeParam *qudaGaugeParam, QudaInvertParam *inv_param,
+                                   const int oddBit, const double width, const int t0, QudaPrecision prec);
 template <typename Float>
 void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type);
 void applyGaugeFieldScaling_long(void **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type,
@@ -183,7 +185,8 @@ double compare_floats_v2(void *a, void *b, int len, double epsilon, QudaPrecisio
 void check_gauge(void **, void **, double epsilon, QudaPrecision precision);
 
 int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec);
-int strong_check_link(const quda::GaugeField &linkA, const std::string &msgA, const quda::GaugeField &linkB, const std::string &msgB);
+int strong_check_link(const quda::GaugeField &linkA, const std::string &msgA, const quda::GaugeField &linkB,
+                      const std::string &msgB);
 int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec);
 
 /**
diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp
index cc9148fca5..df1d787ff0 100644
--- a/tests/utils/staggered_host_utils.cpp
+++ b/tests/utils/staggered_host_utils.cpp
@@ -474,7 +474,8 @@ void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk
   return;
 }
 #else
-void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &, void **, const quda::GaugeField &,  quda::ColorSpinorField&, QudaGaugeParam* , QudaInvertParam* , const int , const double , const int , QudaPrecision )
+void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &, void **, const quda::GaugeField &, quda::ColorSpinorField &,
+                                   QudaGaugeParam *, QudaInvertParam *, const int, const double, const int, QudaPrecision)
 {}
 #endif
 

From 5e55e1ec15d49d6e133107dc0e6cc7e35746a409 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 16 Oct 2023 16:21:43 -0700
Subject: [PATCH 60/60] More clang-format

---
 include/gauge_field_order.h        |  2 +-
 include/tune_quda.h                |  2 +-
 lib/coarse_op.in.cu                |  2 +-
 lib/coarse_op_preconditioned.in.cu |  6 ++++--
 lib/color_spinor_util.in.cu        |  3 ++-
 lib/dslash_clover_helper.cu        | 20 ++++++++++++++++----
 lib/dslash_gamma_helper.cu         | 10 ++++++++--
 lib/gauge_fix_fft.cu               |  4 ++--
 lib/gauge_stout.cu                 | 10 ++++++++--
 lib/hisq_paths_force_quda.cu       |  8 ++++----
 lib/llfat_quda.cu                  | 25 ++++++++-----------------
 lib/staggered_coarse_op.in.cu      |  2 +-
 lib/staggered_kd_build_xinv.cu     | 17 ++++++++++-------
 lib/staggered_oprod.cu             | 12 ++++++++++--
 lib/staggered_two_link_quda.cu     |  2 +-
 lib/unitarize_links_quda.cu        | 16 ++++++++--------
 16 files changed, 85 insertions(+), 56 deletions(-)

diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 2ce7197a83..a2c0300a1d 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -883,7 +883,7 @@ namespace quda {
         __device__ __host__ inline auto VolumeCB() const { return volumeCB; }
 
         /** Returns the field geometric dimension */
-	__device__ __host__ inline int Ndim() const { return nDim; }
+        __device__ __host__ inline int Ndim() const { return nDim; }
 
 	/** Returns the field geometry */
 	__device__ __host__ inline int Geometry() const { return geometry; }
diff --git a/include/tune_quda.h b/include/tune_quda.h
index c6a7d0c111..9da6a82411 100644
--- a/include/tune_quda.h
+++ b/include/tune_quda.h
@@ -22,7 +22,7 @@ namespace quda {
     dim3 grid;
     unsigned int shared_bytes = 0;
     bool set_max_shared_bytes = false; // whether to opt in to max shared bytes per thread block
-    int4 aux = {1, 1, 1, 1}; // free parameter used as an arbitrary autotuning dimension
+    int4 aux = {1, 1, 1, 1};           // free parameter used as an arbitrary autotuning dimension
 
     std::string comment;
     float time = FLT_MAX;
diff --git a/lib/coarse_op.in.cu b/lib/coarse_op.in.cu
index 259da32c98..c586f8f1dd 100644
--- a/lib/coarse_op.in.cu
+++ b/lib/coarse_op.in.cu
@@ -97,7 +97,7 @@ namespace quda {
       gCoarseAtomic yAccessorAtomic(const_cast<GaugeField&>(Yatomic));
       gCoarseAtomic xAccessorAtomic(const_cast<GaugeField&>(Xatomic));
       cFine cAccessor(const_cast<CloverField&>(c), false);
-      cFine cInvAccessor(const_cast<CloverField&>(c), c.Inverse());
+      cFine cInvAccessor(const_cast<CloverField &>(c), c.Inverse());
 
       calculateY<use_mma, QUDA_CUDA_FIELD_LOCATION, false,Float,fineSpin,fineColor,coarseSpin,coarseColor>
         (yAccessor, xAccessor, yAccessorAtomic, xAccessorAtomic, uvAccessor,
diff --git a/lib/coarse_op_preconditioned.in.cu b/lib/coarse_op_preconditioned.in.cu
index 7f487d0b9b..4ac89b7c0b 100644
--- a/lib/coarse_op_preconditioned.in.cu
+++ b/lib/coarse_op_preconditioned.in.cu
@@ -175,7 +175,8 @@ namespace quda
       GaugeField *X_aos = create_gauge_copy(X, true);
       Xinv_aos = create_gauge_copy(Xinv, false);
 
-      Tunable::flops_global(invert(Xinv_aos->data(), X_aos->data(), n, X_aos->Volume(), X_aos->Precision(), X.Location()) + Tunable::flops_global());
+      Tunable::flops_global(invert(Xinv_aos->data(), X_aos->data(), n, X_aos->Volume(), X_aos->Precision(), X.Location())
+                            + Tunable::flops_global());
 
       if (&Xinv != Xinv_aos) {
         if (Xinv.Precision() < QUDA_SINGLE_PRECISION) Xinv.Scale(Xinv_aos->abs_max());
@@ -186,7 +187,8 @@ namespace quda
       if (!use_mma) { delete Xinv_aos; }
 
     } else if (X.Location() == QUDA_CPU_FIELD_LOCATION && X.Order() == QUDA_QDP_GAUGE_ORDER) {
-      Tunable::flops_global(invert(Xinv.data<void *>(0), X.data<void *>(0), n, X.Volume(), X.Precision(), X.Location()) + Tunable::flops_global());
+      Tunable::flops_global(invert(Xinv.data<void *>(0), X.data<void *>(0), n, X.Volume(), X.Precision(), X.Location())
+                            + Tunable::flops_global());
     } else {
       errorQuda("Unsupported location=%d and order=%d", X.Location(), X.Order());
     }
diff --git a/lib/color_spinor_util.in.cu b/lib/color_spinor_util.in.cu
index c1471b262b..0b9355d4d1 100644
--- a/lib/color_spinor_util.in.cu
+++ b/lib/color_spinor_util.in.cu
@@ -417,7 +417,8 @@ namespace quda {
 
     param.create = create;
     if (create == QUDA_COPY_FIELD_CREATE) param.field = &const_cast<ColorSpinorField&>(src);
-    else if (create == QUDA_REFERENCE_FIELD_CREATE) param.v = src.data();
+    else if (create == QUDA_REFERENCE_FIELD_CREATE)
+      param.v = src.data();
 
     resize(v, new_size, param);
   }
diff --git a/lib/dslash_clover_helper.cu b/lib/dslash_clover_helper.cu
index accc50d31a..fa2ba4d365 100644
--- a/lib/dslash_clover_helper.cu
+++ b/lib/dslash_clover_helper.cu
@@ -35,8 +35,14 @@ namespace quda {
       launch<CloverApply>(tp, stream, CloverArg<Float, nColor>(out, in, clover, parity));
     }
 
-    void preTune() { if (out.data() == in.data()) out.backup(); }  // Backup if in and out fields alias
-    void postTune() { if (out.data() == in.data()) out.restore(); } // Restore if the in and out fields alias
+    void preTune()
+    {
+      if (out.data() == in.data()) out.backup();
+    } // Backup if in and out fields alias
+    void postTune()
+    {
+      if (out.data() == in.data()) out.restore();
+    } // Restore if the in and out fields alias
     long long flops() const { return in.Volume()*504ll; }
     long long bytes() const { return out.Bytes() + in.Bytes() + clover.Bytes() / (3 - in.SiteSubset()); }
   };
@@ -115,8 +121,14 @@ namespace quda {
       }
     }
 
-    void preTune() { if (out.data() == in.data()) out.backup(); } // Restore if the in and out fields alias
-    void postTune() { if (out.data() == in.data()) out.restore(); } // Restore if the in and out fields alias
+    void preTune()
+    {
+      if (out.data() == in.data()) out.backup();
+    } // Backup if the in and out fields alias
+    void postTune()
+    {
+      if (out.data() == in.data()) out.restore();
+    } // Restore if the in and out fields alias
     long long flops() const { return (inverse ? 1056ll : 552ll) * in.Volume(); }
     long long bytes() const {
       long long rtn = out.Bytes() + in.Bytes() + clover.Bytes() / (3 - in.SiteSubset());
diff --git a/lib/dslash_gamma_helper.cu b/lib/dslash_gamma_helper.cu
index 2e76504afd..66c523df20 100644
--- a/lib/dslash_gamma_helper.cu
+++ b/lib/dslash_gamma_helper.cu
@@ -74,8 +74,14 @@ namespace quda {
       launch<TwistGamma>(tp, stream, GammaArg<Float, nColor>(out, in, d, kappa, mu, epsilon, dagger, type));
     }
 
-    void preTune() { if (out.data() == in.data()) out.backup(); }
-    void postTune() { if (out.data() == in.data()) out.restore(); }
+    void preTune()
+    {
+      if (out.data() == in.data()) out.backup();
+    }
+    void postTune()
+    {
+      if (out.data() == in.data()) out.restore();
+    }
     long long flops() const { return 0; }
     long long bytes() const { return out.Bytes() + in.Bytes(); }
   };
diff --git a/lib/gauge_fix_fft.cu b/lib/gauge_fix_fft.cu
index fea9a92623..b72a3aef78 100644
--- a/lib/gauge_fix_fft.cu
+++ b/lib/gauge_fix_fft.cu
@@ -366,10 +366,10 @@ namespace quda {
                    double alpha, int autotune, double tolerance, int stopWtheta)
     {
       if (gauge_dir != 3) {
-	logQuda(QUDA_SUMMARIZE, "Starting Landau gauge fixing with FFTs...\n");
+        logQuda(QUDA_SUMMARIZE, "Starting Landau gauge fixing with FFTs...\n");
         gaugeFixingFFT<Float, recon, 4>(data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
       } else {
-	logQuda(QUDA_SUMMARIZE, "Starting Coulomb gauge fixing with FFTs...\n");
+        logQuda(QUDA_SUMMARIZE, "Starting Coulomb gauge fixing with FFTs...\n");
         gaugeFixingFFT<Float, recon, 3>(data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
       }
     }
diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu
index f537ca60ea..c7f256f2ee 100644
--- a/lib/gauge_stout.cu
+++ b/lib/gauge_stout.cu
@@ -49,8 +49,14 @@ namespace quda {
       }
     }
 
-    void preTune() { if (out.data() == in.data()) out.backup(); }
-    void postTune() { if (out.data() == in.data()) out.restore(); }
+    void preTune()
+    {
+      if (out.data() == in.data()) out.backup();
+    }
+    void postTune()
+    {
+      if (out.data() == in.data()) out.restore();
+    }
 
     long long flops() const // just counts matrix multiplication
     {
diff --git a/lib/hisq_paths_force_quda.cu b/lib/hisq_paths_force_quda.cu
index e6e30f90bc..f076ca0b5b 100644
--- a/lib/hisq_paths_force_quda.cu
+++ b/lib/hisq_paths_force_quda.cu
@@ -570,10 +570,10 @@ namespace quda {
       auto Pnumu_next = GaugeField(gauge_param);
       auto Qnumu_next = GaugeField(gauge_param);
 
-      instantiateGaugeStaggered<HisqStaplesForce>(link, P3, GaugeField_ref(Pmu),
-        GaugeField_ref(P5), GaugeField_ref(Pnumu), GaugeField_ref(Qnumu),
-        GaugeField_ref(Pmu_next), GaugeField_ref(Pnumu_next), GaugeField_ref(Qnumu_next),
-        newOprod, oprod, path_coeff_array);
+      instantiateGaugeStaggered<HisqStaplesForce>(link, P3, GaugeField_ref(Pmu), GaugeField_ref(P5),
+                                                  GaugeField_ref(Pnumu), GaugeField_ref(Qnumu),
+                                                  GaugeField_ref(Pmu_next), GaugeField_ref(Pnumu_next),
+                                                  GaugeField_ref(Qnumu_next), newOprod, oprod, path_coeff_array);
 
       getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     }
diff --git a/lib/llfat_quda.cu b/lib/llfat_quda.cu
index cd32a54e6c..face33527a 100644
--- a/lib/llfat_quda.cu
+++ b/lib/llfat_quda.cu
@@ -173,7 +173,7 @@ namespace quda {
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
-  void fatKSLink(GaugeField &fat, const GaugeField& u, const double *coeff)
+  void fatKSLink(GaugeField &fat, const GaugeField &u, const double *coeff)
   {
     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
 
@@ -184,13 +184,12 @@ namespace quda {
     GaugeField staple(gParam);
     GaugeField staple1(gParam);
 
-    if ( ((fat.X()[0] % 2 != 0) || (fat.X()[1] % 2 != 0) || (fat.X()[2] % 2 != 0) || (fat.X()[3] % 2 != 0))
-	&& (u.Reconstruct()  != QUDA_RECONSTRUCT_NO)){
-      errorQuda("Reconstruct %d and odd dimension size is not supported by link fattening code (yet)",
-		u.Reconstruct());
+    if (((fat.X()[0] % 2 != 0) || (fat.X()[1] % 2 != 0) || (fat.X()[2] % 2 != 0) || (fat.X()[3] % 2 != 0))
+        && (u.Reconstruct() != QUDA_RECONSTRUCT_NO)) {
+      errorQuda("Reconstruct %d and odd dimension size is not supported by link fattening code (yet)", u.Reconstruct());
     }
 
-    computeOneLink(fat, u, coeff[0]-6.0*coeff[5]);
+    computeOneLink(fat, u, coeff[0] - 6.0 * coeff[5]);
 
     // Check the coefficients. If all of the following are zero, return.
     if (fabs(coeff[2]) >= MIN_COEFF || fabs(coeff[3]) >= MIN_COEFF ||
@@ -208,9 +207,7 @@ namespace quda {
 
             if (fabs(coeff[4]) > MIN_COEFF) {
               for (int sig = 0; sig < 4; sig++) {
-                if (sig != nu && sig != rho) {
-                  computeStaple(fat, staple, staple1, u, sig, nu, rho, coeff[4], 0);
-                }
+                if (sig != nu && sig != rho) { computeStaple(fat, staple, staple1, u, sig, nu, rho, coeff[4], 0); }
               } //sig
             } // MIN_COEFF
           }
@@ -221,15 +218,9 @@ namespace quda {
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else
-  void longKSLink(GaugeField &, const GaugeField&, const double *)
-  {
-    errorQuda("Long-link computation not enabled");
-  }
+  void longKSLink(GaugeField &, const GaugeField &, const double *) { errorQuda("Long-link computation not enabled"); }
 
-  void fatKSLink(GaugeField &, const GaugeField&, const double *)
-  {
-    errorQuda("Fat-link computation not enabled");
-  }
+  void fatKSLink(GaugeField &, const GaugeField &, const double *) { errorQuda("Fat-link computation not enabled"); }
 #endif
 
 #undef MIN_COEFF
diff --git a/lib/staggered_coarse_op.in.cu b/lib/staggered_coarse_op.in.cu
index 3a03467d9a..cec28e4f2e 100644
--- a/lib/staggered_coarse_op.in.cu
+++ b/lib/staggered_coarse_op.in.cu
@@ -357,7 +357,7 @@ namespace quda {
       //Copy the cuda gauge field to the cpu
       tmp_U.get()->copy(gauge);
 
-            // Create either a real or a dummy L field
+      // Create either a real or a dummy L field
       GaugeFieldParam lgf_param(longGauge.X(), precision, QUDA_RECONSTRUCT_NO, pad, longGauge.Geometry());
       if (!(dirac == QUDA_ASQTAD_DIRAC || dirac == QUDA_ASQTADKD_DIRAC))
         for (int i = 0; i < lgf_param.nDim; i++) lgf_param.x[i] = 0;
diff --git a/lib/staggered_kd_build_xinv.cu b/lib/staggered_kd_build_xinv.cu
index 1d93de4389..4494b75e1b 100644
--- a/lib/staggered_kd_build_xinv.cu
+++ b/lib/staggered_kd_build_xinv.cu
@@ -113,7 +113,8 @@ namespace quda {
      @param mass[in] Mass of staggered fermion
      @param dagger_approximation[in] Whether or not to use the dagger approximation, using the dagger of X instead of Xinv
    */
-  void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const GaugeField &gauge, const double mass, const bool dagger_approximation)
+  void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const GaugeField &gauge, const double mass,
+                                        const bool dagger_approximation)
   {
     using namespace blas_lapack;
     auto invert = use_native() ? native::BatchInvertMatrix : generic::BatchInvertMatrix;
@@ -234,17 +235,18 @@ namespace quda {
         param.order = QUDA_MILC_GAUGE_ORDER; // MILC order == QDP order for Xinv
         param.setPrecision(QUDA_SINGLE_PRECISION);
         GaugeField X_(param);
-        
+
         X_.copy(X);
 
-        Tunable::flops_global(invert(xInvMilcOrder->data(), X_.data(), n, X_.Volume(), X_.Precision(), X.Location()) + Tunable::flops_global());
+        Tunable::flops_global(invert(xInvMilcOrder->data(), X_.data(), n, X_.Volume(), X_.Precision(), X.Location())
+                              + Tunable::flops_global());
 
       } else if (location == QUDA_CPU_FIELD_LOCATION) {
-        Tunable::flops_global(invert(xInvMilcOrder->data(), X.data(), n, X.Volume(), X.Precision(), X.Location()) + Tunable::flops_global());
+        Tunable::flops_global(invert(xInvMilcOrder->data(), X.data(), n, X.Volume(), X.Precision(), X.Location())
+                              + Tunable::flops_global());
       }
 
       logQuda(QUDA_VERBOSE, "xInvMilcOrder = %e\n", xInvMilcOrder->norm2(0));
-
     }
 
     // Step 6: reorder the KD inverse into a "gauge field" with a QUDA_KDINVERSE_GEOMETRY
@@ -257,7 +259,8 @@ namespace quda {
 
 
   // Allocates and calculates the inverse KD block, returning Xinv
-  std::shared_ptr<GaugeField> AllocateAndBuildStaggeredKahlerDiracInverse(const GaugeField &gauge, const double mass, const bool dagger_approximation)
+  std::shared_ptr<GaugeField> AllocateAndBuildStaggeredKahlerDiracInverse(const GaugeField &gauge, const double mass,
+                                                                          const bool dagger_approximation)
   {
     GaugeFieldParam gParam(gauge);
     gParam.reconstruct = QUDA_RECONSTRUCT_NO;
@@ -271,7 +274,7 @@ namespace quda {
     // latter true is to force FLOAT2
     gParam.setPrecision(gauge.Precision(), true);
 
-    std::shared_ptr<GaugeField> Xinv(reinterpret_cast<GaugeField*>(new GaugeField(gParam)));
+    std::shared_ptr<GaugeField> Xinv(reinterpret_cast<GaugeField *>(new GaugeField(gParam)));
 
     BuildStaggeredKahlerDiracInverse(*Xinv, gauge, mass, dagger_approximation);
 
diff --git a/lib/staggered_oprod.cu b/lib/staggered_oprod.cu
index bd085c899b..5899b73916 100644
--- a/lib/staggered_oprod.cu
+++ b/lib/staggered_oprod.cu
@@ -83,8 +83,16 @@ namespace quda {
       }
     } // apply
 
-    void preTune() { U.backup(); if (U.data() != L.data()) L.backup(); }
-    void postTune() { U.restore(); if (U.data() != L.data()) L.restore(); }
+    void preTune()
+    {
+      U.backup();
+      if (U.data() != L.data()) L.backup();
+    }
+    void postTune()
+    {
+      U.restore();
+      if (U.data() != L.data()) L.restore();
+    }
 
     long long flops() const { return 0; } // FIXME
     long long bytes() const { return 0; } // FIXME
diff --git a/lib/staggered_two_link_quda.cu b/lib/staggered_two_link_quda.cu
index 3afb950d82..87182fbcab 100644
--- a/lib/staggered_two_link_quda.cu
+++ b/lib/staggered_two_link_quda.cu
@@ -57,7 +57,7 @@ namespace quda
     checkNative(newTwoLink, link);
     checkLocation(newTwoLink, link);
     checkPrecision(newTwoLink, link);
-    instantiate<ComputeTwoLink, ReconstructNone>(link, newTwoLink);//FIXME : enable link-12/8 reconstruction  
+    instantiate<ComputeTwoLink, ReconstructNone>(link, newTwoLink);//FIXME : enable link-12/8 reconstruction
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else
diff --git a/lib/unitarize_links_quda.cu b/lib/unitarize_links_quda.cu
index 83ea615c48..8bddafea55 100644
--- a/lib/unitarize_links_quda.cu
+++ b/lib/unitarize_links_quda.cu
@@ -62,12 +62,12 @@ namespace quda {
 	if (infield.Precision() == QUDA_SINGLE_PRECISION) {
           copyArrayToLink(inlink, infield.data<float *>() + (i * 4 + dir) * 18); // order of arguments?
           unitarizeLinkNewton(outlink, inlink, max_iter_newton);
-	  copyLinkToArray(outfield.data<float *>() + (i * 4 + dir) * 18, outlink);
-	} else if (infield.Precision() == QUDA_DOUBLE_PRECISION) {
-	  copyArrayToLink(inlink, infield.data<double *>() + (i * 4 + dir) * 18); // order of arguments?
+          copyLinkToArray(outfield.data<float *>() + (i * 4 + dir) * 18, outlink);
+        } else if (infield.Precision() == QUDA_DOUBLE_PRECISION) {
+          copyArrayToLink(inlink, infield.data<double *>() + (i * 4 + dir) * 18); // order of arguments?
           unitarizeLinkNewton(outlink, inlink, max_iter_newton);
-	  copyLinkToArray(outfield.data<double*>() + (i * 4 + dir) * 18, outlink);
-	} // precision?
+          copyLinkToArray(outfield.data<double *>() + (i * 4 + dir) * 18, outlink);
+        } // precision?
       } // dir
     }   // loop over volume
   }
@@ -85,9 +85,9 @@ namespace quda {
         } else if (field.Precision() == QUDA_DOUBLE_PRECISION) {
           copyArrayToLink(link, field.data<double *>() + (i * 4 + dir) * 18); // order of arguments?
         } else {
-	  errorQuda("Unsupported precision\n");
-	}
-	if (link.isUnitary(max_error) == false) {
+          errorQuda("Unsupported precision\n");
+        }
+        if (link.isUnitary(max_error) == false) {
 	  printf("Unitarity failure\n");
 	  printf("site index = %u,\t direction = %d\n", i, dir);
 	  printLink(link);