From 3a9db3361cf9e567ec7bc2b393e01ccdac40b1a5 Mon Sep 17 00:00:00 2001
From: Mathias Wagner <mathiasw@nvidia.com>
Date: Wed, 19 Jul 2023 16:22:06 +0200
Subject: [PATCH 1/9] add github actions build for CUDA

---
 .../workflows/cuda_githubactions_build.yml    | 63 +++++++++++++++++++
 CMakeLists.txt                                |  4 +-
 lib/dslash_coarse.hpp                         |  2 +
 lib/targets/cuda/malloc.cpp                   |  1 +
 lib/targets/cuda/target_cuda.cmake            |  5 +-
 5 files changed, 72 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/cuda_githubactions_build.yml
diff --git a/.github/workflows/cuda_githubactions_build.yml b/.github/workflows/cuda_githubactions_build.yml
new file mode 100644
index 0000000000..9cd10f6500
--- /dev/null
+++ b/.github/workflows/cuda_githubactions_build.yml
@@ -0,0 +1,63 @@
+name: CUDA_GithubActions_Build
+
+on:
+  pull_request:
+    branches: [ "develop" ]
+
+defaults:
+  run:
+    shell: bash
+
+env:
+  BUILD_TYPE: STRICT
+  CCACHE_COMPILERCHECK: content
+
+jobs:
+    build:
+      strategy:
+        matrix:
+          compiler: [g++-12, clang++-14]
+      runs-on: ubuntu-latest
+
+      steps:
+      - name: Install software
+        run: |
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
+          sudo dpkg -i cuda-keyring_1.0-1_all.deb
+          sudo apt-get update -y
+          sudo apt-get install -y --no-install-recommends  ninja-build cmake libopenmpi-dev gfortran
+     
+      - uses: awalsh128/cache-apt-pkgs-action@latest
+        with:
+          packages: cuda-compiler-12-1 cuda-libraries-dev-12-1 cuda-nvml-dev-12-1
+          execute_install_scripts: true
+
+      - uses: actions/checkout@v3
+
+      - name: Ccache for gh actions
+        uses: hendrikmuhs/ccache-action@v1.2.9
+        with:
+          key: ${{ github.job }}-${{ matrix.compiler }}
+          max-size: 2000M
+
+      - name: Configure CMake
+        run: >
+          cmake 
+          -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.1/bin/nvcc
+          -DCMAKE_CXX_COMPILER=${{matrix.compiler}}
+          -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache
+          -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+          -DQUDA_GPU_ARCH=sm_80 -DQUDA_GPU_ARCH_SUFFIX=virtual 
+          -DQUDA_MULTIGRID=ON 
+          -DQUDA_JITIFY=ON
+          -DQUDA_MPI=ON -DMPI_CXX_SKIP_MPICXX=ON 
+          -DQUDA_PRECISION=10 -DQUDA_FAST_COMPILE_DSLASH=ON -DQUDA_FAST_COMPILE_REDUCE=ON
+          -GNinja
+          -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
+
+      - name: Build
+        run: cmake --build ${{github.workspace}}/build -v
+
+      - name: Install
+        run: cmake --install ${{github.workspace}}/build 
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e14963618..dcde282f73 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -339,7 +339,7 @@ set(CMAKE_CXX_FLAGS_DEVEL
   "-g -O3"
   CACHE STRING "Flags used by the C++ compiler during regular development builds.")
 set(CMAKE_CXX_FLAGS_STRICT
-  "-O3"
+  "-Os"
   CACHE STRING "Flags used by the C++ compiler during strict jenkins builds.")
 set(CMAKE_CXX_FLAGS_RELEASE
   "-O3 ${CXX_OPT}"
@@ -361,7 +361,7 @@ set(CMAKE_C_FLAGS_DEVEL
   "-g -O3"
   CACHE STRING "Flags used by the C compiler during regular development builds.")
 set(CMAKE_C_FLAGS_STRICT
-  "-O3"
+  "-Os"
   CACHE STRING "Flags used by the C compiler during strict jenkins builds.")
 set(CMAKE_C_FLAGS_RELEASE
   "-O3"
diff --git a/lib/dslash_coarse.hpp b/lib/dslash_coarse.hpp
index 6e46d953fe..f696dac20e 100644
--- a/lib/dslash_coarse.hpp
+++ b/lib/dslash_coarse.hpp
@@ -669,10 +669,12 @@ namespace quda {
 
   static inline void enable_policy(DslashCoarsePolicy p) { policies[static_cast<std::size_t>(p)] = p; }
 
+#if 0
   static inline void disable_policy(DslashCoarsePolicy p)
   {
     policies[static_cast<std::size_t>(p)] = DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED;
   }
+#endif
 
   template <typename Launch>
   class DslashCoarsePolicyTune : public Tunable {
diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp
index 7988dc6479..bd18d0f1d4 100644
--- a/lib/targets/cuda/malloc.cpp
+++ b/lib/targets/cuda/malloc.cpp
@@ -482,6 +482,7 @@ namespace quda
   void host_free_(const char *func, const char *file, int line, void *ptr)
   {
     if (!ptr) { errorQuda("Attempt to free NULL host pointer (%s:%d in %s())\n", file, line, func); }
+    track_free(MAPPED, ptr);
     if (alloc[HOST].count(ptr)) {
       track_free(HOST, ptr);
       free(ptr);
diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index a62e971420..ec4b87a7ae 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -242,6 +242,7 @@ target_include_directories(quda PRIVATE ${CMAKE_SOURCE_DIR}/include/targets/cuda
 target_include_directories(quda PUBLIC $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include/targets/cuda>
                                        $<INSTALL_INTERFACE:include/targets/cuda>)
 target_include_directories(quda SYSTEM PRIVATE ${CMAKE_SOURCE_DIR}/include/targets/cuda/externals)
+target_include_directories(quda_cpp SYSTEM PRIVATE ${CMAKE_SOURCE_DIR}/include/targets/cuda/externals)
 
 # Specific config dependent warning suppressions and lineinfo forwarding
 
@@ -257,7 +258,9 @@ target_compile_options(
           -Wreorder
           $<$<CXX_COMPILER_ID:Clang>:
           -Xcompiler=-Wno-unused-function
-          -Xcompiler=-Wno-unknown-pragmas>
+          -Xcompiler=-Wno-unknown-pragmas
+          -Xcompiler=-Wno-error=pass-failed
+          -Xcompiler=-Wno-error=unneeded-internal-declaration>
           $<$<CXX_COMPILER_ID:GNU>:
           -Xcompiler=-Wno-unknown-pragmas>
           $<$<CONFIG:DEVEL>:-Xptxas

From e8ab6007a82a20e684d44f5f525b843800fa94e4 Mon Sep 17 00:00:00 2001
From: Mathias Wagner <mathiasw@nvidia.com>
Date: Thu, 20 Jul 2023 09:59:36 +0200
Subject: [PATCH 2/9] remove track_free

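The track_free(MAPPED, ptr) call added to host_free_() in the previous
patch might not be necessary here, so remove it again.

This patch also replaces track_malloc() with _malloc() at four allocation
sites in malloc.cpp; PATCH 3/9 changes those calls back.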
---
 lib/targets/cuda/malloc.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp
index bd18d0f1d4..14dbe30ce0 100644
--- a/lib/targets/cuda/malloc.cpp
+++ b/lib/targets/cuda/malloc.cpp
@@ -240,7 +240,7 @@ namespace quda
 #endif
 
     if (is_prefetch_enabled()) qudaMemPrefetchAsync(ptr, size, QUDA_CUDA_FIELD_LOCATION, device::get_default_stream());
-    track_malloc(DEVICE, a, ptr);
+    _malloc(DEVICE, a, ptr);
 #ifdef HOST_DEBUG
     cudaMemset(ptr, 0xff, size);
 #endif
@@ -267,7 +267,7 @@ namespace quda
     if (err != CUDA_SUCCESS) {
       errorQuda("Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func);
     }
-    track_malloc(DEVICE_PINNED, a, ptr);
+    _malloc(DEVICE_PINNED, a, ptr);
 #ifdef HOST_DEBUG
     cudaMemset(ptr, 0xff, size);
 #endif
@@ -286,7 +286,7 @@ namespace quda
 
     void *ptr = malloc(size);
     if (!ptr) { errorQuda("Failed to allocate host memory of size %zu (%s:%d in %s())\n", size, file, line, func); }
-    track_malloc(HOST, a, ptr);
+    _malloc(HOST, a, ptr);
 #ifdef HOST_DEBUG
     memset(ptr, 0xff, size);
 #endif
@@ -311,7 +311,7 @@ namespace quda
     if (err != cudaSuccess) {
       errorQuda("Failed to register pinned memory of size %zu (%s:%d in %s())\n", size, file, line, func);
     }
-    track_malloc(PINNED, a, ptr);
+    _malloc(PINNED, a, ptr);
 #ifdef HOST_DEBUG
     memset(ptr, 0xff, a.base_size);
 #endif
@@ -482,7 +482,6 @@ namespace quda
   void host_free_(const char *func, const char *file, int line, void *ptr)
   {
     if (!ptr) { errorQuda("Attempt to free NULL host pointer (%s:%d in %s())\n", file, line, func); }
-    track_free(MAPPED, ptr);
     if (alloc[HOST].count(ptr)) {
       track_free(HOST, ptr);
       free(ptr);

From 96de83a9d831a705bf9854f0ae117afe6dec73b3 Mon Sep 17 00:00:00 2001
From: Mathias Wagner <mathiasw@nvidia.com>
Date: Thu, 20 Jul 2023 10:06:37 +0200
Subject: [PATCH 3/9] Restore track_malloc calls in malloc.cpp

---
 lib/targets/cuda/malloc.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp
index 14dbe30ce0..7988dc6479 100644
--- a/lib/targets/cuda/malloc.cpp
+++ b/lib/targets/cuda/malloc.cpp
@@ -240,7 +240,7 @@ namespace quda
 #endif
 
     if (is_prefetch_enabled()) qudaMemPrefetchAsync(ptr, size, QUDA_CUDA_FIELD_LOCATION, device::get_default_stream());
-    _malloc(DEVICE, a, ptr);
+    track_malloc(DEVICE, a, ptr);
 #ifdef HOST_DEBUG
     cudaMemset(ptr, 0xff, size);
 #endif
@@ -267,7 +267,7 @@ namespace quda
     if (err != CUDA_SUCCESS) {
       errorQuda("Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func);
     }
-    _malloc(DEVICE_PINNED, a, ptr);
+    track_malloc(DEVICE_PINNED, a, ptr);
 #ifdef HOST_DEBUG
     cudaMemset(ptr, 0xff, size);
 #endif
@@ -286,7 +286,7 @@ namespace quda
 
     void *ptr = malloc(size);
     if (!ptr) { errorQuda("Failed to allocate host memory of size %zu (%s:%d in %s())\n", size, file, line, func); }
-    _malloc(HOST, a, ptr);
+    track_malloc(HOST, a, ptr);
 #ifdef HOST_DEBUG
     memset(ptr, 0xff, size);
 #endif
@@ -311,7 +311,7 @@ namespace quda
     if (err != cudaSuccess) {
       errorQuda("Failed to register pinned memory of size %zu (%s:%d in %s())\n", size, file, line, func);
     }
-    _malloc(PINNED, a, ptr);
+    track_malloc(PINNED, a, ptr);
 #ifdef HOST_DEBUG
     memset(ptr, 0xff, a.base_size);
 #endif

From 2b147dc644e518c128c33056b01b58856179fc3b Mon Sep 17 00:00:00 2001
From: Mathias Wagner <mathiasw@nvidia.com>
Date: Thu, 20 Jul 2023 10:29:27 +0200
Subject: [PATCH 4/9] further reduce CI build (set QUDA_MULTIGRID_NVEC_LIST, QUDA_MDW_FUSED_LS_LIST)

---
 .github/workflows/cuda_githubactions_build.yml | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/cuda_githubactions_build.yml b/.github/workflows/cuda_githubactions_build.yml
index 9cd10f6500..8248e5242e 100644
--- a/.github/workflows/cuda_githubactions_build.yml
+++ b/.github/workflows/cuda_githubactions_build.yml
@@ -1,4 +1,4 @@
-name: CUDA_GithubActions_Build
+name: cuda_ghactions_build
 
 on:
   pull_request:
@@ -47,16 +47,17 @@ jobs:
           -DCMAKE_CXX_COMPILER=${{matrix.compiler}}
           -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache
           -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-          -DQUDA_GPU_ARCH=sm_80 -DQUDA_GPU_ARCH_SUFFIX=virtual 
-          -DQUDA_MULTIGRID=ON 
-          -DQUDA_JITIFY=ON
+          -DQUDA_GPU_ARCH=sm_80 -DQUDA_GPU_ARCH_SUFFIX=virtual -DQUDA_JITIFY=ON
+          -DQUDA_MULTIGRID=ON
+          -DQUDA_MULTIGRID_NVEC_LIST=24
+          -DQUDA_MDW_FUSED_LS_LIST=4
           -DQUDA_MPI=ON -DMPI_CXX_SKIP_MPICXX=ON 
           -DQUDA_PRECISION=10 -DQUDA_FAST_COMPILE_DSLASH=ON -DQUDA_FAST_COMPILE_REDUCE=ON
           -GNinja
           -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
 
       - name: Build
-        run: cmake --build ${{github.workspace}}/build -v
+        run: cmake --build ${{github.workspace}}/build
 
       - name: Install
         run: cmake --install ${{github.workspace}}/build 

From 25ee0ac18abed7b98704ecd91696a93acf25b7e8 Mon Sep 17 00:00:00 2001
From: Mathias Wagner <mathiasw@nvidia.com>
Date: Fri, 21 Jul 2023 10:41:25 +0200
Subject: [PATCH 5/9] bump CPM version

---
 CMakeLists.txt  | 13 +++++--------
 cmake/CPM.cmake | 16 ++++++++++++++--
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dcde282f73..51e2d0f6ba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -419,14 +419,11 @@ if(QUDA_DOWNLOAD_EIGEN)
       CACHE STRING "Eigen use for QUDA_DOWNLOAD_EIGEN")
   mark_as_advanced(QUDA_EIGEN_VERSION)
   CPMAddPackage(
-    NAME
-    Eigen
-    VERSION
-    ${QUDA_EIGEN_VERSION}
-    URL
-    https://gitlab.com/libeigen/eigen/-/archive/${QUDA_EIGEN_VERSION}/eigen-${QUDA_EIGEN_VERSION}.tar.bz2
-    DOWNLOAD_ONLY
-    YES)
+    NAME Eigen
+    VERSION ${QUDA_EIGEN_VERSION}
+    URL https://gitlab.com/libeigen/eigen/-/archive/${QUDA_EIGEN_VERSION}/eigen-${QUDA_EIGEN_VERSION}.tar.bz2
+    DOWNLOAD_ONLY YES
+    SYSTEM YES)
   target_include_directories(Eigen SYSTEM INTERFACE ${Eigen_SOURCE_DIR})
   install(DIRECTORY ${Eigen_SOURCE_DIR}/Eigen TYPE INCLUDE)
 else()
diff --git a/cmake/CPM.cmake b/cmake/CPM.cmake
index 772103fc33..f49d743412 100644
--- a/cmake/CPM.cmake
+++ b/cmake/CPM.cmake
@@ -1,4 +1,4 @@
-set(CPM_DOWNLOAD_VERSION 0.36.0)
+set(CPM_DOWNLOAD_VERSION 0.38.2)
 
 if(CPM_SOURCE_CACHE)
   set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
@@ -10,12 +10,24 @@ endif()
 
 # Expand relative path. This is important if the provided path contains a tilde (~)
 get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)
-if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
+
+function(download_cpm)
   message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
   file(DOWNLOAD
        https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
        ${CPM_DOWNLOAD_LOCATION}
   )
+endfunction()
+
+if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
+  download_cpm()
+else()
+  # download again if a previous attempt left an empty file behind
+  file(READ ${CPM_DOWNLOAD_LOCATION} check)
+  if("${check}" STREQUAL "")
+    download_cpm()
+  endif()
+  unset(check)
 endif()
 
 include(${CPM_DOWNLOAD_LOCATION})

From 0ac01fa313a400745f7760ee2807259fb072c0b7 Mon Sep 17 00:00:00 2001
From: Mathias Wagner <mathiasw@nvidia.com>
Date: Tue, 25 Jul 2023 20:23:42 +0200
Subject: [PATCH 6/9] fix for clang warning options

---
 lib/targets/cuda/target_cuda.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index ec4b87a7ae..82dd41ddae 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -259,8 +259,8 @@ target_compile_options(
           $<$<CXX_COMPILER_ID:Clang>:
           -Xcompiler=-Wno-unused-function
           -Xcompiler=-Wno-unknown-pragmas
-          -Xcompiler=-Wno-error=pass-failed
-          -Xcompiler=-Wno-error=unneeded-internal-declaration>
+          -Xcompiler=-Wno-pass-failed
+          -Xcompiler=-Wno-unneeded-internal-declaration>
           $<$<CXX_COMPILER_ID:GNU>:
           -Xcompiler=-Wno-unknown-pragmas>
           $<$<CONFIG:DEVEL>:-Xptxas

From 23e2d8891c1d7e8e4117a23513d7c7e9efefa74a Mon Sep 17 00:00:00 2001
From: Mathias Wagner <mathiasw@nvidia.com>
Date: Wed, 26 Jul 2023 16:32:45 +0200
Subject: [PATCH 7/9] Revert "fix for clang warning options"

This reverts commit 0ac01fa313a400745f7760ee2807259fb072c0b7.
---
 lib/targets/cuda/target_cuda.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index 82dd41ddae..ec4b87a7ae 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -259,8 +259,8 @@ target_compile_options(
           $<$<CXX_COMPILER_ID:Clang>:
           -Xcompiler=-Wno-unused-function
           -Xcompiler=-Wno-unknown-pragmas
-          -Xcompiler=-Wno-pass-failed
-          -Xcompiler=-Wno-unneeded-internal-declaration>
+          -Xcompiler=-Wno-error=pass-failed
+          -Xcompiler=-Wno-error=unneeded-internal-declaration>
           $<$<CXX_COMPILER_ID:GNU>:
           -Xcompiler=-Wno-unknown-pragmas>
           $<$<CONFIG:DEVEL>:-Xptxas

From 2406c188dbc0dab0224c9414cf0add7367c9944d Mon Sep 17 00:00:00 2001
From: Mathias Wagner <mathiasw@nvidia.com>
Date: Wed, 26 Jul 2023 16:34:59 +0200
Subject: [PATCH 8/9] remove check_language(CUDA) as this hits
 https://gitlab.kitware.com/cmake/cmake/-/issues/25093

---
 lib/targets/cuda/target_cuda.cmake | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index ec4b87a7ae..605a70e073 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -3,8 +3,6 @@
 set(CMAKE_CUDA_EXTENSIONS OFF)
 
 find_package(CUDAToolkit REQUIRED)
-include(CheckLanguage)
-check_language(CUDA)
 
 set(QUDA_TARGET_CUDA ON)
 

From 929be0369af66674254875bdf7e541e6fbb2022d Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Sun, 13 Aug 2023 22:17:51 -0700
Subject: [PATCH 9/9] Fix code so we can remove
 -Wno-error=unneeded-internal-declaration when using clang.  Set loop unroll
 count for host code to 4 when using clang (avoids need for
 -Wno-error=pass-failed)

---
 lib/coarse_op.cuh                  |  9 ++++-----
 lib/dslash_coarse.hpp              | 26 +++++++++-----------------
 lib/targets/cuda/target_cuda.cmake |  4 ++--
 3 files changed, 15 insertions(+), 24 deletions(-)

diff --git a/lib/coarse_op.cuh b/lib/coarse_op.cuh
index ff14edaee8..4c23f50781 100644
--- a/lib/coarse_op.cuh
+++ b/lib/coarse_op.cuh
@@ -7,11 +7,6 @@
 
 namespace quda {
 
-  // For coarsening un-preconditioned operators we use uni-directional
-  // coarsening to reduce the set up code.  For debugging we can force
-  // bi-directional coarsening.
-  static bool bidirectional_debug = false;
-
   enum ComputeType {
     COMPUTE_UV,
     COMPUTE_LV,
@@ -983,6 +978,10 @@ namespace quda {
                   double mu_factor, bool allow_truncation, QudaDiracType dirac, QudaMatPCType matpc, bool need_bidirectional,
                   const int *fine_to_coarse, const int *coarse_to_fine)
   {
+    // For coarsening un-preconditioned operators we use uni-directional
+    // coarsening to reduce the set up code.  For debugging we can force
+    // bi-directional coarsening.
+    static bool bidirectional_debug = false;
 
     // sanity checks
     if (matpc == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc == QUDA_MATPC_ODD_ODD_ASYMMETRIC)
diff --git a/lib/dslash_coarse.hpp b/lib/dslash_coarse.hpp
index f696dac20e..ebb8c03a05 100644
--- a/lib/dslash_coarse.hpp
+++ b/lib/dslash_coarse.hpp
@@ -660,25 +660,17 @@ namespace quda {
     }
   };
 
-  static bool dslash_init = false;
-  static std::vector<DslashCoarsePolicy> policies(static_cast<int>(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED), DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED);
-  static int first_active_policy=static_cast<int>(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED);
-
-  // string used as a tunekey to ensure we retune if the dslash policy env changes
-  static char policy_string[TuneKey::aux_n];
-
-  static inline void enable_policy(DslashCoarsePolicy p) { policies[static_cast<std::size_t>(p)] = p; }
-
-#if 0
-  static inline void disable_policy(DslashCoarsePolicy p)
-  {
-    policies[static_cast<std::size_t>(p)] = DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED;
-  }
-#endif
-
   template <typename Launch>
   class DslashCoarsePolicyTune : public Tunable {
 
+    static inline bool dslash_init = false;
+    static inline int first_active_policy = static_cast<int>(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED);
+    // string used as a tunekey to ensure we retune if the dslash policy env changes
+    static inline char policy_string[TuneKey::aux_n] = {};
+    static inline std::vector<DslashCoarsePolicy> policies = {static_cast<int>(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED), DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED};
+
+    static void enable_policy(DslashCoarsePolicy p) { policies[static_cast<std::size_t>(p)] = p; }
+
    Launch &dslash;
 
    bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
@@ -686,7 +678,7 @@ namespace quda {
    static constexpr bool enable_coarse_shmem_overlap = Launch::enable_coarse_shmem_overlap();
 
  public:
-   inline DslashCoarsePolicyTune(Launch &dslash) : dslash(dslash)
+   DslashCoarsePolicyTune(Launch &dslash) : dslash(dslash)
    {
       if (!dslash_init) {
 
diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index 605a70e073..576cc5c5d8 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -257,8 +257,8 @@ target_compile_options(
           $<$<CXX_COMPILER_ID:Clang>:
           -Xcompiler=-Wno-unused-function
           -Xcompiler=-Wno-unknown-pragmas
-          -Xcompiler=-Wno-error=pass-failed
-          -Xcompiler=-Wno-error=unneeded-internal-declaration>
+          -Xcompiler=-mllvm\ -unroll-count=4
+          >
           $<$<CXX_COMPILER_ID:GNU>:
           -Xcompiler=-Wno-unknown-pragmas>
           $<$<CONFIG:DEVEL>:-Xptxas