Skip to content

Commit

Permalink
Merge branch 'develop' of github.com:lattice/quda into feature/gaugefield_unity
Browse files Browse the repository at this point in the history
  • Loading branch information
maddyscientist committed Aug 18, 2023
2 parents 50987b1 + b869d61 commit a92296b
Show file tree
Hide file tree
Showing 26 changed files with 323 additions and 155 deletions.
64 changes: 64 additions & 0 deletions .github/workflows/cuda_githubactions_build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# CI workflow: compile QUDA with the CUDA 12.1 toolkit on GitHub-hosted runners.
# Builds (but does not run, since runners have no GPU) with both GCC and Clang
# host compilers, using ccache to keep repeat build times down.
name: cuda_ghactions_build

on:
  pull_request:
    branches: [ "develop" ]

defaults:
  run:
    shell: bash

env:
  # STRICT maps to the project's CMAKE_*_FLAGS_STRICT configuration.
  BUILD_TYPE: STRICT
  # Hash compiler binaries by content so ccache invalidates on toolchain upgrades.
  CCACHE_COMPILERCHECK: content

jobs:
  build:
    strategy:
      matrix:
        compiler: [g++-12, clang++-14]
    runs-on: ubuntu-latest

    steps:
      - name: Install software
        run: |
          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
          sudo dpkg -i cuda-keyring_1.0-1_all.deb
          sudo apt-get update -y
          sudo apt-get install -y --no-install-recommends ninja-build cmake libopenmpi-dev gfortran

      # Cache the (large) CUDA apt packages between runs.
      # NOTE(review): prefer pinning a release tag over @latest for reproducibility.
      - uses: awalsh128/cache-apt-pkgs-action@latest
        with:
          packages: cuda-compiler-12-1 cuda-libraries-dev-12-1 cuda-nvml-dev-12-1
          execute_install_scripts: true

      - uses: actions/checkout@v3

      - name: Ccache for gh actions
        uses: hendrikmuhs/ccache-action@v1.2.9
        with:
          # Separate caches per job and per host compiler.
          key: ${{ github.job }}-${{ matrix.compiler }}
          max-size: 2000M

      - name: Configure CMake
        # JITify + a virtual-only sm_80 arch and the FAST_COMPILE options keep
        # the compile footprint small enough for hosted runners.
        run: >
          cmake
          -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.1/bin/nvcc
          -DCMAKE_CXX_COMPILER=${{matrix.compiler}}
          -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache
          -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
          -DQUDA_GPU_ARCH=sm_80 -DQUDA_GPU_ARCH_SUFFIX=virtual -DQUDA_JITIFY=ON
          -DQUDA_MULTIGRID=ON
          -DQUDA_MULTIGRID_NVEC_LIST=24
          -DQUDA_MDW_FUSED_LS_LIST=4
          -DQUDA_MPI=ON -DMPI_CXX_SKIP_MPICXX=ON
          -DQUDA_PRECISION=10 -DQUDA_FAST_COMPILE_DSLASH=ON -DQUDA_FAST_COMPILE_REDUCE=ON
          -GNinja
          -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}

      - name: Build
        run: cmake --build ${{github.workspace}}/build

      - name: Install
        run: cmake --install ${{github.workspace}}/build

17 changes: 7 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ set(CMAKE_CXX_FLAGS_DEVEL
"-g -O3"
CACHE STRING "Flags used by the C++ compiler during regular development builds.")
set(CMAKE_CXX_FLAGS_STRICT
"-O3"
"-Os"
CACHE STRING "Flags used by the C++ compiler during strict jenkins builds.")
set(CMAKE_CXX_FLAGS_RELEASE
"-O3 ${CXX_OPT}"
Expand All @@ -361,7 +361,7 @@ set(CMAKE_C_FLAGS_DEVEL
"-g -O3"
CACHE STRING "Flags used by the C compiler during regular development builds.")
set(CMAKE_C_FLAGS_STRICT
"-O3"
"-Os"
CACHE STRING "Flags used by the C compiler during strict jenkins builds.")
set(CMAKE_C_FLAGS_RELEASE
"-O3"
Expand Down Expand Up @@ -419,14 +419,11 @@ if(QUDA_DOWNLOAD_EIGEN)
CACHE STRING "Eigen use for QUDA_DOWNLOAD_EIGEN")
mark_as_advanced(QUDA_EIGEN_VERSION)
CPMAddPackage(
NAME
Eigen
VERSION
${QUDA_EIGEN_VERSION}
URL
https://gitlab.com/libeigen/eigen/-/archive/${QUDA_EIGEN_VERSION}/eigen-${QUDA_EIGEN_VERSION}.tar.bz2
DOWNLOAD_ONLY
YES)
NAME Eigen
VERSION ${QUDA_EIGEN_VERSION}
URL https://gitlab.com/libeigen/eigen/-/archive/${QUDA_EIGEN_VERSION}/eigen-${QUDA_EIGEN_VERSION}.tar.bz2
DOWNLOAD_ONLY YES
SYSTEM YES)
target_include_directories(Eigen SYSTEM INTERFACE ${Eigen_SOURCE_DIR})
install(DIRECTORY ${Eigen_SOURCE_DIR}/Eigen TYPE INCLUDE)
else()
Expand Down
16 changes: 14 additions & 2 deletions cmake/CPM.cmake
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
set(CPM_DOWNLOAD_VERSION 0.36.0)
set(CPM_DOWNLOAD_VERSION 0.38.2)

if(CPM_SOURCE_CACHE)
set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
Expand All @@ -10,12 +10,24 @@ endif()

# Expand relative path. This is important if the provided path contains a tilde (~)
get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)
if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))

# Fetch the pinned CPM.cmake release into CPM_DOWNLOAD_LOCATION.
# Reads CPM_DOWNLOAD_VERSION and CPM_DOWNLOAD_LOCATION from the enclosing scope.
# NOTE(review): the download host reads "github.com" here — upstream CPM lives on
# github.com; confirm this is intentional (likely a mirror/proxy substitution).
function(download_cpm)
message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
file(DOWNLOAD
https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
${CPM_DOWNLOAD_LOCATION}
)
endfunction()

# Download CPM.cmake if it is not cached yet; if a previous download was
# interrupted it can leave a zero-byte file behind, so detect that and
# re-download (a full re-fetch, not a byte-range resume).
if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
download_cpm()
else()
# An empty cached file means the earlier download failed — fetch again.
file(READ ${CPM_DOWNLOAD_LOCATION} check)
if("${check}" STREQUAL "")
download_cpm()
endif()
unset(check)
endif()

include(${CPM_DOWNLOAD_LOCATION})
5 changes: 5 additions & 0 deletions include/multigrid.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,9 @@ namespace quda {
/** Filename for where to load/store the null space */
char filename[100];

/** Whether to save in partfile format (true) or singlefile (false) */
bool mg_vec_partfile;

/** Whether or not this is a staggered solve or not */
QudaTransferType transfer_type;

Expand Down Expand Up @@ -193,6 +196,7 @@ namespace quda {
smoother_solve_type(param.smoother_solve_type[level]),
location(param.location[level]),
setup_location(param.setup_location[level]),
mg_vec_partfile(param.mg_vec_partfile[level]),
transfer_type(param.transfer_type[level]),
setup_use_mma(param.setup_use_mma[level] == QUDA_BOOLEAN_TRUE),
dslash_use_mma(param.dslash_use_mma[level] == QUDA_BOOLEAN_TRUE)
Expand Down Expand Up @@ -230,6 +234,7 @@ namespace quda {
smoother_solve_type(param.mg_global.smoother_solve_type[level]),
location(param.mg_global.location[level]),
setup_location(param.mg_global.setup_location[level]),
mg_vec_partfile(param.mg_global.mg_vec_partfile[level]),
transfer_type(param.mg_global.transfer_type[level]),
setup_use_mma(param.mg_global.setup_use_mma[level] == QUDA_BOOLEAN_TRUE),
dslash_use_mma(param.mg_global.dslash_use_mma[level] == QUDA_BOOLEAN_TRUE)
Expand Down
9 changes: 5 additions & 4 deletions include/qio_field.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@ void read_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, c
void write_gauge_field(const char *filename, void *gauge[], QudaPrecision prec, const int *X, int argc, char *argv[]);
void read_spinor_field(const char *filename, void *V[], QudaPrecision precision, const int *X, QudaSiteSubset subset,
QudaParity parity, int nColor, int nSpin, int Nvec, int argc, char *argv[]);
void write_spinor_field(const char *filename, const void *V[], QudaPrecision precision, const int *X, QudaSiteSubset subset,
QudaParity parity, int nColor, int nSpin, int Nvec, int argc, char *argv[]);
void write_spinor_field(const char *filename, const void *V[], QudaPrecision precision, const int *X,
QudaSiteSubset subset, QudaParity parity, int nColor, int nSpin, int Nvec, int argc,
char *argv[], bool partfile = false);
#else
inline void read_gauge_field(const char *, void *[], QudaPrecision, const int *, int, char *[])
{
Expand All @@ -25,8 +26,8 @@ inline void read_spinor_field(const char *, void *[], QudaPrecision, const int *
printf("QIO support has not been enabled\n");
exit(-1);
}
inline void write_spinor_field(const char *, const void *[], QudaPrecision, const int *, QudaSiteSubset, QudaParity, int, int,
int, int, char *[])
inline void write_spinor_field(const char *, const void *[], QudaPrecision, const int *, QudaSiteSubset, QudaParity,
int, int, int, int, char *[], bool)
{
printf("QIO support has not been enabled\n");
exit(-1);
Expand Down
6 changes: 6 additions & 0 deletions include/quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,9 @@ extern "C" {
MILC I/O) */
QudaBoolean io_parity_inflate;

/** Whether to save eigenvectors in QIO singlefile or partfile format */
QudaBoolean partfile;

/** The Gflops rate of the eigensolver setup */
double gflops;

Expand Down Expand Up @@ -779,6 +782,9 @@ extern "C" {
/** Filename prefix for where to save the null-space vectors */
char vec_outfile[QUDA_MAX_MG_LEVEL][256];

/** Whether to store the null-space vectors in singlefile or partfile format */
QudaBoolean mg_vec_partfile[QUDA_MAX_MG_LEVEL];

/** Whether to use and initial guess during coarse grid deflation */
QudaBoolean coarse_guess;

Expand Down
4 changes: 3 additions & 1 deletion include/vector_io.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,17 @@ namespace quda
{
const std::string filename;
bool parity_inflate;
bool partfile;

public:
/**
Constructor for VectorIO class
@param[in] filename The filename associated with this IO object
@param[in] parity_inflate Whether to inflate single_parity
field to dual parity fields for I/O
@param[in] partfile Whether or not to save in partfiles (ignored on load)
*/
VectorIO(const std::string &filename, bool parity_inflate = false);
VectorIO(const std::string &filename, bool parity_inflate = false, bool partfile = false);

/**
@brief Load vectors from filename
Expand Down
8 changes: 8 additions & 0 deletions lib/check_params.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ void printQudaEigParam(QudaEigParam *param) {
P(extlib_type, QUDA_EIGEN_EXTLIB);
P(mem_type_ritz, QUDA_MEMORY_DEVICE);
P(ortho_block_size, 0);
P(partfile, QUDA_BOOLEAN_FALSE);
#else
P(use_eigen_qr, QUDA_BOOLEAN_INVALID);
P(use_poly_acc, QUDA_BOOLEAN_INVALID);
Expand Down Expand Up @@ -226,6 +227,7 @@ void printQudaEigParam(QudaEigParam *param) {
P(extlib_type, QUDA_EXTLIB_INVALID);
P(mem_type_ritz, QUDA_MEMORY_INVALID);
P(ortho_block_size, INVALID_INT);
P(partfile, QUDA_BOOLEAN_INVALID);
#endif

// only need to enfore block size checking if doing a block eigen solve
Expand Down Expand Up @@ -931,6 +933,12 @@ void printQudaMultigridParam(QudaMultigridParam *param) {
#else
P(setup_location[i], QUDA_INVALID_FIELD_LOCATION);
#endif

#ifdef INIT_PARAM
P(mg_vec_partfile[i], QUDA_BOOLEAN_FALSE);
#else
P(mg_vec_partfile[i], QUDA_BOOLEAN_INVALID);
#endif
}

#ifdef INIT_PARAM
Expand Down
9 changes: 4 additions & 5 deletions lib/coarse_op.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,6 @@

namespace quda {

// For coarsening un-preconditioned operators we use uni-directional
// coarsening to reduce the set up code. For debugging we can force
// bi-directional coarsening.
static bool bidirectional_debug = false;

enum ComputeType {
COMPUTE_UV,
COMPUTE_LV,
Expand Down Expand Up @@ -983,6 +978,10 @@ namespace quda {
double mu_factor, bool allow_truncation, QudaDiracType dirac, QudaMatPCType matpc, bool need_bidirectional,
const int *fine_to_coarse, const int *coarse_to_fine)
{
// For coarsening un-preconditioned operators we use uni-directional
// coarsening to reduce the set up code. For debugging we can force
// bi-directional coarsening.
static bool bidirectional_debug = false;

// sanity checks
if (matpc == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc == QUDA_MATPC_ODD_ODD_ASYMMETRIC)
Expand Down
24 changes: 9 additions & 15 deletions lib/dslash_coarse.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -660,31 +660,25 @@ namespace quda {
}
};

static bool dslash_init = false;
static std::vector<DslashCoarsePolicy> policies(static_cast<int>(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED), DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED);
static int first_active_policy=static_cast<int>(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED);

// string used as a tunekey to ensure we retune if the dslash policy env changes
static char policy_string[TuneKey::aux_n];

static inline void enable_policy(DslashCoarsePolicy p) { policies[static_cast<std::size_t>(p)] = p; }

static inline void disable_policy(DslashCoarsePolicy p)
{
policies[static_cast<std::size_t>(p)] = DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED;
}

template <typename Launch>
class DslashCoarsePolicyTune : public Tunable {

static inline bool dslash_init = false;
static inline int first_active_policy = static_cast<int>(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED);
// string used as a tunekey to ensure we retune if the dslash policy env changes
static inline char policy_string[TuneKey::aux_n] = {};
static inline std::vector<DslashCoarsePolicy> policies = {static_cast<int>(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED), DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED};

static void enable_policy(DslashCoarsePolicy p) { policies[static_cast<std::size_t>(p)] = p; }

Launch &dslash;

bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
bool tuneAuxDim() const { return true; } // Do tune the aux dimensions.
static constexpr bool enable_coarse_shmem_overlap = Launch::enable_coarse_shmem_overlap();

public:
inline DslashCoarsePolicyTune(Launch &dslash) : dslash(dslash)
DslashCoarsePolicyTune(Launch &dslash) : dslash(dslash)
{
if (!dslash_init) {

Expand Down
2 changes: 1 addition & 1 deletion lib/eigensolve_quda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ namespace quda
for (auto &k : kSpace) k.setSuggestedParity(mat_parity);

// save the vectors
VectorIO io(eig_param->vec_outfile, eig_param->io_parity_inflate == QUDA_BOOLEAN_TRUE);
VectorIO io(eig_param->vec_outfile, eig_param->io_parity_inflate == QUDA_BOOLEAN_TRUE, eig_param->partfile);
io.save(kSpace, save_prec, n_eig);
}

Expand Down
10 changes: 6 additions & 4 deletions lib/inv_cg_quda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -834,14 +834,16 @@ namespace quda {
blas::copy(x, xSloppy); // no op when these pointers alias
blas::xpy(x, y);
mat(r, y);
blas::copy(rSloppy, r); // no op when these pointers alias
blas::zero(xSloppy);

// Recompute the exact residual and heavy quark residual
r2 = blas::xmyNorm(b, r);
rNorm = sqrt(r2);
hq_res = sqrt(blas::HeavyQuarkResidualNorm(y, r).z);

// Copy and update fields
blas::copy(rSloppy, r); // no op when these pointers alias
blas::zero(xSloppy);

// Check and see if we're "done" with the L2 norm. This could be because
// we were already done with it, we never needed it, or the L2 norm has finally converged.
if (!L2breakdown && convergenceL2(r2, hq_res, stop, param.tol_hq)) L2breakdown = true;
Expand Down Expand Up @@ -941,13 +943,13 @@ namespace quda {
// we "reset" the solve in a different way.
if (heavy_quark_restart) {
// If we're in the HQ residual part of the solve, we just do a hard CG restart.
logQuda(QUDA_SUMMARIZE, "HQ restart == hard CG restart\n");
logQuda(QUDA_DEBUG_VERBOSE, "HQ restart == hard CG restart\n");
blas::copy(p, rSloppy);
heavy_quark_restart = false;
} else {
// If we're still in the L2 norm part of the solve, we explicitly restore
// the orthogonality of the gradient vector, recompute beta, update `p`, and carry on with our lives.
logQuda(QUDA_SUMMARIZE, "Regular restart == explicit gradient vector re-orthogonalization\n");
logQuda(QUDA_DEBUG_VERBOSE, "Regular restart == explicit gradient vector re-orthogonalization\n");
Complex rp = blas::cDotProduct(rSloppy, p) / (r2);
blas::caxpy(-rp, rSloppy, p);

Expand Down
Loading

0 comments on commit a92296b

Please sign in to comment.