From 3a9db3361cf9e567ec7bc2b393e01ccdac40b1a5 Mon Sep 17 00:00:00 2001 From: Mahias Wagner Date: Wed, 19 Jul 2023 16:22:06 +0200 Subject: [PATCH 1/9] add github actions build for CUDA --- .../workflows/cuda_githubactions_build.yml | 63 +++++++++++++++++++ CMakeLists.txt | 4 +- lib/dslash_coarse.hpp | 2 + lib/targets/cuda/malloc.cpp | 1 + lib/targets/cuda/target_cuda.cmake | 5 +- 5 files changed, 72 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/cuda_githubactions_build.yml diff --git a/.github/workflows/cuda_githubactions_build.yml b/.github/workflows/cuda_githubactions_build.yml new file mode 100644 index 0000000000..9cd10f6500 --- /dev/null +++ b/.github/workflows/cuda_githubactions_build.yml @@ -0,0 +1,63 @@ +name: CUDA_GithubActions_Build + +on: + pull_request: + branches: [ "develop" ] + +defaults: + run: + shell: bash + +env: + BUILD_TYPE: STRICT + CCACHE_COMPILERCHECK: content + +jobs: + build: + strategy: + matrix: + compiler: [g++-12, clang++-14] + runs-on: ubuntu-latest + + steps: + - name: Install software + run: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb + sudo dpkg -i cuda-keyring_1.0-1_all.deb + sudo apt-get update -y + sudo apt-get install -y --no-install-recommends ninja-build cmake libopenmpi-dev gfortran + + - uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: cuda-compiler-12-1 cuda-libraries-dev-12-1 cuda-nvml-dev-12-1 + execute_install_scripts: true + + - uses: actions/checkout@v3 + + - name: Ccache for gh actions + uses: hendrikmuhs/ccache-action@v1.2.9 + with: + key: ${{ github.job }}-${{ matrix.compiler }} + max-size: 2000M + + - name: Configure CMake + run: > + cmake + -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.1/bin/nvcc + -DCMAKE_CXX_COMPILER=${{matrix.compiler}} + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + -DQUDA_GPU_ARCH=sm_80 -DQUDA_GPU_ARCH_SUFFIX=virtual + -DQUDA_MULTIGRID=ON + -DQUDA_JITIFY=ON + -DQUDA_MPI=ON -DMPI_CXX_SKIP_MPICXX=ON + -DQUDA_PRECISION=10 -DQUDA_FAST_COMPILE_DSLASH=ON -DQUDA_FAST_COMPILE_REDUCE=ON + -GNinja + -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} + + - name: Build + run: cmake --build ${{github.workspace}}/build -v + + - name: Install + run: cmake --install ${{github.workspace}}/build + diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e14963618..dcde282f73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -339,7 +339,7 @@ set(CMAKE_CXX_FLAGS_DEVEL "-g -O3" CACHE STRING "Flags used by the C++ compiler during regular development builds.") set(CMAKE_CXX_FLAGS_STRICT - "-O3" + "-Os" CACHE STRING "Flags used by the C++ compiler during strict jenkins builds.") set(CMAKE_CXX_FLAGS_RELEASE "-O3 ${CXX_OPT}" @@ -361,7 +361,7 @@ set(CMAKE_C_FLAGS_DEVEL "-g -O3" CACHE STRING "Flags used by the C compiler during regular development builds.") set(CMAKE_C_FLAGS_STRICT - "-O3" + "-Os" CACHE STRING "Flags used by the C compiler during strict jenkins builds.") set(CMAKE_C_FLAGS_RELEASE "-O3" diff --git a/lib/dslash_coarse.hpp b/lib/dslash_coarse.hpp index 6e46d953fe..f696dac20e 100644 --- a/lib/dslash_coarse.hpp +++ b/lib/dslash_coarse.hpp @@ -669,10 +669,12 @@ namespace quda { static inline void enable_policy(DslashCoarsePolicy p) { policies[static_cast(p)] = p; } +#if 0 static inline void disable_policy(DslashCoarsePolicy p) { policies[static_cast(p)] = DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED; } +#endif template class DslashCoarsePolicyTune : public Tunable { diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp index 7988dc6479..bd18d0f1d4 100644 --- a/lib/targets/cuda/malloc.cpp +++ b/lib/targets/cuda/malloc.cpp @@ -482,6 +482,7 @@ namespace quda void host_free_(const char *func, const char *file, int line, void *ptr) { if (!ptr) { errorQuda("Attempt to free NULL host pointer (%s:%d in %s())\n", file, line, func); } + track_free(MAPPED, ptr); if (alloc[HOST].count(ptr)) { track_free(HOST, ptr); free(ptr); diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index a62e971420..ec4b87a7ae 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -242,6 +242,7 @@ target_include_directories(quda PRIVATE ${CMAKE_SOURCE_DIR}/include/targets/cuda target_include_directories(quda PUBLIC $ $) target_include_directories(quda SYSTEM PRIVATE ${CMAKE_SOURCE_DIR}/include/targets/cuda/externals) +target_include_directories(quda_cpp SYSTEM PRIVATE ${CMAKE_SOURCE_DIR}/include/targets/cuda/externals) # Specific config dependent warning suppressions and lineinfo forwarding @@ -257,7 +258,9 @@ target_compile_options( -Wreorder $<$: -Xcompiler=-Wno-unused-function - -Xcompiler=-Wno-unknown-pragmas> + -Xcompiler=-Wno-unknown-pragmas + -Xcompiler=-Wno-error=pass-failed + -Xcompiler=-Wno-error=unneeded-internal-declaration> $<$: -Xcompiler=-Wno-unknown-pragmas> $<$:-Xptxas From e8ab6007a82a20e684d44f5f525b843800fa94e4 Mon Sep 17 00:00:00 2001 From: Mathias Wagner Date: Thu, 20 Jul 2023 09:59:36 +0200 Subject: [PATCH 2/9] remove track_free it might not be necessary here --- lib/targets/cuda/malloc.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp index bd18d0f1d4..14dbe30ce0 100644 --- a/lib/targets/cuda/malloc.cpp +++ b/lib/targets/cuda/malloc.cpp @@ -240,7 +240,7 @@ namespace quda #endif if (is_prefetch_enabled()) qudaMemPrefetchAsync(ptr, size, QUDA_CUDA_FIELD_LOCATION, device::get_default_stream()); - track_malloc(DEVICE, a, ptr); + _malloc(DEVICE, a, ptr); #ifdef HOST_DEBUG cudaMemset(ptr, 0xff, size); #endif @@ -267,7 +267,7 @@ namespace quda if (err != CUDA_SUCCESS) { errorQuda("Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func); } - track_malloc(DEVICE_PINNED, a, ptr); + _malloc(DEVICE_PINNED, a, ptr); #ifdef HOST_DEBUG cudaMemset(ptr, 0xff, size); #endif @@ -286,7 +286,7 @@ namespace quda void *ptr = malloc(size); if (!ptr) { errorQuda("Failed to allocate host memory of size %zu (%s:%d in %s())\n", size, file, line, func); } - track_malloc(HOST, a, ptr); + _malloc(HOST, a, ptr); #ifdef HOST_DEBUG memset(ptr, 0xff, size); #endif @@ -311,7 +311,7 @@ namespace quda if (err != cudaSuccess) { errorQuda("Failed to register pinned memory of size %zu (%s:%d in %s())\n", size, file, line, func); } - track_malloc(PINNED, a, ptr); + _malloc(PINNED, a, ptr); #ifdef HOST_DEBUG memset(ptr, 0xff, a.base_size); #endif @@ -482,7 +482,6 @@ namespace quda void host_free_(const char *func, const char *file, int line, void *ptr) { if (!ptr) { errorQuda("Attempt to free NULL host pointer (%s:%d in %s())\n", file, line, func); } - track_free(MAPPED, ptr); if (alloc[HOST].count(ptr)) { track_free(HOST, ptr); free(ptr); From 96de83a9d831a705bf9854f0ae117afe6dec73b3 Mon Sep 17 00:00:00 2001 From: Mathias Wagner Date: Thu, 20 Jul 2023 10:06:37 +0200 Subject: [PATCH 3/9] Update malloc.cpp --- lib/targets/cuda/malloc.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp index 14dbe30ce0..7988dc6479 100644 --- a/lib/targets/cuda/malloc.cpp +++ b/lib/targets/cuda/malloc.cpp @@ -240,7 +240,7 @@ namespace quda #endif if (is_prefetch_enabled()) qudaMemPrefetchAsync(ptr, size, QUDA_CUDA_FIELD_LOCATION, device::get_default_stream()); - _malloc(DEVICE, a, ptr); + track_malloc(DEVICE, a, ptr); #ifdef HOST_DEBUG cudaMemset(ptr, 0xff, size); #endif @@ -267,7 +267,7 @@ namespace quda if (err != CUDA_SUCCESS) { errorQuda("Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func); } - _malloc(DEVICE_PINNED, a, ptr); + track_malloc(DEVICE_PINNED, a, ptr); #ifdef HOST_DEBUG cudaMemset(ptr, 0xff, size); #endif @@ -286,7 +286,7 @@ namespace quda void *ptr = malloc(size); if (!ptr) { errorQuda("Failed to allocate host memory of size %zu (%s:%d in %s())\n", size, file, line, func); } - _malloc(HOST, a, ptr); + track_malloc(HOST, a, ptr); #ifdef HOST_DEBUG memset(ptr, 0xff, size); #endif @@ -311,7 +311,7 @@ namespace quda if (err != cudaSuccess) { errorQuda("Failed to register pinned memory of size %zu (%s:%d in %s())\n", size, file, line, func); } - _malloc(PINNED, a, ptr); + track_malloc(PINNED, a, ptr); #ifdef HOST_DEBUG memset(ptr, 0xff, a.base_size); #endif From 2b147dc644e518c128c33056b01b58856179fc3b Mon Sep 17 00:00:00 2001 From: Mahias Wagner Date: Thu, 20 Jul 2023 10:29:27 +0200 Subject: [PATCH 4/9] further reduce build --- .github/workflows/cuda_githubactions_build.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cuda_githubactions_build.yml b/.github/workflows/cuda_githubactions_build.yml index 9cd10f6500..8248e5242e 100644 --- a/.github/workflows/cuda_githubactions_build.yml +++ b/.github/workflows/cuda_githubactions_build.yml @@ -1,4 +1,4 @@ -name: CUDA_GithubActions_Build +name: cuda_ghactions_build on: pull_request: @@ -47,16 +47,17 @@ jobs: -DCMAKE_CXX_COMPILER=${{matrix.compiler}} -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - -DQUDA_GPU_ARCH=sm_80 -DQUDA_GPU_ARCH_SUFFIX=virtual - -DQUDA_MULTIGRID=ON - -DQUDA_JITIFY=ON + -DQUDA_GPU_ARCH=sm_80 -DQUDA_GPU_ARCH_SUFFIX=virtual -DQUDA_JITIFY=ON + -DQUDA_MULTIGRID=ON + -DQUDA_MULTIGRID_NVEC_LIST=24 + -DQUDA_MDW_FUSED_LS_LIST=4 -DQUDA_MPI=ON -DMPI_CXX_SKIP_MPICXX=ON -DQUDA_PRECISION=10 -DQUDA_FAST_COMPILE_DSLASH=ON -DQUDA_FAST_COMPILE_REDUCE=ON -GNinja -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} - name: Build - run: cmake --build ${{github.workspace}}/build -v + run: cmake --build ${{github.workspace}}/build - name: Install run: cmake --install ${{github.workspace}}/build From 25ee0ac18abed7b98704ecd91696a93acf25b7e8 Mon Sep 17 00:00:00 2001 From: Mahias Wagner Date: Fri, 21 Jul 2023 10:41:25 +0200 Subject: [PATCH 5/9] bump CPM version --- CMakeLists.txt | 13 +++++-------- cmake/CPM.cmake | 16 ++++++++++++++-- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dcde282f73..51e2d0f6ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -419,14 +419,11 @@ if(QUDA_DOWNLOAD_EIGEN) CACHE STRING "Eigen use for QUDA_DOWNLOAD_EIGEN") mark_as_advanced(QUDA_EIGEN_VERSION) CPMAddPackage( - NAME - Eigen - VERSION - ${QUDA_EIGEN_VERSION} - URL - https://gitlab.com/libeigen/eigen/-/archive/${QUDA_EIGEN_VERSION}/eigen-${QUDA_EIGEN_VERSION}.tar.bz2 - DOWNLOAD_ONLY - YES) + NAME Eigen + VERSION ${QUDA_EIGEN_VERSION} + URL https://gitlab.com/libeigen/eigen/-/archive/${QUDA_EIGEN_VERSION}/eigen-${QUDA_EIGEN_VERSION}.tar.bz2 + DOWNLOAD_ONLY YES + SYSTEM YES) target_include_directories(Eigen SYSTEM INTERFACE ${Eigen_SOURCE_DIR}) install(DIRECTORY ${Eigen_SOURCE_DIR}/Eigen TYPE INCLUDE) else() diff --git a/cmake/CPM.cmake b/cmake/CPM.cmake index 772103fc33..f49d743412 100644 --- a/cmake/CPM.cmake +++ b/cmake/CPM.cmake @@ -1,4 +1,4 @@ -set(CPM_DOWNLOAD_VERSION 0.36.0) +set(CPM_DOWNLOAD_VERSION 0.38.2) if(CPM_SOURCE_CACHE) set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") @@ -10,12 +10,24 @@ endif() # Expand relative path. This is important if the provided path contains a tilde (~) get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE) -if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION})) + +function(download_cpm) message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}") file(DOWNLOAD https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake ${CPM_DOWNLOAD_LOCATION} ) +endfunction() + +if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION})) + download_cpm() +else() + # resume download if it previously failed + file(READ ${CPM_DOWNLOAD_LOCATION} check) + if("${check}" STREQUAL "") + download_cpm() + endif() + unset(check) endif() include(${CPM_DOWNLOAD_LOCATION}) From 0ac01fa313a400745f7760ee2807259fb072c0b7 Mon Sep 17 00:00:00 2001 From: Mahias Wagner Date: Tue, 25 Jul 2023 20:23:42 +0200 Subject: [PATCH 6/9] fix for clang warning options --- lib/targets/cuda/target_cuda.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index ec4b87a7ae..82dd41ddae 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -259,8 +259,8 @@ target_compile_options( $<$: -Xcompiler=-Wno-unused-function -Xcompiler=-Wno-unknown-pragmas - -Xcompiler=-Wno-error=pass-failed - -Xcompiler=-Wno-error=unneeded-internal-declaration> + -Xcompiler=-Wno-pass-failed + -Xcompiler=-Wno-unneeded-internal-declaration> $<$: -Xcompiler=-Wno-unknown-pragmas> $<$:-Xptxas From 23e2d8891c1d7e8e4117a23513d7c7e9efefa74a Mon Sep 17 00:00:00 2001 From: Mahias Wagner Date: Wed, 26 Jul 2023 16:32:45 +0200 Subject: [PATCH 7/9] Revert "fix for clang warning options" This reverts commit 0ac01fa313a400745f7760ee2807259fb072c0b7. --- lib/targets/cuda/target_cuda.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index 82dd41ddae..ec4b87a7ae 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -259,8 +259,8 @@ target_compile_options( $<$: -Xcompiler=-Wno-unused-function -Xcompiler=-Wno-unknown-pragmas - -Xcompiler=-Wno-pass-failed - -Xcompiler=-Wno-unneeded-internal-declaration> + -Xcompiler=-Wno-error=pass-failed + -Xcompiler=-Wno-error=unneeded-internal-declaration> $<$: -Xcompiler=-Wno-unknown-pragmas> $<$:-Xptxas From 2406c188dbc0dab0224c9414cf0add7367c9944d Mon Sep 17 00:00:00 2001 From: Mahias Wagner Date: Wed, 26 Jul 2023 16:34:59 +0200 Subject: [PATCH 8/9] remove check_language(CUDA) as this hits https://gitlab.kitware.com/cmake/cmake/-/issues/25093 --- lib/targets/cuda/target_cuda.cmake | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index ec4b87a7ae..605a70e073 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -3,8 +3,6 @@ set(CMAKE_CUDA_EXTENSIONS OFF) find_package(CUDAToolkit REQUIRED) -include(CheckLanguage) -check_language(CUDA) set(QUDA_TARGET_CUDA ON) From 929be0369af66674254875bdf7e541e6fbb2022d Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Sun, 13 Aug 2023 22:17:51 -0700 Subject: [PATCH 9/9] Fix code so we can remove -Wno-error=unneeded-internal-declaration when using clang. Set loop unroll count for host code to 4 when using clang (avoids need for -Wno-error=pass-failed) --- lib/coarse_op.cuh | 9 ++++----- lib/dslash_coarse.hpp | 26 +++++++++----------------- lib/targets/cuda/target_cuda.cmake | 4 ++-- 3 files changed, 15 insertions(+), 24 deletions(-) diff --git a/lib/coarse_op.cuh b/lib/coarse_op.cuh index ff14edaee8..4c23f50781 100644 --- a/lib/coarse_op.cuh +++ b/lib/coarse_op.cuh @@ -7,11 +7,6 @@ namespace quda { - // For coarsening un-preconditioned operators we use uni-directional - // coarsening to reduce the set up code. For debugging we can force - // bi-directional coarsening. - static bool bidirectional_debug = false; - enum ComputeType { COMPUTE_UV, COMPUTE_LV, @@ -983,6 +978,10 @@ namespace quda { double mu_factor, bool allow_truncation, QudaDiracType dirac, QudaMatPCType matpc, bool need_bidirectional, const int *fine_to_coarse, const int *coarse_to_fine) { + // For coarsening un-preconditioned operators we use uni-directional + // coarsening to reduce the set up code. For debugging we can force + // bi-directional coarsening. + static bool bidirectional_debug = false; // sanity checks if (matpc == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc == QUDA_MATPC_ODD_ODD_ASYMMETRIC) diff --git a/lib/dslash_coarse.hpp b/lib/dslash_coarse.hpp index f696dac20e..ebb8c03a05 100644 --- a/lib/dslash_coarse.hpp +++ b/lib/dslash_coarse.hpp @@ -660,25 +660,17 @@ namespace quda { } }; - static bool dslash_init = false; - static std::vector policies(static_cast(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED), DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED); - static int first_active_policy=static_cast(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED); - - // string used as a tunekey to ensure we retune if the dslash policy env changes - static char policy_string[TuneKey::aux_n]; - - static inline void enable_policy(DslashCoarsePolicy p) { policies[static_cast(p)] = p; } - -#if 0 - static inline void disable_policy(DslashCoarsePolicy p) - { - policies[static_cast(p)] = DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED; - } -#endif - template class DslashCoarsePolicyTune : public Tunable { + static inline bool dslash_init = false; + static inline int first_active_policy = static_cast(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED); + // string used as a tunekey to ensure we retune if the dslash policy env changes + static inline char policy_string[TuneKey::aux_n] = {}; + static inline std::vector policies = {static_cast(DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED), DslashCoarsePolicy::DSLASH_COARSE_POLICY_DISABLED}; + + static void enable_policy(DslashCoarsePolicy p) { policies[static_cast(p)] = p; } + Launch &dslash; bool tuneGridDim() const { return false; } // Don't tune the grid dimensions. @@ -686,7 +678,7 @@ namespace quda { static constexpr bool enable_coarse_shmem_overlap = Launch::enable_coarse_shmem_overlap(); public: - inline DslashCoarsePolicyTune(Launch &dslash) : dslash(dslash) + DslashCoarsePolicyTune(Launch &dslash) : dslash(dslash) { if (!dslash_init) { diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index 605a70e073..576cc5c5d8 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -257,8 +257,8 @@ target_compile_options( $<$: -Xcompiler=-Wno-unused-function -Xcompiler=-Wno-unknown-pragmas - -Xcompiler=-Wno-error=pass-failed - -Xcompiler=-Wno-error=unneeded-internal-declaration> + -Xcompiler=-mllvm\ -unroll-count=4 + > $<$: -Xcompiler=-Wno-unknown-pragmas> $<$:-Xptxas