diff --git a/.ci/env/derecho.sh b/.ci/env/derecho.sh
new file mode 100755
index 0000000000..2feea5c9ee
--- /dev/null
+++ b/.ci/env/derecho.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+echo "Setting up derecho environment"
+workingDirectory=$PWD
+. /etc/profile.d/z00_modules.sh
+echo "Loading modules : $*"
+cmd="module purge"
+echo $cmd && eval "${cmd}"
+
+# The modules to load should be handed to us as arguments
+while [ $# -gt 0 ]; do
+  cmd="module load $1"
+  echo $cmd && eval "${cmd}"
+  shift
+done
+
+# Go back to the working directory in case the HPC configuration changed it out from under us
+if [ "$workingDirectory" != "$PWD" ]; then
+  echo "derecho module loading changed working directory"
+  echo "  Moving back to $workingDirectory"
+  cd $workingDirectory
+fi
diff --git a/.ci/env/helpers.sh b/.ci/env/helpers.sh
new file mode 100644
index 0000000000..2ddd560893
--- /dev/null
+++ b/.ci/env/helpers.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+
+# Useful string manipulation functions, kept here for posterity
+# https://stackoverflow.com/a/8811800
+# contains(string, substring)
+#
+# Returns 0 if the specified string contains the specified substring,
+# otherwise returns 1.
+contains()
+{
+  string="$1"
+  substring="$2"
+
+  if [ "${string#*"$substring"}" != "$string" ]; then
+    echo 0    # $substring is in $string
+  else
+    echo 1    # $substring is not in $string
+  fi
+}
+
+setenvStr()
+{
+  # Changing IFS produces the most consistent results
+  tmpIFS=$IFS
+  IFS=","
+  string="$1"
+  for s in $string; do
+    if [ ! -z "$s" ]; then
+      eval "echo export \"$s\""
+      eval "export \"$s\""
+    fi
+  done
+  IFS=$tmpIFS
+}
+
+banner()
+{
+  lengthBanner=$1
+  shift
+  # https://www.shellscript.sh/examples/banner/
+  printf "#%${lengthBanner}s#\n" | tr " " "="
+  printf "# %-$(( ${lengthBanner} - 2 ))s #\n" "`date`"
+  printf "# %-$(( ${lengthBanner} - 2 ))s #\n" " "
+  printf "# %-$(( ${lengthBanner} - 2 ))s #\n" "$*"
+  printf "#%${lengthBanner}s#\n" | tr " " "="
+}
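These helpers are meant to be dot-sourced by the other CI scripts rather than run directly. A minimal usage sketch follows; the host string, variable list, and banner width are purely illustrative and not taken from any CI configuration:

    . .ci/env/helpers.sh
    # contains() echoes 0 when the substring is found, 1 otherwise
    if [ $( contains "$( hostname -f )" hsn.de.hpc ) -eq 0 ]; then
      echo "Running on derecho"
    fi
    setenvStr "NUM_PROCS=16,DEBUG=1"    # exports NUM_PROCS=16 and DEBUG=1
    banner 40 "Starting WRF build"      # prints a banner box with a 40-character inner width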
diff --git a/.ci/env/hostenv.sh b/.ci/env/hostenv.sh
new file mode 100644
index 0000000000..208d1e57f3
--- /dev/null
+++ b/.ci/env/hostenv.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+# Allow selection of the hostname, and if none is provided use the current machine.
+# While this may seem unintuitive at first, it provides the flexibility of using
+# "named" configurations without being explicitly tied to the FQDN
+hostname=$AS_HOST
+if [ -z "$hostname" ]; then
+  hostname=$( python3 -c "import socket; print( socket.getfqdn() )" )
+fi
+
+if [ $( contains ${hostname} hsn.de.hpc ) -eq 0 ]; then
+  # Derecho HPC SuSE PBS
+  . .ci/env/derecho.sh
+else
+  echo "No known environment for '${hostname}', using current"
+fi
diff --git a/.ci/hpc-workflows b/.ci/hpc-workflows
new file mode 160000
index 0000000000..ba8393447c
--- /dev/null
+++ b/.ci/hpc-workflows
@@ -0,0 +1 @@
+Subproject commit ba8393447c8a2cef23952c01425154ceb34d64e4
diff --git a/.ci/tests/build.sh b/.ci/tests/build.sh
new file mode 100755
index 0000000000..a0598fd499
--- /dev/null
+++ b/.ci/tests/build.sh
@@ -0,0 +1,108 @@
+#!/bin/sh
+help()
+{
+  echo "./build.sh as_host workingdir [options] [-- <hostenv.sh options>]"
+  echo "  as_host     First argument must be the host configuration to use for environment loading"
+  echo "  workingdir  Second argument must be the working directory to immediately cd to"
+  echo "  -c          Configuration build type, piped directly into configure"
+  echo "  -n          Configuration nesting type, piped directly into configure"
+  echo "  -o          Configuration optstring passed into configure"
+  echo "  -b          Build command passed into compile"
+  echo "  -e          Environment variables in comma-delimited list, e.g. var=1,foo,bar=0"
+  echo "  --          Directly pass remaining options to hostenv.sh, equivalent to hostenv.sh <options>"
+  echo "  -h          Print this message"
+  echo ""
+  echo "If you wish to use an env var in an argument, such as '-c \$SERIAL -e SERIAL=32',"
+  echo "you will need to write it as '-c \\\$SERIAL -e SERIAL=32' to delay shell expansion"
+}
+
+echo "Input arguments:"
+echo "$*"
+
+AS_HOST=$1
+shift
+if [ "$AS_HOST" = "-h" ]; then
+  help
+  exit 0
+fi
+
+workingDirectory=$1
+shift
+
+cd $workingDirectory
+
+# Get some helper functions
+. .ci/env/helpers.sh
+
+while getopts c:n:o:b:e:h opt; do
+  case $opt in
+    c)
+      configuration="$OPTARG"
+      ;;
+    n)
+      nesting="$OPTARG"
+      ;;
+    o)
+      configOpt="$OPTARG"
+      ;;
+    b)
+      buildCommand="$OPTARG"
+      ;;
+    e)
+      envVars="$envVars,$OPTARG"
+      ;;
+    h)  help; exit 0 ;;
+    *)  help; exit 1 ;;
+    :)  help; exit 1 ;;
+    \?) help; exit 1 ;;
+  esac
+done
+
+shift "$((OPTIND - 1))"
+
+# Everything else goes to our env setup
+. .ci/env/hostenv.sh $*
+
+# Now evaluate env vars in case they pull from values set by hostenv.sh
+if [ ! -z "$envVars" ]; then
+  setenvStr "$envVars"
+fi
+
+# Re-evaluate input values for delayed expansion
+eval "configuration=\"$configuration\""
+eval "nesting=\"$nesting\""
+eval "configOpt=\"$configOpt\""
+eval "buildCommand=\"$buildCommand\""
+
+./clean -a
+
+echo "Compiling with option $configuration nesting=$nesting and additional flags '$configOpt'"
+./configure $configOpt << EOF
+$configuration
+$nesting
+EOF
+
+if [ ! -f configure.wrf ]; then
+  echo "Failed to configure"
+  exit 1
+fi
+
+echo "./compile $buildCommand"
+./compile $buildCommand
+
+result=$?
+
+if [ $result -ne 0 ]; then
+  echo "Failed to compile"
+  exit 1
+fi
+
+# And a *very* special check, because WRF compiles the WRF way and force-ignores all make errors,
+# putting the onus on us to check for things ourselves
+if [ ! -x ./main/wrf.exe ]; then # There's a bunch of other execs but this is the most important one, and
+                                 # doing more checks to accommodate them just reinforces this bad design
+  echo "Failed to compile"
+  exit 1
+fi
+
+echo "TEST $(basename $0) PASS"
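In CI this script is launched by the hpc-workflows runner, but it can also be exercised by hand from a WRF checkout. A hypothetical interactive invocation on derecho could look like the following; the `hsn.de.hpc` host key and the module names after `--` are only an illustration of what gets forwarded to hostenv.sh:

    # run from the top of a WRF checkout; arguments are examples, not fixed CI values
    ./.ci/tests/build.sh hsn.de.hpc $PWD \
        -c 34 -n 1 -o -d \
        -b 'em_real -j $NUM_PROCS' \
        -e NUM_PROCS=16 \
        -- gcc cray-mpich netcdf

The single quotes around the -b value delay expansion of $NUM_PROCS until after -e has exported it, which is the escaping caveat described in the help text.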
diff --git a/.ci/wrf_compilation_tests-make.json b/.ci/wrf_compilation_tests-make.json
new file mode 100644
index 0000000000..b80a2b8b53
--- /dev/null
+++ b/.ci/wrf_compilation_tests-make.json
@@ -0,0 +1,69 @@
+{
+  "submit_options" :
+  {
+    "timelimit" : "00:20:00",
+    "working_directory" : "..",
+    "arguments" :
+    {
+      "base_env_numprocs"         : [ "-e", "NUM_PROCS=4" ],
+
+      ".*make.*::args_nesting"    : [ "-n", "1" ],
+      ".*make.*::args_configopt"  : [ "-o", "-d" ],
+      ".*make.*::args_build_tgt"  : [ "-b", "em_real -j $NUM_PROCS" ]
+    },
+    "hsn.de.hpc" :
+    {
+      "submission" : "PBS",
+      "queue"      : "main",
+      "hpc_arguments" :
+      {
+        "node_select" : { "-l " : { "select" : 1, "ncpus" : 16 } },
+        "priority"    : { "-l " : { "job_priority" : "economy" } }
+      },
+      "arguments" :
+      {
+        "base_env_numprocs"                : [ "-e", "NUM_PROCS=16" ],
+        "very_last_modules"                : [ "netcdf" ],
+        ".*gnu.*::test_modules"            : [ "gcc" ],
+        ".*intel(?!-llvm).*::test_modules" : [ "intel-classic" ],
+        ".*intel-llvm.*::test_modules"     : [ "intel-oneapi" ],
+        ".*pgi.*::test_modules"            : [ "nvhpc" ],
+        ".*dm.*::test_mpi_module"          : [ "cray-mpich" ]
+      }
+    }
+  },
+  "make-gnu" :
+  {
+    "steps" :
+    {
+      "serial" :
+      {
+        "command"   : ".ci/tests/build.sh",
+        "arguments" : [ "-c", "32" ]
+      },
+      "sm" :
+      {
+        "command"      : ".ci/tests/build.sh",
+        "arguments"    : [ "-c", "33" ],
+        "dependencies" : { "serial" : "afterany" }
+      }
+    }
+  },
+  "make-gnu-mpi" :
+  {
+    "steps" :
+    {
+      "dm" :
+      {
+        "command"   : ".ci/tests/build.sh",
+        "arguments" : [ "-c", "34" ]
+      },
+      "dm+sm" :
+      {
+        "command"      : ".ci/tests/build.sh",
+        "arguments"    : [ "-c", "35" ],
+        "dependencies" : { "dm" : "afterany" }
+      }
+    }
+  }
+}
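Assuming the hpc-workflows submodule is checked out, the same suite can be driven outside of GitHub Actions; a sketch of a local launch from the repository root (the account code, test names, and pool sizes simply mirror the values used in the workflow files below):

    .ci/hpc-workflows/.ci/runner.py .ci/wrf_compilation_tests-make.json \
        -t make-gnu make-gnu-mpi \
        -a NMMM0012 -p 8 -tp 1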
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000000..ec396e2ce2
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,97 @@
+name: Regression Suite
+run-name : ${{ github.event_name == 'push' && 'CI' || github.event.label.name }} (${{ github.event_name }})
+
+on:
+  push:
+    branches: [ master, develop ]
+# See https://stackoverflow.com/a/78444521 and
+# https://github.com/orgs/community/discussions/26874#discussioncomment-3253755
+# as well as official (but buried) documentation :
+# https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#pull-request-events-for-forked-repositories-2
+  pull_request:
+    types: [ labeled ]
+
+# https://docs.github.com/en/actions/sharing-automations/reusing-workflows#supported-keywords-for-jobs-that-call-a-reusable-workflow
+# Also https://stackoverflow.com/a/74959635
+# TL;DR - For public repositories the safest approach is to use the default read permissions, but at the cost
+#         of not being able to modify the labels. That will need to be a separate [trusted] workflow that runs from the base repo
+# permissions :
+#   contents      : read
+#   pull-requests : write
+
+# Write our tests out this way for easier legibility
+# testsSet :
+#   - key : value
+#     key : value
+#     tests :
+#       - value
+#       - value
+#   - < next test >
+# https://stackoverflow.com/a/68940067
+jobs:
+  buildtests:
+    if : ${{ github.event.label.name == 'compile-tests' || github.event.label.name == 'all-tests' || github.event_name == 'push' }}
+    strategy:
+      max-parallel: 4
+      fail-fast: false
+      matrix:
+
+        testSet :
+          - host : derecho
+            hpc-workflows_path : .ci/hpc-workflows
+            archive  : /glade/work/aislas/github/runners/wrf/derecho/logs/
+            account  : NMMM0012
+            name     : "Make Compilation Tests"
+            id       : make-tests
+            fileroot : wrf_compilation_tests-make
+            args     : -j='{"node_select":{"-l ":{"select":1}}}'
+            pool     : 8
+            tpool    : 1
+            mkdirs   : true
+            tests :
+              - make-gnu
+              - make-gnu-mpi
+            # add new compilation tests here
+
+    uses : ./.github/workflows/test_workflow.yml
+    with :
+      # This should be the only hard-coded value; we don't use ${{ github.event.label.name }}
+      # to avoid 'all-tests' being used in this workflow
+      label : compile-tests
+
+      # Everything below this should remain the same and comes from the testSet matrix
+      hpc-workflows_path : ${{ matrix.testSet.hpc-workflows_path }}
+      archive  : ${{ matrix.testSet.archive }}
+      name     : ${{ matrix.testSet.name }}
+      id       : ${{ matrix.testSet.id }}
+      host     : ${{ matrix.testSet.host }}
+      fileroot : ${{ matrix.testSet.fileroot }}
+      account  : ${{ matrix.testSet.account }}
+      tests    : ${{ toJson( matrix.testSet.tests ) }}
+      mkdirs   : ${{ matrix.testSet.mkdirs }}
+      args     : ${{ matrix.testSet.args }}
+      pool     : ${{ matrix.testSet.pool }}
+      tpool    : ${{ matrix.testSet.tpool }}
+    # I am leaving this here for posterity in case this is to be replicated in private repositories for testing
+    permissions:
+      contents: read
+      pull-requests: write
+    name : Test ${{ matrix.testSet.name }} on ${{ matrix.testSet.host }}
+
+  # In the event that 'all-tests' is used, this final job will be the one to remove
+  # the label from the PR
+  removeAllLabel :
+    if : ${{ !cancelled() && github.event.label.name == 'all-tests' }}
+    name : Remove 'all-tests' label
+    runs-on: ubuntu-latest
+    needs : [ buildtests ] # Put tests here to make this wait for the tests to complete
+    steps:
+      - name : Remove '${{ github.event.label.name }}' label
+        env:
+          PR_NUMBER: ${{ github.event.number }}
+        run: |
+          curl \
+            -X DELETE \
+            -H "Accept: application/vnd.github.v3+json" \
+            -H 'Authorization: token ${{ github.token }}' \
+            https://api.github.com/repos/${GITHUB_REPOSITORY}/issues/${PR_NUMBER}/labels/${{ github.event.label.name }}
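Because pull requests only trigger this suite when a label is applied, a maintainer kicks off the compile tests by adding the label to the PR; with the GitHub CLI that is roughly (the PR number here is illustrative):

    gh pr edit 1234 --add-label compile-tests
    # or, to run every label-gated suite defined for the repository:
    gh pr edit 1234 --add-label all-tests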
diff --git a/.github/workflows/test_workflow.yml b/.github/workflows/test_workflow.yml
new file mode 100644
index 0000000000..abcb901c0a
--- /dev/null
+++ b/.github/workflows/test_workflow.yml
@@ -0,0 +1,150 @@
+
+
+on :
+  workflow_call :
+    inputs :
+      label :
+        required : true
+        type     : string
+      hpc-workflows_path :
+        required : true
+        type     : string
+      archive :
+        required : true
+        type     : string
+
+      name :
+        required : true
+        type     : string
+      id :
+        required : true
+        type     : string
+      host :
+        required : true
+        type     : string
+      fileroot :
+        required : true
+        type     : string
+      account :
+        required : true
+        type     : string
+      tests :
+        required : true
+        type     : string
+      mkdirs :
+        required : true
+        type     : boolean
+      args :
+        required : false
+        type     : string
+        default  : ""
+      pool :
+        required : false
+        type     : number
+        default  : 1
+      tpool :
+        required : false
+        type     : number
+        default  : 1
+
+
+
+jobs:
+  test_workflow :
+
+    # Is 5 days a reasonable wait time for testing?
+    timeout-minutes: 7200
+    name: Test ${{ inputs.name }} on ${{ inputs.host }}
+    runs-on: ${{ inputs.host }}
+    env :
+      LOG_SUFFIX : ${{ github.event_name == 'push' && 'master' || github.event.number }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          path       : main
+          submodules : true
+
+      # Immediately make one copy of the checkout per test to run
+      - name: Create testing directories
+        if  : ${{ inputs.mkdirs }}
+        id  : cpTestDirs
+        run : |
+          for testDir in ${{ join( fromJson( inputs.tests ), ' ' ) }}; do
+            echo "Creating duplicate directory for $testDir"
+            # Remove if it exists to get a fresh start
+            rm -rf $testDir
+            cp -Rp main/ $testDir
+          done
+
+      - name: Test ${{ inputs.name }}
+        id  : runTest
+        run : |
+          if [ "${{ inputs.mkdirs }}" = "true" ]; then
+            ALT_DIRS="-alt ../${{ join( fromJson( inputs.tests ), '/.ci ../' ) }}/.ci"
+          fi
+          ./main/${{ inputs.hpc-workflows_path }}/.ci/runner.py \
+            ./main/.ci/${{ inputs.fileroot }}.json \
+            -t ${{ join( fromJson( inputs.tests ), ' ' ) }} \
+            -a "${{ inputs.account }}" \
+            -p ${{ inputs.pool }} -tp ${{ inputs.tpool }} \
+            ${{ inputs.args }} $ALT_DIRS


+      - name: Report failed tests and steps
+        if  : ${{ failure() }}
+        run : |
+          # move log files to safe location
+          ./main/${{ inputs.hpc-workflows_path }}/.ci/relocator.py ./main/.ci/${{ inputs.fileroot }}.log ${{ inputs.archive }}/$LOG_SUFFIX/${{ inputs.id }}
+
+          # report on them - alt dirs need extra help
+          if [ "${{ inputs.mkdirs }}" = "true" ]; then
+            masterlogLoc=main/.ci
+          fi
+          ./main/${{ inputs.hpc-workflows_path }}/.ci/reporter.py ${{ inputs.archive }}/$LOG_SUFFIX/${{ inputs.id }}/$masterlogLoc/${{ inputs.fileroot }}.log \
+            -e ./${{ inputs.hpc-workflows_path }}/.ci/runner.py \
+            -o GITHUB -m # only mark fail steps with gh syntax
+
+          # report on them
+          echo "# Summary for ${{ join( fromJson( inputs.tests ), ' ' ) }}" >> $GITHUB_STEP_SUMMARY
+          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
+          ./main/${{ inputs.hpc-workflows_path }}/.ci/reporter.py ${{ inputs.archive }}/$LOG_SUFFIX/${{ inputs.id }}/$masterlogLoc/${{ inputs.fileroot }}.log \
+            -e ./${{ inputs.hpc-workflows_path }}/.ci/runner.py \
+            -s >> $GITHUB_STEP_SUMMARY
+          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
+
+      - name: Clean up testing directories
+        if  : ${{ success() }}
+        id  : rmTestDirs
+        run : |
+          for testDir in ${{ join( fromJson( inputs.tests ), ' ' ) }}; do
+            echo "Removing duplicate directory for $testDir"
+            rm -rf $testDir
+          done
+
+      - name: Upload test logs
+        if  : ${{ failure() }}
+        uses : actions/upload-artifact@v4
+        with:
+          # as per usual with ci/cd stuff I am shocked but not surprised when the advertised
+          # *documented* functionality doesn't work as expected. Wow, bravo
+          # can't use ${{ env. }} as somehow this combination of matrix->reusable workflow->call step is too complex
+          # and expands to nothing
+          name: ${{ github.event_name == 'push' && 'master' || github.event.number }}-${{ inputs.id }}_logfiles
+          path: ${{ inputs.archive }}/${{ github.event_name == 'push' && 'master' || github.event.number }}/${{ inputs.id }}/
+
+      # As noted in ci.yml, this will need to be moved to a separate workflow with pull_request_target
+      # and strictly controlled usage of the GH token
+      # - name : Remove '${{ inputs.label }}' label
+      #   if   : ${{ !cancelled() && github.event.label.name == inputs.label }}
+      #   env:
+      #     PR_NUMBER: ${{ github.event.number }}
+      #   run: |
+      #     curl \
+      #       -X DELETE \
+      #       -H "Accept: application/vnd.github.v3+json" \
+      #       -H 'Authorization: token ${{ github.token }}' \
+      #       https://api.github.com/repos/${GITHUB_REPOSITORY}/issues/${PR_NUMBER}/labels/${{ inputs.label }}
+
+
+
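For the make-tests entry defined in ci.yml, the runTest step above expands to roughly the following command on the self-hosted runner (shown purely to illustrate how the inputs are wired through; the exact CLI behavior is defined by runner.py in the hpc-workflows submodule):

    ./main/.ci/hpc-workflows/.ci/runner.py ./main/.ci/wrf_compilation_tests-make.json \
        -t make-gnu make-gnu-mpi \
        -a "NMMM0012" \
        -p 8 -tp 1 \
        -j='{"node_select":{"-l ":{"select":1}}}' \
        -alt ../make-gnu/.ci ../make-gnu-mpi/.ci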
diff --git a/.gitignore b/.gitignore
index 876fb30491..09e62c68a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,4 @@ wrfout_d*
 *.nc
 rsl.out.*
 rsl.error.*
+*.log
diff --git a/.gitmodules b/.gitmodules
index 4e323edda7..5d206bcbf5 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "phys/noahmp"]
     path = phys/noahmp
    url = https://github.com/NCAR/noahmp
+[submodule ".ci/hpc-workflows"]
+    path = .ci/hpc-workflows
+    url = https://github.com/islas/hpc-workflows
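With the CI tooling now living in a submodule, existing clones (and any runner workspace not relying on the checkout action's `submodules: true` option) need it initialized explicitly; the standard command from a WRF checkout is:

    git submodule update --init .ci/hpc-workflows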