diff --git a/.github/.parallelworks/README.md b/.github/.parallelworks/README.md
new file mode 100644
index 0000000..4f1dfe6
--- /dev/null
+++ b/.github/.parallelworks/README.md
@@ -0,0 +1,6 @@
+# .parallelworks Directory
+
+The .parallelworks directory stores the CI scripts that reside on Parallelworks.
+These scripts are executed via the GitHub Actions Workflows in .github/workflows
+
+On Parallelworks these scripts are installed at: /contrib/fv3/SHiELD_build_CI
diff --git a/.github/.parallelworks/checkout.sh b/.github/.parallelworks/checkout.sh
new file mode 100755
index 0000000..8d991d9
--- /dev/null
+++ b/.github/.parallelworks/checkout.sh
@@ -0,0 +1,105 @@
+#!/bin/bash -xe
+#
+# CI checkout stage: clone SHiELD_build, merge the ref under test, check out
+# the component sources, and rebuild the shared FMS library when the
+# fms_release named in CHECKOUT_code differs from the recorded version.
+#
+# Usage: checkout.sh [-b|--branch REF] [-h|--hash COMMIT]
+
+##############################################################################
+## User set up variables
+## Root directory for CI
+dirRoot=/contrib/fv3
+## Intel version to be used
+intelVersion=2023.2.0
+##############################################################################
+## HPC-ME container
+container=/contrib/containers/noaa-intel-prototype_2023.09.25.sif
+container_env_script=/contrib/containers/load_spack_noaa-intel.sh
+##############################################################################
+
+#Parse Arguments
+branch=main
+commit=""
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    -b|--branch)
+      branch="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -h|--hash)
+      commit="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    *)
+      echo "unknown argument"
+      exit 1
+      ;;
+  esac
+done
+
+echo "branch is $branch"
+echo "commit is $commit"
+
+
+## Set up the directories
+testDir=${dirRoot}/${intelVersion}/SHiELD_build/${branch}/${commit}
+logDir=${testDir}/log
+export MODULESHOME=/usr/share/lmod/lmod
+#Define External Libs path
+export EXTERNAL_LIBS=${dirRoot}/${intelVersion}/SHiELD_build/externallibs
+mkdir -p "${EXTERNAL_LIBS}"
+## create directories
+rm -rf "${testDir}"
+mkdir -p "${logDir}"
+# salloc commands to start up
+#2 tests layout 8,8 (16 nodes)
+#2 tests layout 4,8 (8 nodes)
+#9 tests layout 4,4 (18 nodes)
+#5 tests layout 4,1 (5 nodes)
+#17 tests layout 2,2 (17 nodes)
+#salloc --partition=p2 -N 64 -J ${branch} sleep 20m &
+
+## clone code
+cd "${testDir}"
+git clone --recursive https://github.com/NOAA-GFDL/SHiELD_build.git
+## Check out the PR
+# Run each step as its own command: under -e only the last command of an
+# && list aborts the script, so a failed fetch inside such a list would let
+# the run continue on the freshly cloned default branch.
+cd "${testDir}/SHiELD_build"
+git fetch origin "${branch}:toMerge"
+git merge toMerge
+
+##checkout components
+./CHECKOUT_code
+#Check if we already have FMS compiled
+grep -m 1 "fms_release" "${testDir}/SHiELD_build/CHECKOUT_code" > "${logDir}/release.txt"
+source "${logDir}/release.txt"
+echo "${fms_release}"
+# The version stamp does not exist before the first FMS build; treat that as
+# "nothing installed" instead of relying on a failing cat.
+installedFms=""
+if [[ -f "${EXTERNAL_LIBS}/FMSversion" ]]
+  then
+    installedFms=$(cat "${EXTERNAL_LIBS}/FMSversion")
+fi
+echo "${installedFms}"
+if [[ "${fms_release}" != "${installedFms}" ]]
+  then
+    #remove libFMS and the old version stamp if they exist
+    rm -rf "${EXTERNAL_LIBS}/libFMS"
+    rm -f "${EXTERNAL_LIBS}/FMSversion"
+    echo "${container}" > "${EXTERNAL_LIBS}/FMScontainerversion"
+    echo "${container_env_script}" >> "${EXTERNAL_LIBS}/FMScontainerversion"
+    # Build FMS
+    cd "${testDir}/SHiELD_build/Build"
+    set -o pipefail
+    singularity exec -B /contrib "${container}" "${container_env_script}" "./BUILDlibfms intel"
+    # Record the version only after the build succeeded; written earlier, a
+    # failed or cancelled build left a matching stamp behind with no libFMS.
+    # NOTE(review): assumes BUILDlibfms does not read FMSversion - confirm.
+    echo "${fms_release}" > "${EXTERNAL_LIBS}/FMSversion"
+fi
diff --git a/.github/.parallelworks/compile.sh b/.github/.parallelworks/compile.sh
new file mode 100755
index 0000000..63e7852
--- /dev/null
+++ b/.github/.parallelworks/compile.sh
@@ -0,0 +1,91 @@
+#!/bin/bash -xe
+#
+# CI build stage: compile one SHiELD configuration inside the HPC-ME container
+# from the source tree that checkout.sh placed under the same branch/commit path.
+#
+# Usage: compile.sh -c|--config CONFIG --hydro HYDRO --bit BIT -m|--mode MODE
+#                   [-b|--branch REF] [-h|--hash COMMIT]
+
+##############################################################################
+## User set up variables
+## Root directory for CI
+dirRoot=/contrib/fv3
+## Intel version to be used
+intelVersion=2023.2.0
+##############################################################################
+## HPC-ME container
+container=/contrib/containers/noaa-intel-prototype_2023.09.25.sif
+container_env_script=/contrib/containers/load_spack_noaa-intel.sh
+##############################################################################
+
+#Parse Arguments
+branch=main
+commit=""
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    -b|--branch)
+      branch="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -h|--hash)
+      commit="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -c|--config)
+      config="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    --hydro)
+      hydro="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    --bit)
+      bit="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -m|--mode)
+      mode="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    *)
+      echo "unknown argument"
+      exit 1
+      ;;
+  esac
+done
+
+# Quoted so that an empty or multi-word value cannot change how the test parses
+if [[ -z "$mode" || -z "$bit" || -z "$hydro" || -z "$config" ]]
+  then
+    echo "must specify config, hydro, bit, and mode options for compile"
+    exit 1
+fi
+
+echo "branch is $branch"
+echo "commit is $commit"
+echo "mode is $mode"
+echo "bit is $bit"
+echo "hydro is $hydro"
+echo "config is $config"
+
+if [[ "$hydro" == "sw" && "$config" == "shield" ]]
+  then
+    echo "this combination should not be tested"
+  else
+    ## Set up the directories
+    testDir=${dirRoot}/${intelVersion}/SHiELD_build/${branch}/${commit}
+    logDir=${testDir}/log
+    # Set up build
+    cd "${testDir}/SHiELD_build/Build"
+    #Define External Libs path
+    export EXTERNAL_LIBS=${dirRoot}/${intelVersion}/SHiELD_build/externallibs
+    # Build SHiELD
+    set -o pipefail
+    singularity exec -B /contrib "${container}" "${container_env_script}" "./COMPILE ${config} ${hydro} ${bit} ${mode} intel clean"
+fi
diff --git a/.github/.parallelworks/run_test.sh b/.github/.parallelworks/run_test.sh
new file mode 100755
index 0000000..c33fbc6
--- /dev/null
+++ b/.github/.parallelworks/run_test.sh
@@ -0,0 +1,98 @@
+#!/bin/bash -xe
+#
+# CI test stage: run one test script from RTS/CI, passing it the batch and
+# container launch options, then compare the files under the run's RESTART
+# directory against the stored baselines with nccmp.
+#
+# Usage: run_test.sh -t|--test TESTNAME [-b|--branch REF] [-h|--hash COMMIT]
+ulimit -s unlimited
+##############################################################################
+## User set up variables
+## Root directory for CI
+dirRoot=/contrib/fv3
+## Intel version to be used
+intelVersion=2023.2.0
+##############################################################################
+## HPC-ME container
+container=/contrib/containers/noaa-intel-prototype_2023.09.25.sif
+container_env_script=/contrib/containers/load_spack_noaa-intel.sh
+##############################################################################
+
+#Parse Arguments
+branch=main
+commit=""
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    -b|--branch)
+      branch="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -h|--hash)
+      commit="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -t|--test)
+      testname="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    *)
+      echo "unknown argument"
+      exit 1
+      ;;
+  esac
+done
+
+if [[ -z "$testname" ]]
+  then
+    echo "must specify a test name with -t"
+    exit 1
+fi
+
+echo "branch is $branch"
+echo "commit is $commit"
+echo "test is $testname"
+
+## Set up the directories
+MODULESHOME=/usr/share/lmod/lmod
+testDir=${dirRoot}/${intelVersion}/SHiELD_build/${branch}/${commit}
+logDir=${testDir}/log
+baselineDir=${dirRoot}/baselines/intel/${intelVersion}
+
+## Run the CI Test
+# Define the builddir testscriptdir and rundir
+# Set the BUILDDIR for the test script to use
+export BUILDDIR="${testDir}/SHiELD_build"
+testscriptDir=${BUILDDIR}/RTS/CI
+runDir=${BUILDDIR}/CI/BATCH-CI
+
+# Run CI test scripts
+cd "${testscriptDir}"
+set -o pipefail
+# Execute the test piping output to log file
+"./${testname}" " --partition=p2 --mpi=pmi2 --job-name=${commit}_${testname} singularity exec -B /contrib ${container} ${container_env_script}" |& tee "${logDir}/run_${testname}.log"
+
+## Compare Restarts to Baseline
+#The following tests are not expected to have run-to-run reproducibility:
+#d96_2k.solo.bubble
+#d96_2k.solo.bubble.n0
+#d96_2k.solo.bubble.nhK
+if [[ ${testname} == "d96_2k.solo.bubble" || ${testname} == "d96_2k.solo.bubble.n0" || ${testname} == "d96_2k.solo.bubble.nhK" ]]
+  then
+    echo "${testname} is not expected to reproduce so answers were not compared"
+  else
+    source "$MODULESHOME/init/sh"
+    export MODULEPATH=/mnt/shared/manual_modules:/usr/share/modulefiles/Linux:/usr/share/modulefiles/Core:/usr/share/lmod/lmod/modulefiles/Core:/apps/modules/modulefiles:/apps/modules/modulefamilies/intel
+    module load intel/2022.1.2
+    module load netcdf
+    module load nccmp
+    # Glob the baseline directory instead of parsing ls output. nullglob keeps
+    # the earlier behaviour for a missing or empty baseline: nothing is compared.
+    shopt -s nullglob
+    for baselineFile in "${baselineDir}/${testname}"/*
+      do
+        nccmp -d "${baselineFile}" "${runDir}/${testname}/RESTART/${baselineFile##*/}"
+    done
+fi
diff --git a/.github/workflows/Intel_Parallelworks_CI.yaml b/.github/workflows/Intel_Parallelworks_CI.yaml
new file mode 100644
index 0000000..794190f
--- /dev/null
+++ b/.github/workflows/Intel_Parallelworks_CI.yaml
@@ -0,0 +1,166 @@
+name: Compile SHiELD SOLO and run tests
+
+# This GitHub Action Workflow is running on the cloud devcimultiintel cluster
+# The tests are run inside of a container with the following software/libraries:
+# -intel: 2023.2.0
+# -hdf5: 1.14.0
+# -netcdf-c: 4.9.2
+# -netcdf-fortran: 4.6.0
+# -cmake
+# -libyaml
+
+on:
+  pull_request:
+    branches:
+      - main
+
+#this should cancel in progress ci runs for the same PR
+#(e.g. a second commit on the same PR comes in while CI is still running)
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  checkout:
+    if: github.repository == 'NOAA-GFDL/SHiELD_build'
+    runs-on: [shieldbuild]
+    name: Checkout Code
+    steps:
+      # It can take a long time (5-15 minutes) to spinup nodes
+      # so this salloc will prompt 46 nodes to startup and stay active for 20 min
+      # this is enough nodes for the first 17 tests to run in parallel, and we
+      # have 17 runners configured.
+      - run: salloc --partition=p2 -N 46 -J $GITHUB_SHA sleep 20m &
+      - run: /contrib/fv3/SHiELD_build_CI/checkout.sh -b $GITHUB_REF -h $GITHUB_SHA
+
+  build:
+    if: github.repository == 'NOAA-GFDL/SHiELD_build'
+    runs-on: [shieldbuild]
+    name: SOLO SHiELD build
+    needs: [checkout]
+    strategy:
+      fail-fast: true
+      max-parallel: 17
+      matrix:
+        runscript: [/contrib/fv3/SHiELD_build_CI/compile.sh]
+        config: [shield, solo]
+        hydro: [sw, nh, hydro]
+        bit: [32bit, 64bit]
+        mode: [repro, prod, debug]
+    steps:
+      - env:
+          RUNSCRIPT: ${{ matrix.runscript }}
+          CONFIG: ${{ matrix.config }}
+          HYDRO: ${{ matrix.hydro }}
+          BIT: ${{ matrix.bit }}
+          MODE: ${{ matrix.mode }}
+        run: $RUNSCRIPT -b $GITHUB_REF -h $GITHUB_SHA -c $CONFIG --hydro $HYDRO --bit $BIT -m $MODE
+
+  test:
+    if: github.repository == 'NOAA-GFDL/SHiELD_build'
+    runs-on: [shieldbuild]
+    name: SOLO SHiELD test suite
+    needs: [checkout, build]
+    strategy:
+      fail-fast: false
+      max-parallel: 17
+      matrix:
+        runscript: [/contrib/fv3/SHiELD_build_CI/run_test.sh]
+        argument:
+          # These are placed in order of largest to smallest jobs
+          #layout 8,8 needs 8 nodes
+          - C512r20.solo.superC
+          - C768.sw.BTwave
+          #layout 4,8 needs 4 nodes
+          - C256r20.solo.superC
+          - C384.sw.BLvortex
+          #layout 4,4 needs 2 nodes
+          - C128r20.solo.superC
+          - C128r3.solo.TC.d1
+          - C128r3.solo.TC.h6
+          - C128r3.solo.TC
+          - C128r3.solo.TC.tr8
+          - C192.sw.BLvortex
+          - C192.sw.BTwave
+          - C192.sw.modon
+          - C384.sw.BTwave
+          #layout 4,1 and 2,2 need 1 node
+          - C96.solo.BCdry.hyd
+          - C96.solo.BCdry
+          - C96.solo.BCmoist.hyd.d3
+          - C96.solo.BCmoist.hyd
+          - C96.solo.BCmoist.nhK
+          - C96.solo.BCmoist
+          - C96.solo.mtn_rest.hyd.diff2
+          - C96.solo.mtn_rest.hyd
+          - C96.solo.mtn_rest.nonmono.diff2
+          - C96.solo.mtn_rest
+          - C96.sw.BLvortex
+          - C96.sw.BTwave
+          - C96.sw.modon
+          - C96.sw.RHwave
+          - d96_1k.solo.mtn_rest_shear.olddamp
+          - d96_1k.solo.mtn_rest_shear
+          - d96_1k.solo.mtn_schar.mono
+          - d96_1k.solo.mtn_schar
+          - d96_2k.solo.bubble.n0
+          - d96_2k.solo.bubble.nhK
+          - d96_2k.solo.bubble
+          - d96_500m.solo.mtn_schar
+    steps:
+      # This will end the slurm job started in the checkout job
+      - run: scancel -n $GITHUB_SHA
+      - env:
+          RUNSCRIPT: ${{ matrix.runscript }}
+          ARG1: ${{ matrix.argument }}
+        run: $RUNSCRIPT -t $ARG1 -b $GITHUB_REF -h $GITHUB_SHA
+  shutdown:
+    if: always() && github.repository == 'NOAA-GFDL/SHiELD_build'
+    runs-on: [shieldbuild]
+    name: Shutdown Processes
+    needs: [checkout, build, test]
+    strategy:
+      fail-fast: false
+      max-parallel: 17
+      matrix:
+        test:
+          - C512r20.solo.superC
+          - C768.sw.BTwave
+          - C256r20.solo.superC
+          - C384.sw.BLvortex
+          - C128r20.solo.superC
+          - C128r3.solo.TC.d1
+          - C128r3.solo.TC.h6
+          - C128r3.solo.TC
+          - C128r3.solo.TC.tr8
+          - C192.sw.BLvortex
+          - C192.sw.BTwave
+          - C192.sw.modon
+          - C384.sw.BTwave
+          - C96.solo.BCdry.hyd
+          - C96.solo.BCdry
+          - C96.solo.BCmoist.hyd.d3
+          - C96.solo.BCmoist.hyd
+          - C96.solo.BCmoist.nhK
+          - C96.solo.BCmoist
+          - C96.solo.mtn_rest.hyd.diff2
+          - C96.solo.mtn_rest.hyd
+          - C96.solo.mtn_rest.nonmono.diff2
+          - C96.solo.mtn_rest
+          - C96.sw.BLvortex
+          - C96.sw.BTwave
+          - C96.sw.modon
+          - C96.sw.RHwave
+          - d96_1k.solo.mtn_rest_shear.olddamp
+          - d96_1k.solo.mtn_rest_shear
+          - d96_1k.solo.mtn_schar.mono
+          - d96_1k.solo.mtn_schar
+          - d96_2k.solo.bubble.n0
+          - d96_2k.solo.bubble.nhK
+          - d96_2k.solo.bubble
+          - d96_500m.solo.mtn_schar
+    steps:
+      - run: scancel -n $GITHUB_SHA
+      - env:
+          JOB: ${{ github.sha }}_${{ matrix.test }}
+        run: scancel -n $JOB
diff --git a/.github/workflows/daily_cleanup_parallelworks.yaml b/.github/workflows/daily_cleanup_parallelworks.yaml
new file mode 100644
index 0000000..6d0b852
--- /dev/null
+++ b/.github/workflows/daily_cleanup_parallelworks.yaml
@@ -0,0 +1,17 @@
+name: Old Build Cleanup
+
+# This GitHub Action Workflow is running on the devcimultiintel cluster
+# This will delete all build directories older than 30 days
+# Build directories are on the cloud at /contrib/fv3/2023.2.0
+
+on:
+  schedule:
+    # run daily at midnight
+    - cron: '0 0 * * *'
+
+jobs:
+  delete:
+    runs-on: [shieldbuild]
+    name: Delete Builds
+    steps:
+      - run: find /contrib/fv3/2023.2.0/SHiELD_build/refs/pull -maxdepth 1 -mindepth 1 -mtime +30 -type d -print -exec rm -rf -- {} +
diff --git a/CHECKOUT_code b/CHECKOUT_code
index 7ec432f..4d59f8d 100755
--- a/CHECKOUT_code
+++ b/CHECKOUT_code
@@ -81,6 +81,7 @@ echo ' '
 
 release="main"
 fv3_release=$release
+phy_release=$release
 fms_release="2024.03"
 fms_c_release="2024.03.01"
 drivers_release=$release