Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
a22f206
Add --deps-only flag to separate dependency fetching from source builds
sbryngelson Mar 25, 2026
8d39c7b
Fix --deps-only recursive build: only gate top-level call
sbryngelson Mar 25, 2026
7317ad5
ci: move case-optimization source builds to compute nodes for Frontier
sbryngelson Mar 25, 2026
750b1fd
ci: use -j 1 for Frontier Cray builds to work around CCE 19.0.0 IPA S…
sbryngelson Mar 25, 2026
3adf7dc
ci: suppress Cray warnings 990 and 7208
sbryngelson Mar 26, 2026
f42b7d5
ci: enable --debug for Frontier Cray builds for backtrace on errors
sbryngelson Mar 26, 2026
264d02c
ci: suppress Cray warning 7212; disable IPA in debug builds to avoid …
sbryngelson Mar 26, 2026
ba99b43
ci: move -h ipa0 to all Cray builds (not just debug); suppress warnin…
sbryngelson Mar 26, 2026
2f1835a
ci: replace deprecated Cray -G2 with -g -O0 for debug builds
sbryngelson Mar 26, 2026
407974b
ci: fix CCE optcg crashes by cleaning stale build artifacts before ca…
Mar 26, 2026
e836a1b
ci: fix benchmark checkout failures on NFS by preserving .slurm_job_i…
Mar 26, 2026
7ab2744
ci: split build and test into separate SLURM jobs
Mar 27, 2026
31f2300
Use cluster template for case-opt runs to avoid srun failures on Phoenix
sbryngelson Mar 29, 2026
2b92b9e
Fix case-opt: move -c flag before -- separator to avoid forwarding to…
sbryngelson Mar 29, 2026
c952529
Revert Frontier -j 1 workaround in bench.sh; stale artifact cleanup f…
sbryngelson Mar 29, 2026
50eddce
Remove CCE 19 pyrometheus workaround patch; test if still needed
sbryngelson Mar 29, 2026
533bd07
Revert "Remove CCE 19 pyrometheus workaround patch; test if still nee…
sbryngelson Mar 30, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 25 additions & 8 deletions .github/scripts/run_case_optimization.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,6 @@ if [ "$job_device" = "gpu" ] && [ "$ngpus" -eq 0 ]; then
ngpus=1
fi

# Verify the venv Python interpreter exists (created by ./mfc.sh build)
if [ ! -x build/venv/bin/python3 ]; then
echo "ERROR: build/venv/bin/python3 not found."
echo "The MFC build venv may not have been created. Was the pre-build step successful?"
exit 1
fi

benchmarks=(
benchmarks/5eq_rk3_weno3_hllc/case.py
benchmarks/viscous_weno5_sgb_acoustic/case.py
Expand All @@ -28,6 +21,30 @@ benchmarks=(
benchmarks/igr/case.py
)

# For Frontier/Frontier AMD: deps were fetched on the login node via --deps-only;
# build case-optimized binaries here on the compute node before running.
# For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job.
#
# Clean stale MFC target staging before building. On self-hosted CI runners,
# corrupted intermediate files from a prior failed build (e.g. CCE optcg crash)
# can persist and poison subsequent builds. Each case-opt config gets its own
# hash-named staging dir, but install dirs and other artifacts may be stale.
if [ "$job_cluster" != "phoenix" ]; then
# Clean stale MFC target dirs (hash-named) from prior builds, but
# preserve dependency dirs (hipfort, fftw, etc.) since the compute
# node has no internet to re-fetch them.
echo "=== Cleaning stale MFC target staging/install ==="
find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true

echo "=== Building case-optimized binaries on compute node ==="
for case in "${benchmarks[@]}"; do
echo "--- Building: $case ---"
./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8
done
echo "=== All case-optimized binaries built ==="
fi

passed=0
failed=0
failed_cases=""
Expand All @@ -44,7 +61,7 @@ for case in "${benchmarks[@]}"; do
rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"

# Build + run with --case-optimization, small grid, 10 timesteps
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -- --gbpp 1 --steps 10; then
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -c "$job_cluster" -- --gbpp 1 --steps 10; then
# Validate output
if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then
echo "PASS: $case_name"
Expand Down
13 changes: 9 additions & 4 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,42 +68,47 @@ jobs:
flag: f
device: gpu
interface: acc
build_script: "bash .github/workflows/frontier/build.sh gpu acc bench"
build_script: "bash .github/workflows/frontier/build.sh gpu acc"
- cluster: frontier
name: Oak Ridge | Frontier (CCE)
group: phoenix
labels: frontier
flag: f
device: gpu
interface: omp
build_script: "bash .github/workflows/frontier/build.sh gpu omp bench"
build_script: "bash .github/workflows/frontier/build.sh gpu omp"
- cluster: frontier_amd
name: Oak Ridge | Frontier (AMD)
group: phoenix
labels: frontier
flag: famd
device: gpu
interface: omp
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp"
continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }}
runs-on:
group: ${{ matrix.group }}
labels: ${{ matrix.labels }}
timeout-minutes: 480
steps:
- name: Clean stale output files
run: rm -f *.out

- name: Clone - PR
uses: actions/checkout@v4
with:
path: pr
clean: false

- name: Clone - Master
uses: actions/checkout@v4
with:
repository: MFlowCode/MFC
ref: master
path: master
clean: false

- name: Setup & Build
- name: Fetch Dependencies
if: matrix.build_script != ''
timeout-minutes: 150
run: |
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/common/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,18 @@ if [ "$job_cluster" = "phoenix" ]; then
trap 'rm -rf "$currentdir" || true' EXIT
fi

# --- Build (if not pre-built on login node) ---
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
# --- Build ---
# Phoenix builds everything inside SLURM (no login-node build step).
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
# source code is built here on the compute node.
# Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
if [ "$job_cluster" = "phoenix" ]; then
source .github/scripts/clean-build.sh
clean_build
fi

if [ ! -d "build" ]; then
source .github/scripts/retry-build.sh
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
fi
source .github/scripts/retry-build.sh
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1

# --- Bench cluster flag ---
if [ "$job_cluster" = "phoenix" ]; then
Expand Down
43 changes: 43 additions & 0 deletions .github/workflows/common/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
# Build-only script for all clusters.
# Runs inside a SLURM job via submit-slurm-job.sh.
# Builds MFC without running tests (--dry-run).
# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster

set -euo pipefail

source .github/scripts/gpu-opts.sh
build_opts="$gpu_opts"

# --- Phoenix TMPDIR setup ---
# Phoenix compute nodes have a small /tmp; point TMPDIR at project storage
# so the build's scratch files do not fill it up.
if [ "$job_cluster" = "phoenix" ]; then
    tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
    mkdir -p "$tmpbuild"
    # mktemp -d creates a unique directory atomically; the previous
    # run-$(( RANDOM % 9000 )) scheme only had 9000 possible names and
    # could collide when two CI jobs land on the same node.
    currentdir=$(mktemp -d "$tmpbuild/run-XXXXXXXX")
    export TMPDIR="$currentdir"
    trap 'rm -rf "$currentdir" || true' EXIT
fi

# --- Build ---
# Phoenix builds everything inside SLURM (no login-node build step).
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
# source code is built here on the compute node.
# Phoenix: always start fresh to avoid SIGILL from stale binaries compiled
# on a different microarchitecture.
if [ "$job_cluster" = "phoenix" ]; then
    source .github/scripts/clean-build.sh
    clean_build
fi

source .github/scripts/retry-build.sh

# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
# (SIGILL from binaries compiled on a different compute node).
validate_cmd=""
if [ "$job_cluster" = "phoenix" ]; then
    validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
fi

# NOTE: $build_opts is intentionally unquoted — it carries zero or more
# whitespace-separated flags that must word-split into separate arguments.
RETRY_VALIDATE_CMD="$validate_cmd" \
    retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1
31 changes: 3 additions & 28 deletions .github/workflows/common/test.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/bin/bash
# Unified test script for all clusters.
# Test-only script for all clusters.
# Runs inside a SLURM job via submit-slurm-job.sh.
# Assumes MFC is already built (by a prior build.sh SLURM job).
# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster

set -euo pipefail
Expand All @@ -9,9 +10,6 @@ source .github/scripts/gpu-opts.sh
build_opts="$gpu_opts"

# --- Phoenix TMPDIR setup ---
# Phoenix compute nodes have a small /tmp. With 8 parallel test threads each
# spawning MPI processes, it fills up and ORTE session dir creation fails.
# Redirect TMPDIR to project storage, same as bench.sh.
if [ "$job_cluster" = "phoenix" ]; then
tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
currentdir=$tmpbuild/run-$(( RANDOM % 9000 ))
Expand All @@ -21,29 +19,6 @@ if [ "$job_cluster" = "phoenix" ]; then
trap 'rm -rf "$currentdir" || true' EXIT
fi

# --- Build (if not pre-built on login node) ---
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
# to avoid SIGILL from stale binaries compiled on a different microarchitecture.
if [ "$job_cluster" = "phoenix" ]; then
source .github/scripts/clean-build.sh
clean_build
fi

if [ ! -d "build" ]; then
source .github/scripts/retry-build.sh

# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
# (SIGILL from binaries compiled on a different compute node).
validate_cmd=""
if [ "$job_cluster" = "phoenix" ]; then
validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
fi

RETRY_VALIDATE_CMD="$validate_cmd" \
retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1
fi

# --- GPU detection and thread count ---
device_opts=""
rdma_opts=""
Expand Down Expand Up @@ -88,4 +63,4 @@ if [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ]; then
prune_flag="--only-changes"
fi

./mfc.sh test -v --max-attempts 3 $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster
./mfc.sh test -v --max-attempts 3 --no-build $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster
7 changes: 1 addition & 6 deletions .github/workflows/frontier/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ esac

job_device=$1
job_interface=$2
run_bench=$3
source .github/scripts/gpu-opts.sh
build_opts="$gpu_opts"

Expand All @@ -24,8 +23,4 @@ source .github/scripts/clean-build.sh
clean_build

source .github/scripts/retry-build.sh
if [ "$run_bench" == "bench" ]; then
retry_build ./mfc.sh build -j 8 $build_opts || exit 1
else
retry_build ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts || exit 1
fi
retry_build ./mfc.sh build --deps-only -j 8 $build_opts || exit 1
32 changes: 23 additions & 9 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -400,11 +400,14 @@ jobs:
echo "Coverage cache: none available — full test suite will run"
fi

- name: Build (login node)
- name: Fetch Dependencies
if: matrix.cluster != 'phoenix'
timeout-minutes: 60
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}

- name: Build
run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/build.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}

- name: Test
run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}

Expand All @@ -421,23 +424,29 @@ jobs:
if: always()
id: log
run: |
SLUG="test-${{ matrix.device }}-${{ matrix.interface }}"
SHARD_SUFFIX=""
SHARD="${{ matrix.shard }}"
if [ -n "$SHARD" ]; then
SLUG="${SLUG}-$(echo "$SHARD" | sed 's|/|-of-|')"
SHARD_SUFFIX="-$(echo "$SHARD" | sed 's|/|-of-|')"
fi
echo "slug=${SLUG}" >> "$GITHUB_OUTPUT"
echo "build_slug=build-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT"
echo "test_slug=test-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT"

- name: Print Logs
if: always()
run: cat ${{ steps.log.outputs.slug }}.out
run: |
for f in ${{ steps.log.outputs.build_slug }}.out ${{ steps.log.outputs.test_slug }}.out; do
[ -f "$f" ] && echo "=== $f ===" && cat "$f"
done

- name: Archive Logs
uses: actions/upload-artifact@v4
if: matrix.cluster != 'phoenix'
with:
name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.slug }}
path: ${{ steps.log.outputs.slug }}.out
name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.test_slug }}
path: |
${{ steps.log.outputs.build_slug }}.out
${{ steps.log.outputs.test_slug }}.out

case-optimization:
name: "Case Opt | ${{ matrix.cluster_name }} (${{ matrix.device }}-${{ matrix.interface }})"
Expand Down Expand Up @@ -486,15 +495,20 @@ jobs:
- name: Clean stale output files
run: rm -f *.out

- name: Fetch Dependencies
if: matrix.cluster != 'phoenix'
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}

- name: Pre-Build (SLURM)
if: matrix.cluster == 'phoenix'
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Pre-Build (login node)
- name: Build & Run Case-Optimization Tests
if: matrix.cluster != 'phoenix'
run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Run Case-Optimization Tests
if: matrix.cluster == 'phoenix'
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Cancel SLURM Jobs
Expand Down
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
endif()
elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
add_compile_options(
"SHELL:-M 296,878,1391,1069,5025"
"SHELL:-M 296,878,1391,1069,990,5025,7208,7212,7242"
"SHELL:-h static" "SHELL:-h keepfiles"
"SHELL:-h acc_model=auto_async_none"
"SHELL: -h acc_model=no_fast_addr"
Expand All @@ -190,9 +190,9 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
add_compile_options(
"SHELL:-h acc_model=auto_async_none"
"SHELL: -h acc_model=no_fast_addr"
"SHELL: -K trap=fp" "SHELL: -G2"
"SHELL: -K trap=fp" "SHELL: -g" "SHELL: -O0"
)
add_link_options("SHELL: -K trap=fp" "SHELL: -G2")
add_link_options("SHELL: -K trap=fp" "SHELL: -g" "SHELL: -O0")
endif()

elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Flang")
Expand Down
25 changes: 25 additions & 0 deletions toolchain/mfc/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,12 @@ def __build_target(target: typing.Union[MFCTarget, str], case: input.MFCInputFil

history.add(target.name)

# Dependencies are pinned to fixed versions. If already configured
# (built & installed by a prior --deps-only step), skip entirely
# to avoid re-entering the superbuild (which may access the network).
if target.isDependency and target.is_configured(case):
return

for dep in target.requires.compute():
# If we have already built and installed this target,
# do not do so again. This can be inferred by whether
Expand Down Expand Up @@ -594,6 +600,25 @@ def build(targets=None, case: input.MFCInputFile = None, history: typing.Set[str
case = case or input.load(ARG("input"), ARG("--"), {})
case.validate_params()

if ARG("deps_only", False) and len(history) == 0:
all_deps = set()
for t in targets:
resolved = get_target(t)
for dep in resolved.requires.compute():
all_deps.add(dep)

cons.print(f"[bold]Fetch Dependencies | {format_list_to_string([d.name for d in all_deps], 'magenta', 'None')}[/bold]")
cons.print(no_indent=True)

if not all_deps:
cons.print("[yellow]No dependencies to build for the requested targets.[/yellow]")
return

for dep in all_deps:
__build_target(dep, case, history)

return

if len(history) == 0:
cons.print(__generate_header(case, targets))
cons.print(no_indent=True)
Expand Down
7 changes: 7 additions & 0 deletions toolchain/mfc/cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,13 @@
default=False,
dest="case_optimization",
),
Argument(
name="deps-only",
help="Only fetch and build dependencies, do not build MFC targets.",
action=ArgAction.STORE_TRUE,
default=False,
dest="deps_only",
),
],
examples=[
Example("./mfc.sh build", "Build all default targets (CPU)"),
Expand Down
Loading