Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
a22f206
Add --deps-only flag to separate dependency fetching from source builds
sbryngelson Mar 25, 2026
8d39c7b
Fix --deps-only recursive build: only gate top-level call
sbryngelson Mar 25, 2026
7317ad5
ci: move case-optimization source builds to compute nodes for Frontier
sbryngelson Mar 25, 2026
750b1fd
ci: use -j 1 for Frontier Cray builds to work around CCE 19.0.0 IPA S…
sbryngelson Mar 25, 2026
3adf7dc
ci: suppress Cray warnings 990 and 7208
sbryngelson Mar 26, 2026
f42b7d5
ci: enable --debug for Frontier Cray builds for backtrace on errors
sbryngelson Mar 26, 2026
264d02c
ci: suppress Cray warning 7212; disable IPA in debug builds to avoid …
sbryngelson Mar 26, 2026
ba99b43
ci: move -h ipa0 to all Cray builds (not just debug); suppress warnin…
sbryngelson Mar 26, 2026
2f1835a
ci: replace deprecated Cray -G2 with -g -O0 for debug builds
sbryngelson Mar 26, 2026
407974b
ci: fix CCE optcg crashes by cleaning stale build artifacts before ca…
Mar 26, 2026
e836a1b
ci: fix benchmark checkout failures on NFS by preserving .slurm_job_i…
Mar 26, 2026
7ab2744
ci: split build and test into separate SLURM jobs
Mar 27, 2026
31f2300
Use cluster template for case-opt runs to avoid srun failures on Phoenix
sbryngelson Mar 29, 2026
2b92b9e
Fix case-opt: move -c flag before -- separator to avoid forwarding to…
sbryngelson Mar 29, 2026
c952529
Revert Frontier -j 1 workaround in bench.sh; stale artifact cleanup f…
sbryngelson Mar 29, 2026
50eddce
Remove CCE 19 pyrometheus workaround patch; test if still needed
sbryngelson Mar 29, 2026
533bd07
Revert "Remove CCE 19 pyrometheus workaround patch; test if still nee…
sbryngelson Mar 30, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 25 additions & 8 deletions .github/scripts/run_case_optimization.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,6 @@ if [ "$job_device" = "gpu" ] && [ "$ngpus" -eq 0 ]; then
ngpus=1
fi

# Verify the venv Python interpreter exists (created by ./mfc.sh build)
if [ ! -x build/venv/bin/python3 ]; then
echo "ERROR: build/venv/bin/python3 not found."
echo "The MFC build venv may not have been created. Was the pre-build step successful?"
exit 1
fi

benchmarks=(
benchmarks/5eq_rk3_weno3_hllc/case.py
benchmarks/viscous_weno5_sgb_acoustic/case.py
Expand All @@ -28,6 +21,30 @@ benchmarks=(
benchmarks/igr/case.py
)

# For Frontier/Frontier AMD: deps were fetched on the login node via --deps-only;
# build case-optimized binaries here on the compute node before running.
# For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job.
#
# Clean stale MFC target staging before building. On self-hosted CI runners,
# corrupted intermediate files from a prior failed build (e.g. CCE optcg crash)
# can persist and poison subsequent builds. Each case-opt config gets its own
# hash-named staging dir, but install dirs and other artifacts may be stale.
if [ "$job_cluster" != "phoenix" ]; then
# Clean stale MFC target dirs (hash-named) from prior builds, but
# preserve dependency dirs (hipfort, fftw, etc.) since the compute
# node has no internet to re-fetch them.
echo "=== Cleaning stale MFC target staging/install ==="
find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true

echo "=== Building case-optimized binaries on compute node ==="
for case in "${benchmarks[@]}"; do
echo "--- Building: $case ---"
./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8
done
echo "=== All case-optimized binaries built ==="
fi

passed=0
failed=0
failed_cases=""
Expand All @@ -44,7 +61,7 @@ for case in "${benchmarks[@]}"; do
rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"

# Build + run with --case-optimization, small grid, 10 timesteps
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -- --gbpp 1 --steps 10; then
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -c "$job_cluster" -- --gbpp 1 --steps 10; then
# Validate output
if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then
echo "PASS: $case_name"
Expand Down
13 changes: 9 additions & 4 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,42 +68,47 @@ jobs:
flag: f
device: gpu
interface: acc
build_script: "bash .github/workflows/frontier/build.sh gpu acc bench"
build_script: "bash .github/workflows/frontier/build.sh gpu acc"
- cluster: frontier
name: Oak Ridge | Frontier (CCE)
group: phoenix
labels: frontier
flag: f
device: gpu
interface: omp
build_script: "bash .github/workflows/frontier/build.sh gpu omp bench"
build_script: "bash .github/workflows/frontier/build.sh gpu omp"
- cluster: frontier_amd
name: Oak Ridge | Frontier (AMD)
group: phoenix
labels: frontier
flag: famd
device: gpu
interface: omp
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp"
continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }}
runs-on:
group: ${{ matrix.group }}
labels: ${{ matrix.labels }}
timeout-minutes: 480
steps:
- name: Clean stale output files
run: rm -f *.out

- name: Clone - PR
uses: actions/checkout@v4
with:
path: pr
clean: false

- name: Clone - Master
uses: actions/checkout@v4
with:
repository: MFlowCode/MFC
ref: master
path: master
clean: false

- name: Setup & Build
- name: Fetch Dependencies
if: matrix.build_script != ''
timeout-minutes: 150
run: |
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/common/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,18 @@ if [ "$job_cluster" = "phoenix" ]; then
trap 'rm -rf "$currentdir" || true' EXIT
fi

# --- Build (if not pre-built on login node) ---
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
# --- Build ---
# Phoenix builds everything inside SLURM (no login-node build step).
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
# source code is built here on the compute node.
# Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
if [ "$job_cluster" = "phoenix" ]; then
source .github/scripts/clean-build.sh
clean_build
fi

if [ ! -d "build" ]; then
source .github/scripts/retry-build.sh
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
fi
source .github/scripts/retry-build.sh
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1

# --- Bench cluster flag ---
if [ "$job_cluster" = "phoenix" ]; then
Expand Down
43 changes: 43 additions & 0 deletions .github/workflows/common/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
# Build-only script for all clusters.
# Runs inside a SLURM job via submit-slurm-job.sh.
# Builds MFC without running tests (--dry-run).
# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster

set -euo pipefail

source .github/scripts/gpu-opts.sh
build_opts="$gpu_opts"

# --- Phoenix TMPDIR setup ---
# Phoenix compute nodes have a small /tmp; point TMPDIR at project storage
# so the build's scratch files do not fill it up.
if [ "$job_cluster" = "phoenix" ]; then
    tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
    mkdir -p "$tmpbuild"
    # mktemp -d creates a unique directory atomically; the previous
    # run-$(( RANDOM % 9000 )) scheme only had 9000 possible names and
    # could collide when two CI jobs land on the same node.
    currentdir=$(mktemp -d "$tmpbuild/run-XXXXXXXX")
    export TMPDIR="$currentdir"
    trap 'rm -rf "$currentdir" || true' EXIT
fi

# --- Build ---
# Phoenix builds everything inside SLURM (no login-node build step).
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
# source code is built here on the compute node.
# Phoenix: always start fresh to avoid SIGILL from stale binaries compiled
# on a different microarchitecture.
if [ "$job_cluster" = "phoenix" ]; then
    source .github/scripts/clean-build.sh
    clean_build
fi

source .github/scripts/retry-build.sh

# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
# (SIGILL from binaries compiled on a different compute node).
validate_cmd=""
if [ "$job_cluster" = "phoenix" ]; then
    validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
fi

# NOTE: $build_opts is intentionally unquoted — it carries zero or more
# whitespace-separated flags that must word-split into separate arguments.
RETRY_VALIDATE_CMD="$validate_cmd" \
    retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1
31 changes: 3 additions & 28 deletions .github/workflows/common/test.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/bin/bash
# Unified test script for all clusters.
# Test-only script for all clusters.
# Runs inside a SLURM job via submit-slurm-job.sh.
# Assumes MFC is already built (by a prior build.sh SLURM job).
# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster

set -euo pipefail
Expand All @@ -9,9 +10,6 @@ source .github/scripts/gpu-opts.sh
build_opts="$gpu_opts"

# --- Phoenix TMPDIR setup ---
# Phoenix compute nodes have a small /tmp. With 8 parallel test threads each
# spawning MPI processes, it fills up and ORTE session dir creation fails.
# Redirect TMPDIR to project storage, same as bench.sh.
if [ "$job_cluster" = "phoenix" ]; then
tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
currentdir=$tmpbuild/run-$(( RANDOM % 9000 ))
Expand All @@ -21,29 +19,6 @@ if [ "$job_cluster" = "phoenix" ]; then
trap 'rm -rf "$currentdir" || true' EXIT
fi

# --- Build (if not pre-built on login node) ---
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
# to avoid SIGILL from stale binaries compiled on a different microarchitecture.
if [ "$job_cluster" = "phoenix" ]; then
source .github/scripts/clean-build.sh
clean_build
fi

if [ ! -d "build" ]; then
source .github/scripts/retry-build.sh

# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
# (SIGILL from binaries compiled on a different compute node).
validate_cmd=""
if [ "$job_cluster" = "phoenix" ]; then
validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
fi

RETRY_VALIDATE_CMD="$validate_cmd" \
retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1
fi

# --- GPU detection and thread count ---
device_opts=""
rdma_opts=""
Expand Down Expand Up @@ -88,4 +63,4 @@ if [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ]; then
prune_flag="--only-changes"
fi

./mfc.sh test -v --max-attempts 3 $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster
./mfc.sh test -v --max-attempts 3 --no-build $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster
7 changes: 1 addition & 6 deletions .github/workflows/frontier/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ esac

job_device=$1
job_interface=$2
run_bench=$3
source .github/scripts/gpu-opts.sh
build_opts="$gpu_opts"

Expand All @@ -24,8 +23,4 @@ source .github/scripts/clean-build.sh
clean_build

source .github/scripts/retry-build.sh
if [ "$run_bench" == "bench" ]; then
retry_build ./mfc.sh build -j 8 $build_opts || exit 1
else
retry_build ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts || exit 1
fi
retry_build ./mfc.sh build --deps-only -j 8 $build_opts || exit 1
32 changes: 23 additions & 9 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -400,11 +400,14 @@ jobs:
echo "Coverage cache: none available — full test suite will run"
fi

- name: Build (login node)
- name: Fetch Dependencies
if: matrix.cluster != 'phoenix'
timeout-minutes: 60
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}

- name: Build
run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/build.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}

- name: Test
run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}

Expand All @@ -421,23 +424,29 @@ jobs:
if: always()
id: log
run: |
SLUG="test-${{ matrix.device }}-${{ matrix.interface }}"
SHARD_SUFFIX=""
SHARD="${{ matrix.shard }}"
if [ -n "$SHARD" ]; then
SLUG="${SLUG}-$(echo "$SHARD" | sed 's|/|-of-|')"
SHARD_SUFFIX="-$(echo "$SHARD" | sed 's|/|-of-|')"
fi
echo "slug=${SLUG}" >> "$GITHUB_OUTPUT"
echo "build_slug=build-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT"
echo "test_slug=test-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT"

- name: Print Logs
if: always()
run: cat ${{ steps.log.outputs.slug }}.out
run: |
for f in ${{ steps.log.outputs.build_slug }}.out ${{ steps.log.outputs.test_slug }}.out; do
[ -f "$f" ] && echo "=== $f ===" && cat "$f"
done

- name: Archive Logs
uses: actions/upload-artifact@v4
if: matrix.cluster != 'phoenix'
with:
name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.slug }}
path: ${{ steps.log.outputs.slug }}.out
name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.test_slug }}
path: |
${{ steps.log.outputs.build_slug }}.out
${{ steps.log.outputs.test_slug }}.out

case-optimization:
name: "Case Opt | ${{ matrix.cluster_name }} (${{ matrix.device }}-${{ matrix.interface }})"
Expand Down Expand Up @@ -486,15 +495,20 @@ jobs:
- name: Clean stale output files
run: rm -f *.out

- name: Fetch Dependencies
if: matrix.cluster != 'phoenix'
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}

- name: Pre-Build (SLURM)
if: matrix.cluster == 'phoenix'
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Pre-Build (login node)
- name: Build & Run Case-Optimization Tests
if: matrix.cluster != 'phoenix'
run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Run Case-Optimization Tests
if: matrix.cluster == 'phoenix'
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Cancel SLURM Jobs
Expand Down
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
endif()
elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
add_compile_options(
"SHELL:-M 296,878,1391,1069,5025"
"SHELL:-M 296,878,1391,1069,990,5025,7208,7212,7242"
"SHELL:-h static" "SHELL:-h keepfiles"
"SHELL:-h acc_model=auto_async_none"
"SHELL: -h acc_model=no_fast_addr"
Expand All @@ -190,9 +190,9 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
add_compile_options(
"SHELL:-h acc_model=auto_async_none"
"SHELL: -h acc_model=no_fast_addr"
"SHELL: -K trap=fp" "SHELL: -G2"
"SHELL: -K trap=fp" "SHELL: -g" "SHELL: -O0"
)
add_link_options("SHELL: -K trap=fp" "SHELL: -G2")
add_link_options("SHELL: -K trap=fp" "SHELL: -g" "SHELL: -O0")
endif()

elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Flang")
Expand Down
25 changes: 25 additions & 0 deletions toolchain/mfc/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,12 @@ def __build_target(target: typing.Union[MFCTarget, str], case: input.MFCInputFil

history.add(target.name)

# Dependencies are pinned to fixed versions. If already configured
# (built & installed by a prior --deps-only step), skip entirely
# to avoid re-entering the superbuild (which may access the network).
if target.isDependency and target.is_configured(case):
return

for dep in target.requires.compute():
# If we have already built and installed this target,
# do not do so again. This can be inferred by whether
Expand Down Expand Up @@ -594,6 +600,25 @@ def build(targets=None, case: input.MFCInputFile = None, history: typing.Set[str
case = case or input.load(ARG("input"), ARG("--"), {})
case.validate_params()

if ARG("deps_only", False) and len(history) == 0:
all_deps = set()
for t in targets:
resolved = get_target(t)
for dep in resolved.requires.compute():
all_deps.add(dep)

cons.print(f"[bold]Fetch Dependencies | {format_list_to_string([d.name for d in all_deps], 'magenta', 'None')}[/bold]")
cons.print(no_indent=True)

if not all_deps:
cons.print("[yellow]No dependencies to build for the requested targets.[/yellow]")
return

for dep in all_deps:
__build_target(dep, case, history)

return

if len(history) == 0:
cons.print(__generate_header(case, targets))
cons.print(no_indent=True)
Expand Down
7 changes: 7 additions & 0 deletions toolchain/mfc/cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,13 @@
default=False,
dest="case_optimization",
),
Argument(
name="deps-only",
help="Only fetch and build dependencies, do not build MFC targets.",
action=ArgAction.STORE_TRUE,
default=False,
dest="deps_only",
),
],
examples=[
Example("./mfc.sh build", "Build all default targets (CPU)"),
Expand Down
Loading