MFlowCode · sbryngelson · Feb 28, 2026 · Feb 19, 2026 · Feb 19, 2026 · Feb 20, 2026
@@ -14,50 +14,27 @@ device="$2"
 interface="$3"
 cluster="$4"
 
-# Get the directory where this script lives
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
 echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
 cd "$dir"
 
-# Submit job
-submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \
-  .github/workflows/$cluster/bench.sh "$device" "$interface" 2>&1)
-
-job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p')
-job_slug="bench-$device-$interface"
-output_file="${job_slug}.out"
-
-if [ -z "$job_id" ]; then
-  echo "[$dir] ERROR: Failed to submit job"
-  echo "$submit_output"
-  exit 1
-fi
-
-echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"
-
-# Use the monitoring script from PR (where this script lives)
-monitor_exit=0
-bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
-if [ "$monitor_exit" -ne 0 ]; then
-  echo "[$dir] WARNING: SLURM job exited with code $monitor_exit"
-else
-  echo "[$dir] Monitoring complete for job $job_id"
-fi
+# Submit and monitor job (submit.sh auto-detects bench mode from script name); paths are quoted so $cluster cannot word-split or glob
+bash ".github/workflows/$cluster/submit.sh" \
+    ".github/workflows/$cluster/bench.sh" "$device" "$interface"
 
 # Verify the YAML output file was created
+job_slug="bench-$device-$interface"
 yaml_file="${job_slug}.yaml"
 if [ ! -f "$yaml_file" ]; then
-  echo "[$dir] ERROR: Expected output file not found: $yaml_file"
-  echo "[$dir] Directory contents:"
-  ls -la *.yaml 2>/dev/null || echo "  No YAML files found"
-  echo ""
-  echo "[$dir] Last 100 lines of job output ($output_file):"
-  echo "----------------------------------------"
-  tail -n 100 "$output_file" 2>/dev/null || echo "  Could not read output file"
-  echo "----------------------------------------"
-  exit 1
+    echo "[$dir] ERROR: Expected output file not found: $yaml_file"
+    echo "[$dir] Directory contents:"
+    ls -la -- *.yaml 2>/dev/null || echo "  No YAML files found"  # "--" keeps a name starting with "-" from being read as an option
+    echo ""
+    output_file="${job_slug}.out"  # presumably the same <slug>.out file the submit script hands to sbatch -o; confirm if the slug rule changes
+    echo "[$dir] Last 100 lines of job output ($output_file):"
+    echo "----------------------------------------"
+    tail -n 100 -- "$output_file" 2>/dev/null || echo "  Could not read output file"
+    echo "----------------------------------------"
+    exit 1
 fi
 
 echo "[$dir] Verified output file exists: $yaml_file ($(stat -f%z "$yaml_file" 2>/dev/null || stat -c%s "$yaml_file" 2>/dev/null) bytes)"
-
@@ -13,6 +13,9 @@ concurrency:
 jobs:
   file-changes:
     name: Detect File Changes
+    if: >
+      github.event_name != 'pull_request_review' ||
+      github.event.review.user.type != 'Bot'
     runs-on: 'ubuntu-latest'
     outputs:
       checkall: ${{ steps.changes.outputs.checkall }}

@@ -16,7 +16,7 @@ if [ "$job_device" = "gpu" ]; then
 fi
 
 if [ "$job_device" = "gpu" ]; then
-    ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks
+    ./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c "$job_cluster" $device_opts -n $n_ranks  # job_cluster quoted: it is a single word, unlike $device_opts which must split
 else
-    ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks
+    ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c "$job_cluster" $device_opts -n $n_ranks
 fi
@@ -3,6 +3,15 @@
 # Ignore SIGHUP to survive login node session drops
 trap '' HUP
 
+# Map this script's directory name (the cluster) to the compiler flag given to `mfc.sh load -c`; unknown directories abort
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cluster_name="$(basename "$SCRIPT_DIR")"
+case "$cluster_name" in
+    frontier)     compiler_flag="f" ;;
+    frontier_amd) compiler_flag="famd" ;;
+    *) echo "ERROR: Unknown cluster '$cluster_name'" >&2; exit 1 ;;  # diagnostics belong on stderr
+esac
+
 job_device=$1
 job_interface=$2
 run_bench=$3
@@ -16,28 +25,25 @@ if [ "$job_device" = "gpu" ]; then
   fi
 fi
 
-. ./mfc.sh load -c f -m g
+. ./mfc.sh load -c $compiler_flag -m g
 
 # Only set up build cache for test suite, not benchmarks
 if [ "$run_bench" != "bench" ]; then
-    source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface"
+    source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface"
 fi
 
 max_attempts=3
 attempt=1
 while [ $attempt -le $max_attempts ]; do
     echo "Build attempt $attempt of $max_attempts..."
     if [ "$run_bench" == "bench" ]; then
-        build_cmd_ok=true
-        for dir in benchmarks/*/; do
-            dirname=$(basename "$dir")
-            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
-                build_cmd_ok=false
-                break
-            fi
-        done
+        if ./mfc.sh build -j 8 $build_opts; then
+            build_cmd_ok=true
+        else
+            build_cmd_ok=false
+        fi
     else
-        if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then
+        if ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts; then
             build_cmd_ok=true
         else
             build_cmd_ok=false

@@ -5,8 +5,17 @@ set -e
 # Ignore SIGHUP to survive login node session drops
 trap '' HUP
 
+# Map this script's directory name (the cluster) to the compiler flag given to `mfc.sh load -c`; unknown directories abort
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cluster_name="$(basename "$SCRIPT_DIR")"
+case "$cluster_name" in
+    frontier)     compiler_flag="f" ;;
+    frontier_amd) compiler_flag="famd" ;;
+    *) echo "ERROR: Unknown cluster '$cluster_name'" >&2; exit 1 ;;  # diagnostics belong on stderr
+esac
+
 usage() {
-    echo "Usage: $0 [script.sh] [cpu|gpu]"
+    echo "Usage: $0 <script.sh> <cpu|gpu> [none|acc|omp] [shard]"  # <...> = required (script exits 1 without them), [...] = optional
 }
 
 if [ ! -z "$1" ]; then
@@ -16,6 +25,13 @@ else
     exit 1
 fi
 
+# Classify the job by the submitted script's basename: names starting with "bench" are benchmarks, anything else is a test run
+script_basename="$(basename "$1" .sh)"
+job_type="test"
+if [[ "$script_basename" == bench* ]]; then
+    job_type="bench"
+fi
+
 if [ "$2" = "cpu" ]; then
     sbatch_device_opts="\
 #SBATCH -n 32                       # Number of cores required"
@@ -27,19 +43,36 @@ else
     exit 1
 fi
 
+# Select SBATCH params based on job type.
+# Start from the test-suite settings; a benchmark job overrides all four values below.
+sbatch_account="#SBATCH -A CFD154"
+sbatch_time="#SBATCH -t 01:59:00"
+sbatch_partition="#SBATCH -p batch"
+sbatch_extra="#SBATCH --qos=normal"
+if [ "$job_type" = "bench" ]; then
+    sbatch_account="#SBATCH -A ENG160"
+    sbatch_time="#SBATCH -t 05:59:00"
+    sbatch_partition="#SBATCH -p extended"
+    sbatch_extra=""
+fi
 
-job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
+shard_suffix=""
+if [ -n "$4" ]; then
+    shard_suffix="-${4/\//-of-}"  # first "/" becomes "-of-" (e.g. "1/2" -> "-1-of-2") so the slug is safe as a file name
+fi
+job_slug="$(basename "$1" | sed -e 's/\.sh$//' -e 's/[^a-zA-Z0-9]/-/g')-$2-$3${shard_suffix}"
 output_file="$job_slug.out"
 
 submit_output=$(sbatch <<EOT
 #!/bin/bash
 #SBATCH -J MFC-$job_slug            # Job name
-#SBATCH -A ENG160                  # charge account
+$sbatch_account
 #SBATCH -N 1                       # Number of nodes required
 $sbatch_device_opts
-#SBATCH -t 05:59:00                # Duration of the job (Ex: 15 mins)
+$sbatch_time
 #SBATCH -o$output_file             # Combined output and error messages file
-#SBATCH -p extended                # Extended partition for shorter queues
+$sbatch_partition
+$sbatch_extra
 
 set -e
 set -x
@@ -50,8 +83,10 @@ echo "Running in $(pwd):"
 job_slug="$job_slug"
 job_device="$2"
 job_interface="$3"
+job_shard="$4"
+job_cluster="$cluster_name"
 
-. ./mfc.sh load -c f -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
+. ./mfc.sh load -c $compiler_flag -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
 
 $sbatch_script_contents
 
@@ -68,5 +103,4 @@ fi
 echo "Submitted batch job $job_id"
 
 # Use resilient monitoring instead of sbatch -W
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
@@ -13,8 +13,17 @@ if [ "$job_device" = "gpu" ]; then
     fi
 fi
 
+shard_opts=""  # expanded unquoted below on purpose so "--shard <value>" splits into two arguments
+if [ -n "$job_shard" ]; then
+    shard_opts="--shard $job_shard"
+fi
+
 if [ "$job_device" = "gpu" ]; then
-    ./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
+    rdma_opts=""  # --rdma-mpi is only passed when the cluster is exactly "frontier"
+    if [ "$job_cluster" = "frontier" ]; then
+        rdma_opts="--rdma-mpi"
+    fi
+    ./mfc.sh test -v -a $rdma_opts --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c "$job_cluster"
 else
-    ./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
+    ./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu $shard_opts -- -c "$job_cluster"
 fi
@@ -16,7 +16,7 @@ if [ "$job_device" = "gpu" ]; then
 fi
 
 if [ "$job_device" = "gpu" ]; then
-    ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier_amd $device_opts -n $n_ranks
+    ./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c "$job_cluster" $device_opts -n $n_ranks  # job_cluster quoted: it is a single word, unlike $device_opts which must split
 else
-    ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier_amd $device_opts -n $n_ranks
+    ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c "$job_cluster" $device_opts -n $n_ranks
 fi
@@ -3,6 +3,15 @@
 # Ignore SIGHUP to survive login node session drops
 trap '' HUP
 
+# Map this script's directory name (the cluster) to the compiler flag given to `mfc.sh load -c`; unknown directories abort
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cluster_name="$(basename "$SCRIPT_DIR")"
+case "$cluster_name" in
+    frontier)     compiler_flag="f" ;;
+    frontier_amd) compiler_flag="famd" ;;
+    *) echo "ERROR: Unknown cluster '$cluster_name'" >&2; exit 1 ;;  # diagnostics belong on stderr
+esac
+
 job_device=$1
 job_interface=$2
 run_bench=$3
@@ -16,28 +25,25 @@ if [ "$job_device" = "gpu" ]; then
   fi
 fi
 
-. ./mfc.sh load -c famd -m g
+. ./mfc.sh load -c $compiler_flag -m g
 
 # Only set up build cache for test suite, not benchmarks
 if [ "$run_bench" != "bench" ]; then
-    source .github/scripts/setup-build-cache.sh frontier_amd "$job_device" "$job_interface"
+    source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface"
 fi
 
 max_attempts=3
 attempt=1
 while [ $attempt -le $max_attempts ]; do
     echo "Build attempt $attempt of $max_attempts..."
     if [ "$run_bench" == "bench" ]; then
-        build_cmd_ok=true
-        for dir in benchmarks/*/; do
-            dirname=$(basename "$dir")
-            if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
-                build_cmd_ok=false
-                break
-            fi
-        done
+        if ./mfc.sh build -j 8 $build_opts; then
+            build_cmd_ok=true
+        else
+            build_cmd_ok=false
+        fi
     else
-        if ./mfc.sh test -v -a --dry-run -j 8 $build_opts; then
+        if ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts; then
             build_cmd_ok=true
         else
             build_cmd_ok=false