Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
4535fc3
Add test sharding, proactive clean, and retry logic for self-hosted CI
sbryngelson Feb 19, 2026
8b2f712
Add --shard and failed_uuids.txt support to test toolchain
sbryngelson Feb 19, 2026
9de84d6
Fix stale failed_uuids.txt on abort, guard empty retry, quote nproc
sbryngelson Feb 20, 2026
92deb41
Remove proactive clean from Phoenix test script
sbryngelson Feb 20, 2026
877127b
Skip benchmark workflow for bot review events
sbryngelson Feb 21, 2026
eaab95a
Fix CI edge cases: guard os.remove, skip bare -- flag, use -s for emp…
sbryngelson Feb 21, 2026
aa26048
Fix --only filter silently matching zero tests with multiple UUIDs
sbryngelson Feb 23, 2026
b5c095f
Remove redundant NUM_FAILED > 0 guard in test retry logic
sbryngelson Feb 26, 2026
6b43b9b
Address review findings: shard slug collision, script consolidation, …
sbryngelson Feb 26, 2026
adac688
Use AND logic for labels and OR logic for UUIDs in --only filter
sbryngelson Feb 26, 2026
06c0641
Consolidate CI submit scripts: merge submit-bench.sh into submit.sh
sbryngelson Feb 26, 2026
a1c55ed
Use normal QOS instead of hackathon for Frontier test jobs
sbryngelson Feb 26, 2026
d561283
Add zero-test guard after shard filtering and pin retry action to SHA
sbryngelson Feb 26, 2026
8b48e30
Merge branch 'master' into ci-test
sbryngelson Feb 26, 2026
46dcd73
Trigger CI
sbryngelson Feb 26, 2026
a2431bf
Rename ambiguous single-letter variable `l` to `label` in _filter_only
sbryngelson Feb 27, 2026
73fd804
Remove case-optimization from benchmarks, use single generic build
sbryngelson Feb 27, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 14 additions & 37 deletions .github/scripts/submit_and_monitor_bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,50 +14,27 @@ device="$2"
interface="$3"
cluster="$4"

# Get the directory where this script lives
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
cd "$dir"

# Submit job
submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \
.github/workflows/$cluster/bench.sh "$device" "$interface" 2>&1)

job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p')
job_slug="bench-$device-$interface"
output_file="${job_slug}.out"

if [ -z "$job_id" ]; then
echo "[$dir] ERROR: Failed to submit job"
echo "$submit_output"
exit 1
fi

echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"

# Use the monitoring script from PR (where this script lives)
monitor_exit=0
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
if [ "$monitor_exit" -ne 0 ]; then
echo "[$dir] WARNING: SLURM job exited with code $monitor_exit"
else
echo "[$dir] Monitoring complete for job $job_id"
fi
# Submit and monitor job (submit.sh auto-detects bench mode from script name)
bash .github/workflows/$cluster/submit.sh \
.github/workflows/$cluster/bench.sh "$device" "$interface"

# Verify the YAML output file was created
job_slug="bench-$device-$interface"
yaml_file="${job_slug}.yaml"
if [ ! -f "$yaml_file" ]; then
echo "[$dir] ERROR: Expected output file not found: $yaml_file"
echo "[$dir] Directory contents:"
ls -la *.yaml 2>/dev/null || echo " No YAML files found"
echo ""
echo "[$dir] Last 100 lines of job output ($output_file):"
echo "----------------------------------------"
tail -n 100 "$output_file" 2>/dev/null || echo " Could not read output file"
echo "----------------------------------------"
exit 1
echo "[$dir] ERROR: Expected output file not found: $yaml_file"
echo "[$dir] Directory contents:"
ls -la *.yaml 2>/dev/null || echo " No YAML files found"
echo ""
output_file="${job_slug}.out"
echo "[$dir] Last 100 lines of job output ($output_file):"
echo "----------------------------------------"
tail -n 100 "$output_file" 2>/dev/null || echo " Could not read output file"
echo "----------------------------------------"
exit 1
fi

echo "[$dir] Verified output file exists: $yaml_file ($(stat -f%z "$yaml_file" 2>/dev/null || stat -c%s "$yaml_file" 2>/dev/null) bytes)"

3 changes: 3 additions & 0 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ concurrency:
jobs:
file-changes:
name: Detect File Changes
if: >
github.event_name != 'pull_request_review' ||
github.event.review.user.type != 'Bot'
runs-on: 'ubuntu-latest'
outputs:
checkall: ${{ steps.changes.outputs.checkall }}
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/frontier/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if [ "$job_device" = "gpu" ]; then
fi

if [ "$job_device" = "gpu" ]; then
./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks
./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
else
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
fi
28 changes: 17 additions & 11 deletions .github/workflows/frontier/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@
# Ignore SIGHUP to survive login node session drops
trap '' HUP

# Work out which cluster this script belongs to from the name of the
# directory it lives in, then map that name to the compiler flag that is
# handed to `./mfc.sh load -c` further down. Unrecognised directory names
# abort the script with exit status 1.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cluster_name="$(basename "$SCRIPT_DIR")"
if [ "$cluster_name" = "frontier" ]; then
    compiler_flag="f"
elif [ "$cluster_name" = "frontier_amd" ]; then
    compiler_flag="famd"
else
    echo "ERROR: Unknown cluster '$cluster_name'"
    exit 1
fi

job_device=$1
job_interface=$2
run_bench=$3
Expand All @@ -16,28 +25,25 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

. ./mfc.sh load -c f -m g
. ./mfc.sh load -c $compiler_flag -m g

# Only set up build cache for test suite, not benchmarks
if [ "$run_bench" != "bench" ]; then
source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface"
source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface"
fi

max_attempts=3
attempt=1
while [ $attempt -le $max_attempts ]; do
echo "Build attempt $attempt of $max_attempts..."
if [ "$run_bench" == "bench" ]; then
build_cmd_ok=true
for dir in benchmarks/*/; do
dirname=$(basename "$dir")
if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
build_cmd_ok=false
break
fi
done
if ./mfc.sh build -j 8 $build_opts; then
build_cmd_ok=true
else
build_cmd_ok=false
fi
else
if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then
if ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts; then
build_cmd_ok=true
else
build_cmd_ok=false
Expand Down
54 changes: 0 additions & 54 deletions .github/workflows/frontier/submit-bench.sh

This file was deleted.

48 changes: 41 additions & 7 deletions .github/workflows/frontier/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,17 @@ set -e
# Ignore SIGHUP to survive login node session drops
trap '' HUP

# Work out which cluster this script belongs to from the name of the
# directory it lives in, then map that name to the compiler flag used by
# the `./mfc.sh load -c` line inside the generated batch script.
# Unrecognised directory names abort the script with exit status 1.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cluster_name="$(basename "$SCRIPT_DIR")"
if [ "$cluster_name" = "frontier" ]; then
    compiler_flag="f"
elif [ "$cluster_name" = "frontier_amd" ]; then
    compiler_flag="famd"
else
    echo "ERROR: Unknown cluster '$cluster_name'"
    exit 1
fi

usage() {
echo "Usage: $0 [script.sh] [cpu|gpu]"
echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp] [shard]"
}

if [ ! -z "$1" ]; then
Expand All @@ -16,6 +25,13 @@ else
exit 1
fi

# Classify the submission by the name of the script being submitted ($1):
# a basename (with any ".sh" suffix removed) that starts with "bench" marks
# a benchmark job; every other script is treated as a test job.
script_basename="$(basename "$1" .sh)"
job_type="test"
if [[ "$script_basename" == bench* ]]; then
    job_type="bench"
fi

if [ "$2" = "cpu" ]; then
sbatch_device_opts="\
#SBATCH -n 32 # Number of cores required"
Expand All @@ -27,19 +43,36 @@ else
exit 1
fi

# Pick the #SBATCH directive lines that depend on the kind of job.
# Each variable holds one complete directive line that is interpolated into
# the batch script below; sbatch_extra is empty for benchmark jobs, which
# leaves a blank line in the generated script.
case "$job_type" in
    bench)
        sbatch_account="#SBATCH -A ENG160"
        sbatch_time="#SBATCH -t 05:59:00"
        sbatch_partition="#SBATCH -p extended"
        sbatch_extra=""
        ;;
    *)
        sbatch_account="#SBATCH -A CFD154"
        sbatch_time="#SBATCH -t 01:59:00"
        sbatch_partition="#SBATCH -p batch"
        sbatch_extra="#SBATCH --qos=normal"
        ;;
esac

job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
# Optional 4th argument is the shard spec. When present it becomes a
# slug-safe suffix: a leading "-" plus the spec with its first "/" replaced
# by "-of-" (e.g. "1/4" -> "-1-of-4"). Without a shard the suffix is empty.
if [ -z "$4" ]; then
    shard_suffix=""
else
    shard_suffix="-$(echo "$4" | sed 's|/|-of-|')"
fi
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3${shard_suffix}"
output_file="$job_slug.out"

submit_output=$(sbatch <<EOT
#!/bin/bash
#SBATCH -J MFC-$job_slug # Job name
#SBATCH -A ENG160 # charge account
$sbatch_account
#SBATCH -N 1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
$sbatch_time
#SBATCH -o$output_file # Combined output and error messages file
#SBATCH -p extended # Extended partition for shorter queues
$sbatch_partition
$sbatch_extra

set -e
set -x
Expand All @@ -50,8 +83,10 @@ echo "Running in $(pwd):"
job_slug="$job_slug"
job_device="$2"
job_interface="$3"
job_shard="$4"
job_cluster="$cluster_name"

. ./mfc.sh load -c f -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
. ./mfc.sh load -c $compiler_flag -m $([ "$2" = "gpu" ] && echo "g" || echo "c")

$sbatch_script_contents

Expand All @@ -68,5 +103,4 @@ fi
echo "Submitted batch job $job_id"

# Use resilient monitoring instead of sbatch -W
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
13 changes: 11 additions & 2 deletions .github/workflows/frontier/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,17 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

# Build the optional sharding arguments for `./mfc.sh test`:
# "--shard <job_shard>" when job_shard is non-empty, otherwise an empty string.
shard_opts="${job_shard:+--shard $job_shard}"

if [ "$job_device" = "gpu" ]; then
./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
rdma_opts=""
if [ "$job_cluster" = "frontier" ]; then
rdma_opts="--rdma-mpi"
fi
./mfc.sh test -v -a $rdma_opts --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c $job_cluster
else
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu $shard_opts -- -c $job_cluster
fi
4 changes: 2 additions & 2 deletions .github/workflows/frontier_amd/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if [ "$job_device" = "gpu" ]; then
fi

if [ "$job_device" = "gpu" ]; then
./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier_amd $device_opts -n $n_ranks
./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
else
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier_amd $device_opts -n $n_ranks
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
fi
28 changes: 17 additions & 11 deletions .github/workflows/frontier_amd/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@
# Ignore SIGHUP to survive login node session drops
trap '' HUP

# Work out which cluster this script belongs to from the name of the
# directory it lives in, then map that name to the compiler flag that is
# handed to `./mfc.sh load -c` further down. Unrecognised directory names
# abort the script with exit status 1.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cluster_name="$(basename "$SCRIPT_DIR")"
if [ "$cluster_name" = "frontier" ]; then
    compiler_flag="f"
elif [ "$cluster_name" = "frontier_amd" ]; then
    compiler_flag="famd"
else
    echo "ERROR: Unknown cluster '$cluster_name'"
    exit 1
fi

job_device=$1
job_interface=$2
run_bench=$3
Expand All @@ -16,28 +25,25 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

. ./mfc.sh load -c famd -m g
. ./mfc.sh load -c $compiler_flag -m g

# Only set up build cache for test suite, not benchmarks
if [ "$run_bench" != "bench" ]; then
source .github/scripts/setup-build-cache.sh frontier_amd "$job_device" "$job_interface"
source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface"
fi

max_attempts=3
attempt=1
while [ $attempt -le $max_attempts ]; do
echo "Build attempt $attempt of $max_attempts..."
if [ "$run_bench" == "bench" ]; then
build_cmd_ok=true
for dir in benchmarks/*/; do
dirname=$(basename "$dir")
if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
build_cmd_ok=false
break
fi
done
if ./mfc.sh build -j 8 $build_opts; then
build_cmd_ok=true
else
build_cmd_ok=false
fi
else
if ./mfc.sh test -v -a --dry-run -j 8 $build_opts; then
if ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts; then
build_cmd_ok=true
else
build_cmd_ok=false
Expand Down
Loading
Loading