---
# GitHub Actions workflow: build and push SWE-Bench agent-server images to GHCR.
name: Build SWE-Bench Images

# Triggered either by adding a "build-swebench*" label to a PR, or manually
# with tunable inputs. All inputs are strings so blank ("") can mean "unset".
on:
  pull_request_target:
    types: [labeled]
  workflow_dispatch:
    inputs:
      dataset:
        description: 'Dataset name (e.g., princeton-nlp/SWE-bench_Verified)'
        required: true
        default: 'princeton-nlp/SWE-bench_Verified'
        type: string
      split:
        description: 'Dataset split (e.g., test, dev)'
        required: true
        default: 'test'
        type: string
      max-workers:
        description: 'Number of concurrent builds'
        required: false
        default: '12'
        type: string
      max-retries:
        description: 'Retries per image build'
        required: false
        default: '5'
        type: string
      n-limit:
        description: 'Limit number of images to build (for testing). Leave blank for no limit.'
        required: false
        default: ''
        type: string
      instance-ids:
        description: 'Comma-separated instance IDs to build (optional, overrides n-limit)'
        required: false
        default: ''
        type: string
      sdk-commit:
        description: 'Software Agent SDK commit/ref to use. Leave blank to use submodule default.'
        required: false
        default: ''
        type: string
      benchmarks-commit:
        description: >-
          Benchmarks repository commit/ref to use.
          Leave blank to use the PR head or main branch.
          Useful for evaluating older SDK versions that are incompatible with
          current benchmarks code (e.g., SDK versions before the critic module
          was added in commit 79868ae5).
        required: false
        default: ''
        type: string
# Defaults for automatic runs; keep INSTANCE_IDS/SELECT_FILE initialized so set -euo pipefail won't fail on unset vars.
env:
  # Dataset selection defaults (overridden by workflow_dispatch inputs or PR labels).
  DATASET: princeton-nlp/SWE-bench_Verified
  SPLIT: test
  MAX_WORKERS: '12'
  MAX_RETRIES: '5'
  N_LIMIT: '500'
  INSTANCE_IDS: ''
  SELECT_FILE: ''
  # BuildKit tuning knobs for the build/preflight steps.
  BUILD_BATCH_SIZE: '15'
  BUILDKIT_PRUNE_KEEP_GB: '60'
  BUILDKIT_PRUNE_THRESHOLD_PCT: '60'
# Serialize runs per ref; never cancel an in-flight image build.
concurrency:
  group: build-swe-bench-${{ github.ref }}
  cancel-in-progress: false
jobs:
  build-and-push:
    # Run on manual dispatch, or when a PR receives one of the build-swebench* labels.
    # NOTE(review): pull_request_target runs with elevated permissions against the
    # PR head; the label gate below is the guard — confirm only trusted roles can label.
    if: >
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'pull_request_target' &&
      (github.event.label.name == 'build-swebench' ||
      github.event.label.name == 'build-swebench-50' ||
      github.event.label.name == 'build-swebench-200'))
    runs-on:
      labels: blacksmith-32vcpu-ubuntu-2204
    # Allow pushing to GHCR and commenting on issues
    permissions:
      contents: read
      packages: write
      issues: write
    steps:
      # Pick which ref to checkout: explicit benchmarks-commit input > PR head > default.
      - name: Determine checkout ref
        id: checkout-ref
        run: |
          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ -n "${{ inputs.benchmarks-commit }}" ]; then
            echo "ref=${{ inputs.benchmarks-commit }}" >> "$GITHUB_OUTPUT"
            echo "Using benchmarks-commit from workflow_dispatch: ${{ inputs.benchmarks-commit }}"
          elif [ -n "${{ github.event.pull_request.head.sha }}" ]; then
            echo "ref=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT"
            echo "Using PR head SHA: ${{ github.event.pull_request.head.sha }}"
          else
            # Empty ref means checkout the ref that triggered the workflow (e.g., main branch for workflow_dispatch)
            echo "ref=" >> "$GITHUB_OUTPUT"
            echo "Using default ref (the commit that triggered this workflow)"
          fi
      - uses: actions/checkout@v6
        with:
          # When ref is empty, actions/checkout uses the commit that triggered the workflow
          ref: ${{ steps.checkout-ref.outputs.ref }}
          submodules: recursive
      # If this was a manual dispatch, override defaults with provided inputs.
      - name: Apply workflow_dispatch overrides (if any)
        if: ${{ github.event_name == 'workflow_dispatch' }}
        run: |
          if [ -n "${{ inputs.dataset }}" ]; then echo "DATASET=${{ inputs.dataset }}" >> "$GITHUB_ENV"; fi
          if [ -n "${{ inputs.split }}" ]; then echo "SPLIT=${{ inputs.split }}" >> "$GITHUB_ENV"; fi
          if [ -n "${{ inputs.max-workers }}" ]; then echo "MAX_WORKERS=${{ inputs.max-workers }}" >> "$GITHUB_ENV"; fi
          if [ -n "${{ inputs.max-retries }}" ]; then echo "MAX_RETRIES=${{ inputs.max-retries }}" >> "$GITHUB_ENV"; fi
          # Empty string means "no limit"
          if [ -n "${{ inputs.n-limit }}" ]; then echo "N_LIMIT=${{ inputs.n-limit }}" >> "$GITHUB_ENV"; else echo "N_LIMIT=" >> "$GITHUB_ENV"; fi
          if [ -n "${{ inputs.instance-ids }}" ]; then echo "INSTANCE_IDS=${{ inputs.instance-ids }}" >> "$GITHUB_ENV"; fi
      # Set N_LIMIT based on the label that triggered the workflow
      - name: Set N_LIMIT based on label
        if: ${{ github.event_name == 'pull_request_target' }}
        run: |
          LABEL_NAME="${{ github.event.label.name }}"
          if [ "$LABEL_NAME" = "build-swebench-50" ]; then
            echo "N_LIMIT=50" >> "$GITHUB_ENV"
            echo "Building 50 images based on label: build-swebench-50"
          elif [ "$LABEL_NAME" = "build-swebench-200" ]; then
            echo "N_LIMIT=200" >> "$GITHUB_ENV"
            echo "Building 200 images based on label: build-swebench-200"
          elif [ "$LABEL_NAME" = "build-swebench" ]; then
            echo "N_LIMIT=" >> "$GITHUB_ENV"
            echo "Building all images based on label: build-swebench"
          fi
      # Turn the comma-separated INSTANCE_IDS into a one-ID-per-line select file.
      - name: Build selected instances file
        run: |
          set -euo pipefail
          if [ -z "${INSTANCE_IDS}" ]; then
            echo "No instance IDs provided; skipping select file creation."
            exit 0
          fi
          SELECT_FILE="${RUNNER_TEMP}/selected-instances.txt"
          echo "Creating selected instances file at ${SELECT_FILE}"
          # Split on commas, trim surrounding whitespace, drop empty entries.
          echo "${INSTANCE_IDS}" \
            | tr ',' '\n' \
            | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
            | sed '/^$/d' > "${SELECT_FILE}"
          echo "SELECT_FILE=${SELECT_FILE}" >> "$GITHUB_ENV"
          # Skip n-limit when explicit instance IDs are provided to avoid double filtering
          echo "N_LIMIT=" >> "$GITHUB_ENV"
          echo "Selected instance IDs:"
          cat "${SELECT_FILE}"
      # Update SDK submodule to specific commit if provided
      # Must run BEFORE install dependencies so git submodule update works correctly
      - name: Update SDK submodule
        if: ${{ github.event_name == 'workflow_dispatch' && inputs.sdk-commit != '' }}
        run: |
          cd vendor/software-agent-sdk
          git fetch origin ${{ inputs.sdk-commit }}
          git checkout FETCH_HEAD
          SDK_SHA=$(git rev-parse HEAD)
          cd ../..
          # Stage the submodule reference update so make build uses it
          git add vendor/software-agent-sdk
          echo "Updated SDK submodule to $SDK_SHA (from ${{ inputs.sdk-commit }})"
      - name: Set up Docker Buildx with Blacksmith
        uses: useblacksmith/setup-docker-builder@v1
      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
      - name: Install dependencies
        run: |
          make build
      - name: "Preflight: prune cache and verify BuildKit disk"
        run: |
          set -euo pipefail
          # Use the configured prune target from env; fall back to 60 GiB if unset.
          KEEP_GB="${BUILDKIT_PRUNE_KEEP_GB:-60}"
          echo "Pruning BuildKit cache (target max-storage ${KEEP_GB} GiB, no filters)..."
          # Prefer newer max-storage flag; fall back to keep-storage if not supported.
          if ! docker buildx prune --all --force --max-storage ${KEEP_GB}g; then
            docker buildx prune --all --force --keep-storage ${KEEP_GB}g || true
          fi
          if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then
            LINE=$(tail -n1 /tmp/buildkit_df)
            TOTAL=$(echo "$LINE" | awk '{print $2}')
            USED=$(echo "$LINE" | awk '{print $3}')
            FREE=$(echo "$LINE" | awk '{print $4}')
            if [ -n "$TOTAL" ] && [ -n "$FREE" ]; then
              PCT=$(( 100 * USED / TOTAL ))
              echo "BuildKit disk: used ${USED} / ${TOTAL} bytes (${PCT}%); free ${FREE} bytes"
              # Fail fast if under 75 GiB free so builds don't die mid-run.
              MIN=$((75 * 1024 * 1024 * 1024))
              if [ "$FREE" -lt "$MIN" ]; then
                echo "::error::Not enough free space on /var/lib/buildkit (${FREE} bytes free, need >= ${MIN})"
                exit 1
              fi
            else
              echo "Warning: unable to parse df output for /var/lib/buildkit"
            fi
          else
            echo "Warning: /var/lib/buildkit not found; skipping disk check"
          fi
      - name: Build and push SWE-Bench images
        run: |
          set -euo pipefail
          # Assemble the command incrementally; eval re-splits words at the end.
          CMD="uv run benchmarks/swebench/build_images.py \
            --dataset '${DATASET}' \
            --split '${SPLIT}' \
            --image ghcr.io/openhands/eval-agent-server \
            --push \
            --max-workers '${MAX_WORKERS}' \
            --max-retries '${MAX_RETRIES}'"
          # Only include --n-limit if provided (non-empty)
          if [ -n "${N_LIMIT}" ]; then
            CMD="$CMD --n-limit '${N_LIMIT}'"
          fi
          if [ -n "${SELECT_FILE}" ]; then
            CMD="$CMD --select '${SELECT_FILE}'"
          fi
          echo "Running: $CMD"
          eval "$CMD"
        env:
          DOCKER_BUILDKIT: '1'
          BUILDKIT_PROGRESS: plain
          BUILDKIT_RESET_ON_FAILURE: '1'
      - name: Archive build logs
        if: always()
        run: |
          if [ -d builds ]; then
            # Create tar archive to avoid filename restrictions (colons, etc.)
            tar -czf build-logs.tar.gz builds/
            echo "Build logs archived successfully"
          else
            echo "No builds directory found"
          fi
      - name: Upload build logs
        if: always()
        uses: actions/upload-artifact@v6
        with:
          name: build-logs-${{ github.run_id }}
          path: build-logs.tar.gz
          retention-days: 7
          if-no-files-found: warn
      - name: Display build summary
        if: always()
        run: |
          # Find all manifest.jsonl files
          MANIFEST_FILES=$(find builds -name "manifest.jsonl" -type f 2>/dev/null)
          if [ -z "$MANIFEST_FILES" ]; then
            echo "No manifest.jsonl files found"
            exit 0
          fi
          # Generate summary from manifest files
          echo "## Build Summary" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          # Count successes and failures
          TOTAL=$(cat $MANIFEST_FILES 2>/dev/null | wc -l)
          # A build counts as successful when it has no error and at least one tag.
          SUCCESSES=$(cat $MANIFEST_FILES 2>/dev/null | python3 -c "
          import sys
          import json
          count = 0
          for line in sys.stdin:
              data = json.loads(line.strip())
              if data.get('error') is None and len(data.get('tags', [])) > 0:
                  count += 1
          print(count)
          ")
          FAILURES=$((TOTAL - SUCCESSES))
          echo "**Total Images:** $TOTAL" >> "$GITHUB_STEP_SUMMARY"
          echo "**Successful Builds:** ✅ $SUCCESSES" >> "$GITHUB_STEP_SUMMARY"
          echo "**Failed Builds:** ❌ $FAILURES" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          # Show failed builds if any
          if [ "$FAILURES" -gt 0 ]; then
            echo "### Failed Builds" >> "$GITHUB_STEP_SUMMARY"
            echo "" >> "$GITHUB_STEP_SUMMARY"
            # Quoted heredoc: shell does no backslash processing, so backticks
            # need no escaping here (unescaped to avoid literal \ in the summary).
            cat $MANIFEST_FILES | python3 - <<'PY' >> "$GITHUB_STEP_SUMMARY"
          import json
          import sys
          for line in sys.stdin:
              data = json.loads(line.strip())
              if data.get("error") or not data.get("tags"):
                  base = data.get("base_image", "unknown")
                  err = data.get("error") or "No tags generated"
                  print(f"- `{base}`: {err}")
          PY
          fi
          if [ "$FAILURES" -gt 0 ]; then
            echo "::error::Detected $FAILURES failed or missing agent-server images out of $TOTAL"
            exit 1
          fi
      - name: Comment on tracker issue
        if: success()
        run: |
          # Get SDK version from submodule
          SDK_SHA=$(git submodule status vendor/software-agent-sdk | awk '{print $1}' | sed 's/^[+-]//')
          # Find all manifest.jsonl files
          MANIFEST_FILES=$(find builds -name "manifest.jsonl" -type f 2>/dev/null)
          if [ -z "$MANIFEST_FILES" ]; then
            echo "No manifest.jsonl files found in builds directory"
            echo "Build may have completed but produced no images"
            exit 0
          fi
          # Count total images built
          TOTAL_IMAGES=$(cat $MANIFEST_FILES 2>/dev/null | wc -l)
          if [ "$TOTAL_IMAGES" -eq 0 ]; then
            echo "No images found in manifest files"
            echo "Skipping comment as there are no built images to report"
            exit 0
          fi
          # Extract all tags and format them as a markdown list (one tag per image)
          TAGS=$(cat $MANIFEST_FILES | python3 -c "
          import sys
          import json
          for line in sys.stdin:
              data = json.loads(line.strip())
              if data.get('tags') and len(data['tags']) > 0:
                  # Only show the first tag per image to reduce clutter
                  print(f'- \`{data[\"tags\"][0]}\`')
          ")
          # Determine how the workflow was triggered.
          # Label triggers arrive as pull_request_target (this workflow never
          # receives plain pull_request events).
          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            TRIGGER="Manual trigger (workflow_dispatch)"
          elif [ "${{ github.event_name }}" = "pull_request_target" ]; then
            TRIGGER="Pull request [#${{ github.event.pull_request.number }}](${{ github.event.pull_request.html_url }})"
          else
            TRIGGER="${{ github.event_name }}"
          fi
          # Create the comment body
          COMMENT_BODY=$(cat <<EOF
          ## Build Complete ✅
          **Dataset:** \`${DATASET}\`
          **Split:** \`${SPLIT}\`
          **SDK Version:** [\`${SDK_SHA:0:7}\`](https://github.com/OpenHands/software-agent-sdk/commit/${SDK_SHA})
          **Workflow Run:** [#${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
          **Triggered by:** ${TRIGGER}
          <details>
          <summary>Built Tags (${TOTAL_IMAGES} images)</summary>
          ${TAGS}
          </details>
          EOF
          )
          # Post comment to issue #81
          curl -L -X POST \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "${{ github.api_url }}/repos/${{ github.repository }}/issues/81/comments" \
            -d "$(jq -n --arg body "$COMMENT_BODY" '{body: $body}')"
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}