Build SWE-Bench Images #208
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Builds SWE-Bench agent-server images and pushes them to GHCR. | |
| # Triggered either by adding a build-swebench* label to a PR (pull_request_target) | |
| # or manually via workflow_dispatch with the inputs below. | |
| name: Build SWE-Bench Images | |
| on: | |
| # NOTE(review): pull_request_target runs with repo secrets even for fork PRs; | |
| # the label acts as the maintainer approval gate — confirm only trusted users can apply labels. | |
| pull_request_target: | |
| types: [labeled] | |
| workflow_dispatch: | |
| inputs: | |
| dataset: | |
| description: 'Dataset name (e.g., princeton-nlp/SWE-bench_Verified)' | |
| required: true | |
| default: 'princeton-nlp/SWE-bench_Verified' | |
| type: string | |
| split: | |
| description: 'Dataset split (e.g., test, dev)' | |
| required: true | |
| default: 'test' | |
| type: string | |
| max-workers: | |
| description: 'Number of concurrent builds' | |
| required: false | |
| default: '12' | |
| type: string | |
| max-retries: | |
| description: 'Retries per image build' | |
| required: false | |
| default: '5' | |
| type: string | |
| n-limit: | |
| description: 'Limit number of images to build (for testing). Leave blank for no limit.' | |
| required: false | |
| default: '' | |
| type: string | |
| instance-ids: | |
| description: 'Comma-separated instance IDs to build (optional, overrides n-limit)' | |
| required: false | |
| default: '' | |
| type: string | |
| sdk-commit: | |
| description: 'Software Agent SDK commit/ref to use. Leave blank to use submodule default.' | |
| required: false | |
| default: '' | |
| type: string | |
| benchmarks-commit: | |
| description: 'Benchmarks repository commit/ref to use. Leave blank to use the PR head or main branch. Useful for evaluating older SDK versions that are incompatible with current benchmarks code (e.g., SDK versions before the critic module was added in commit 79868ae5).' | |
| required: false | |
| default: '' | |
| type: string | |
| # Defaults for automatic runs; keep INSTANCE_IDS/SELECT_FILE initialized so set -euo pipefail won't fail on unset vars. | |
| env: | |
| DATASET: princeton-nlp/SWE-bench_Verified | |
| SPLIT: test | |
| MAX_WORKERS: '12' | |
| MAX_RETRIES: '5' | |
| N_LIMIT: '500' | |
| INSTANCE_IDS: '' | |
| SELECT_FILE: '' | |
| # NOTE(review): BUILD_BATCH_SIZE and the BUILDKIT_PRUNE_* values are not read by any step | |
| # in this file — presumably consumed by build_images.py or intended for the preflight step; verify. | |
| BUILD_BATCH_SIZE: '15' | |
| BUILDKIT_PRUNE_KEEP_GB: '60' | |
| BUILDKIT_PRUNE_THRESHOLD_PCT: '60' | |
| # Serialize runs per ref; do not cancel an in-flight build when a new one is queued. | |
| concurrency: | |
| group: build-swe-bench-${{ github.ref }} | |
| cancel-in-progress: false | |
| jobs: | |
| build-and-push: | |
| # Run for any manual dispatch, or when one of the three build-swebench* labels is applied to a PR. | |
| if: > | |
| github.event_name == 'workflow_dispatch' || | |
| (github.event_name == 'pull_request_target' && | |
| (github.event.label.name == 'build-swebench' || | |
| github.event.label.name == 'build-swebench-50' || | |
| github.event.label.name == 'build-swebench-200')) | |
| runs-on: | |
| labels: blacksmith-32vcpu-ubuntu-2204 | |
| # Allow pushing to GHCR and commenting on issues | |
| permissions: | |
| contents: read | |
| packages: write | |
| issues: write | |
| steps: | |
| # Ref precedence: explicit benchmarks-commit input > PR head SHA > default (triggering commit). | |
| - name: Determine checkout ref | |
| id: checkout-ref | |
| run: | | |
| if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ -n "${{ inputs.benchmarks-commit }}" ]; then | |
| echo "ref=${{ inputs.benchmarks-commit }}" >> "$GITHUB_OUTPUT" | |
| echo "Using benchmarks-commit from workflow_dispatch: ${{ inputs.benchmarks-commit }}" | |
| elif [ -n "${{ github.event.pull_request.head.sha }}" ]; then | |
| echo "ref=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_OUTPUT" | |
| echo "Using PR head SHA: ${{ github.event.pull_request.head.sha }}" | |
| else | |
| # Empty ref means checkout the ref that triggered the workflow (e.g., main branch for workflow_dispatch) | |
| echo "ref=" >> "$GITHUB_OUTPUT" | |
| echo "Using default ref (the commit that triggered this workflow)" | |
| fi | |
| # NOTE(review): under pull_request_target this checks out untrusted PR code while secrets are | |
| # available to later steps (make build, docker login) — confirm the label gate is sufficient. | |
| - uses: actions/checkout@v6 | |
| with: | |
| # When ref is empty, actions/checkout uses the commit that triggered the workflow | |
| ref: ${{ steps.checkout-ref.outputs.ref }} | |
| submodules: recursive | |
| # If this was a manual dispatch, override defaults with provided inputs. | |
| # Required inputs (dataset, split) always have values; optional ones only override when non-empty, | |
| # except n-limit where an empty input explicitly clears the default limit. | |
| - name: Apply workflow_dispatch overrides (if any) | |
| if: ${{ github.event_name == 'workflow_dispatch' }} | |
| run: | | |
| if [ -n "${{ inputs.dataset }}" ]; then echo "DATASET=${{ inputs.dataset }}" >> "$GITHUB_ENV"; fi | |
| if [ -n "${{ inputs.split }}" ]; then echo "SPLIT=${{ inputs.split }}" >> "$GITHUB_ENV"; fi | |
| if [ -n "${{ inputs.max-workers }}" ]; then echo "MAX_WORKERS=${{ inputs.max-workers }}" >> "$GITHUB_ENV"; fi | |
| if [ -n "${{ inputs.max-retries }}" ]; then echo "MAX_RETRIES=${{ inputs.max-retries }}" >> "$GITHUB_ENV"; fi | |
| # Empty string means "no limit" | |
| if [ -n "${{ inputs.n-limit }}" ]; then echo "N_LIMIT=${{ inputs.n-limit }}" >> "$GITHUB_ENV"; else echo "N_LIMIT=" >> "$GITHUB_ENV"; fi | |
| if [ -n "${{ inputs.instance-ids }}" ]; then echo "INSTANCE_IDS=${{ inputs.instance-ids }}" >> "$GITHUB_ENV"; fi | |
| # Set N_LIMIT based on the label that triggered the workflow | |
| # build-swebench-50 -> 50 images, build-swebench-200 -> 200, plain build-swebench -> no limit. | |
| - name: Set N_LIMIT based on label | |
| if: ${{ github.event_name == 'pull_request_target' }} | |
| run: | | |
| LABEL_NAME="${{ github.event.label.name }}" | |
| if [ "$LABEL_NAME" = "build-swebench-50" ]; then | |
| echo "N_LIMIT=50" >> "$GITHUB_ENV" | |
| echo "Building 50 images based on label: build-swebench-50" | |
| elif [ "$LABEL_NAME" = "build-swebench-200" ]; then | |
| echo "N_LIMIT=200" >> "$GITHUB_ENV" | |
| echo "Building 200 images based on label: build-swebench-200" | |
| elif [ "$LABEL_NAME" = "build-swebench" ]; then | |
| echo "N_LIMIT=" >> "$GITHUB_ENV" | |
| echo "Building all images based on label: build-swebench" | |
| fi | |
| # Turn the comma-separated INSTANCE_IDS into a newline-delimited selection file: | |
| # split on commas, trim surrounding whitespace, drop empty entries. | |
| - name: Build selected instances file | |
| run: | | |
| set -euo pipefail | |
| if [ -z "${INSTANCE_IDS}" ]; then | |
| echo "No instance IDs provided; skipping select file creation." | |
| exit 0 | |
| fi | |
| SELECT_FILE="${RUNNER_TEMP}/selected-instances.txt" | |
| echo "Creating selected instances file at ${SELECT_FILE}" | |
| echo "${INSTANCE_IDS}" \ | |
| | tr ',' '\n' \ | |
| | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \ | |
| | sed '/^$/d' > "${SELECT_FILE}" | |
| echo "SELECT_FILE=${SELECT_FILE}" >> "$GITHUB_ENV" | |
| # Skip n-limit when explicit instance IDs are provided to avoid double filtering | |
| echo "N_LIMIT=" >> "$GITHUB_ENV" | |
| echo "Selected instance IDs:" | |
| cat "${SELECT_FILE}" | |
| # Update SDK submodule to specific commit if provided | |
| # Must run BEFORE install dependencies so git submodule update works correctly | |
| - name: Update SDK submodule | |
| if: ${{ github.event_name == 'workflow_dispatch' && inputs.sdk-commit != '' }} | |
| run: | | |
| cd vendor/software-agent-sdk | |
| # FETCH_HEAD points at the requested commit/ref after the fetch; detach onto it. | |
| git fetch origin ${{ inputs.sdk-commit }} | |
| git checkout FETCH_HEAD | |
| SDK_SHA=$(git rev-parse HEAD) | |
| cd ../.. | |
| # Stage the submodule reference update so make build uses it | |
| git add vendor/software-agent-sdk | |
| echo "Updated SDK submodule to $SDK_SHA (from ${{ inputs.sdk-commit }})" | |
| # Toolchain setup: Buildx builder, GHCR credentials for --push, uv for running build_images.py. | |
| - name: Set up Docker Buildx with Blacksmith | |
| uses: useblacksmith/setup-docker-builder@v1 | |
| - name: Log in to GitHub Container Registry | |
| uses: docker/login-action@v3 | |
| with: | |
| registry: ghcr.io | |
| username: ${{ github.actor }} | |
| password: ${{ secrets.GITHUB_TOKEN }} | |
| - name: Install uv | |
| uses: astral-sh/setup-uv@v7 | |
| with: | |
| enable-cache: true | |
| - name: Install dependencies | |
| run: | | |
| make build | |
| # Free BuildKit cache down to a target size, then fail fast if the BuildKit volume | |
| # lacks enough headroom for a full image-build run. | |
| - name: "Preflight: prune cache and verify BuildKit disk" | |
| run: | | |
| set -euo pipefail | |
| # Fix: use the workflow-level env default (BUILDKIT_PRUNE_KEEP_GB) instead of a duplicated hard-coded 60. | |
| KEEP_GB="${BUILDKIT_PRUNE_KEEP_GB:-60}" | |
| echo "Pruning BuildKit cache (target max-storage ${KEEP_GB} GiB, no filters)..." | |
| # Prefer newer max-storage flag; fall back to keep-storage if not supported. | |
| if ! docker buildx prune --all --force --max-storage ${KEEP_GB}g; then | |
| docker buildx prune --all --force --keep-storage ${KEEP_GB}g || true | |
| fi | |
| # Parse the last df line: column 2 = total bytes, 3 = used, 4 = free. | |
| if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then | |
| LINE=$(tail -n1 /tmp/buildkit_df) | |
| TOTAL=$(echo "$LINE" | awk '{print $2}') | |
| USED=$(echo "$LINE" | awk '{print $3}') | |
| FREE=$(echo "$LINE" | awk '{print $4}') | |
| if [ -n "$TOTAL" ] && [ -n "$FREE" ]; then | |
| PCT=$(( 100 * USED / TOTAL )) | |
| echo "BuildKit disk: used ${USED} / ${TOTAL} bytes (${PCT}%); free ${FREE} bytes" | |
| # Require at least 75 GiB free. NOTE(review): BUILDKIT_PRUNE_THRESHOLD_PCT is defined in env | |
| # but never used — confirm whether PCT was meant to gate this check instead of the fixed floor. | |
| MIN=$((75 * 1024 * 1024 * 1024)) | |
| if [ "$FREE" -lt "$MIN" ]; then | |
| echo "::error::Not enough free space on /var/lib/buildkit (${FREE} bytes free, need >= ${MIN})" | |
| exit 1 | |
| fi | |
| else | |
| echo "Warning: unable to parse df output for /var/lib/buildkit" | |
| fi | |
| else | |
| echo "Warning: /var/lib/buildkit not found; skipping disk check" | |
| fi | |
| # Assemble the build command as a string and eval it so optional flags can be appended. | |
| # The single quotes inside CMD keep each env value as one argument at eval time. | |
| - name: Build and push SWE-Bench images | |
| run: | | |
| set -euo pipefail | |
| CMD="uv run benchmarks/swebench/build_images.py \ | |
| --dataset '${DATASET}' \ | |
| --split '${SPLIT}' \ | |
| --image ghcr.io/openhands/eval-agent-server \ | |
| --push \ | |
| --max-workers '${MAX_WORKERS}' \ | |
| --max-retries '${MAX_RETRIES}'" | |
| # Only include --n-limit if provided (non-empty) | |
| if [ -n "${N_LIMIT}" ]; then | |
| CMD="$CMD --n-limit '${N_LIMIT}'" | |
| fi | |
| if [ -n "${SELECT_FILE}" ]; then | |
| CMD="$CMD --select '${SELECT_FILE}'" | |
| fi | |
| echo "Running: $CMD" | |
| eval "$CMD" | |
| env: | |
| DOCKER_BUILDKIT: 1 | |
| BUILDKIT_PROGRESS: plain | |
| BUILDKIT_RESET_ON_FAILURE: 1 | |
| # Always archive/upload build logs, even when the build step failed (if: always()). | |
| - name: Archive build logs | |
| if: always() | |
| run: | | |
| if [ -d builds ]; then | |
| # Create tar archive to avoid filename restrictions (colons, etc.) | |
| tar -czf build-logs.tar.gz builds/ | |
| echo "Build logs archived successfully" | |
| else | |
| echo "No builds directory found" | |
| fi | |
| - name: Upload build logs | |
| if: always() | |
| uses: actions/upload-artifact@v6 | |
| with: | |
| name: build-logs-${{ github.run_id }} | |
| path: build-logs.tar.gz | |
| retention-days: 7 | |
| if-no-files-found: warn | |
| # Summarize manifest.jsonl results into the step summary; fail the job if any image | |
| # recorded an error or produced no tags. | |
| - name: Display build summary | |
| if: always() | |
| run: | | |
| # Find all manifest.jsonl files | |
| MANIFEST_FILES=$(find builds -name "manifest.jsonl" -type f 2>/dev/null) | |
| if [ -z "$MANIFEST_FILES" ]; then | |
| echo "No manifest.jsonl files found" | |
| exit 0 | |
| fi | |
| # Generate summary from manifest files | |
| echo "## Build Summary" >> "$GITHUB_STEP_SUMMARY" | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| # Count successes and failures (MANIFEST_FILES is intentionally unquoted: one word per file) | |
| TOTAL=$(cat $MANIFEST_FILES 2>/dev/null | wc -l) | |
| # Fix: use python3 explicitly — bare "python" is not guaranteed on ubuntu-22.04 runners, | |
| # and the failed-builds section below already uses python3. | |
| SUCCESSES=$(cat $MANIFEST_FILES 2>/dev/null | python3 -c " | |
| import sys | |
| import json | |
| count = 0 | |
| for line in sys.stdin: | |
| data = json.loads(line.strip()) | |
| if data.get('error') is None and len(data.get('tags', [])) > 0: | |
| count += 1 | |
| print(count) | |
| ") | |
| # Guard: if the counter produced no output, default to 0 so the arithmetic below cannot error. | |
| SUCCESSES=${SUCCESSES:-0} | |
| FAILURES=$((TOTAL - SUCCESSES)) | |
| echo "**Total Images:** $TOTAL" >> "$GITHUB_STEP_SUMMARY" | |
| echo "**Successful Builds:** ✅ $SUCCESSES" >> "$GITHUB_STEP_SUMMARY" | |
| echo "**Failed Builds:** ❌ $FAILURES" >> "$GITHUB_STEP_SUMMARY" | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| # Show failed builds if any | |
| if [ "$FAILURES" -gt 0 ]; then | |
| echo "### Failed Builds" >> "$GITHUB_STEP_SUMMARY" | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| cat $MANIFEST_FILES | python3 - <<'PY' >> "$GITHUB_STEP_SUMMARY" | |
| import json | |
| import sys | |
| for line in sys.stdin: | |
| data = json.loads(line.strip()) | |
| if data.get("error") or not data.get("tags"): | |
| base = data.get("base_image", "unknown") | |
| err = data.get("error") or "No tags generated" | |
| print(f"- `{base}`: {err}") | |
| PY | |
| fi | |
| if [ "$FAILURES" -gt 0 ]; then | |
| echo "::error::Detected $FAILURES failed or missing agent-server images out of $TOTAL" | |
| exit 1 | |
| fi | |
| # On success, post the built-image tag list as a comment on tracker issue #81. | |
| - name: Comment on tracker issue | |
| if: success() | |
| run: | | |
| # Get SDK version from submodule (strip the +/- state prefix from git submodule status) | |
| SDK_SHA=$(git submodule status vendor/software-agent-sdk | awk '{print $1}' | sed 's/^[+-]//') | |
| # Find all manifest.jsonl files | |
| MANIFEST_FILES=$(find builds -name "manifest.jsonl" -type f 2>/dev/null) | |
| if [ -z "$MANIFEST_FILES" ]; then | |
| echo "No manifest.jsonl files found in builds directory" | |
| echo "Build may have completed but produced no images" | |
| exit 0 | |
| fi | |
| # Count total images built | |
| TOTAL_IMAGES=$(cat $MANIFEST_FILES 2>/dev/null | wc -l) | |
| if [ "$TOTAL_IMAGES" -eq 0 ]; then | |
| echo "No images found in manifest files" | |
| echo "Skipping comment as there are no built images to report" | |
| exit 0 | |
| fi | |
| # Extract all tags and format them as a markdown list (one tag per image) | |
| # Fix: python3 instead of bare "python", matching the summary step and the runner image. | |
| TAGS=$(cat $MANIFEST_FILES | python3 -c " | |
| import sys | |
| import json | |
| for line in sys.stdin: | |
| data = json.loads(line.strip()) | |
| if data.get('tags') and len(data['tags']) > 0: | |
| # Only show the first tag per image to reduce clutter | |
| print(f'- \`{data[\"tags\"][0]}\`') | |
| ") | |
| # Determine how the workflow was triggered | |
| if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then | |
| TRIGGER="Manual trigger (workflow_dispatch)" | |
| # Fix: this workflow triggers on pull_request_target (never pull_request), so the | |
| # PR branch must match that event name or it was unreachable. | |
| elif [ "${{ github.event_name }}" = "pull_request_target" ]; then | |
| TRIGGER="Pull request [#${{ github.event.pull_request.number }}](${{ github.event.pull_request.html_url }})" | |
| else | |
| TRIGGER="${{ github.event_name }}" | |
| fi | |
| # Create the comment body | |
| COMMENT_BODY=$(cat <<EOF | |
| ## Build Complete ✅ | |
| **Dataset:** \`${DATASET}\` | |
| **Split:** \`${SPLIT}\` | |
| **SDK Version:** [\`${SDK_SHA:0:7}\`](https://github.com/OpenHands/software-agent-sdk/commit/${SDK_SHA}) | |
| **Workflow Run:** [#${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) | |
| **Triggered by:** ${TRIGGER} | |
| <details> | |
| <summary>Built Tags (${TOTAL_IMAGES} images)</summary> | |
| ${TAGS} | |
| </details> | |
| EOF | |
| ) | |
| # Post comment to issue #81 | |
| curl -L -X POST \ | |
| -H "Accept: application/vnd.github+json" \ | |
| -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ | |
| -H "X-GitHub-Api-Version: 2022-11-28" \ | |
| "${{ github.api_url }}/repos/${{ github.repository }}/issues/81/comments" \ | |
| -d "$(jq -n --arg body "$COMMENT_BODY" '{body: $body}')" | |
| env: | |
| GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | |