[remote-eval][buffbench] feat: add comprehensive logging to remote ev… #7
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow that runs the remote evaluation script (evals/scripts/) on a
# GitHub-hosted runner, either from a push or from a manual dispatch.
name: Remote Evaluations

on:
  # Any branch push starts the workflow; the branch filter keeps tag pushes out.
  push:
    branches:
      - '**'

  # Manual run: the three inputs below are forwarded to the evaluation script.
  workflow_dispatch:
    inputs:
      eval_file:
        description: 'Eval file to run (e.g., eval-codebuff.json)'
        required: false
        default: 'eval-codebuff.json'
        type: string
      commit_index:
        # Kept as a string (not a number) so it is passed through verbatim.
        description: 'Commit index to evaluate (0-based)'
        required: false
        default: '0'
        type: string
      mode:
        description: 'Auth mode (seed or bypass)'
        required: false
        default: 'bypass'
        type: choice
        options:
          - 'bypass'
          - 'seed'
jobs:
  # Single remote evaluation run.
  #
  # The job starts on every trigger, but all steps after the commit-message
  # check are gated on `should_run_evals`, so a push without `[remote-eval]`
  # in its head commit message finishes almost immediately.
  #
  # Shell scripts below read GitHub context through the runner's default
  # environment variables (GITHUB_REF, GITHUB_SHA, GITHUB_EVENT_NAME,
  # RUNNER_OS) or through `env:` mappings instead of inlining `${{ ... }}`
  # expressions: inlined expressions are pasted into the script text before
  # the shell parses it, so a value such as a branch name could otherwise
  # inject shell syntax.
  remote-evals:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Sets the `should_run_evals` output to 'true' when the head commit
      # message contains "[remote-eval]" (case-insensitive, via nocasematch)
      # or when the workflow was started manually; 'false' otherwise.
      - name: Check commit message
        id: check_commit
        env:
          # Passed through the environment so the message text is never
          # interpreted as shell code.
          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
        run: |
          shopt -s nocasematch
          if [[ "$COMMIT_MESSAGE" == *"[remote-eval]"* ]] || [[ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]]; then
            echo "should_run_evals=true" >> "$GITHUB_OUTPUT"
            echo "Will run remote evaluations"
          else
            echo "should_run_evals=false" >> "$GITHUB_OUTPUT"
            echo "Skipping remote evaluations (add [remote-eval] to commit message to trigger)"
          fi

      - name: Set up Bun
        if: steps.check_commit.outputs.should_run_evals == 'true'
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: '1.2.12'

      - name: Install dependencies
        if: steps.check_commit.outputs.should_run_evals == 'true'
        run: bun install --frozen-lockfile

      # Prints environment diagnostics, then hands off to the parameterized
      # evaluation script with (mode, eval file, commit index).
      - name: Run remote evaluation
        if: steps.check_commit.outputs.should_run_evals == 'true'
        env:
          # On push events `inputs.*` is empty, so the `||` fallbacks supply
          # the same defaults the workflow_dispatch inputs declare.
          EVAL_FILE: ${{ inputs.eval_file || 'eval-codebuff.json' }}
          COMMIT_INDEX: ${{ inputs.commit_index || '0' }}
          MODE: ${{ inputs.mode || 'bypass' }}
        run: |
          echo "🚀 Remote Evaluation Starting"
          echo "📋 GitHub Actions Environment:"
          echo " Runner: $RUNNER_OS"
          echo " SHA: $GITHUB_SHA"
          echo " Ref: $GITHUB_REF"
          echo " Event: $GITHUB_EVENT_NAME"
          echo " Eval File: $EVAL_FILE"
          echo " Commit Index: $COMMIT_INDEX"
          echo " Mode: $MODE"
          echo "🐳 Docker Info:"
          docker --version
          docker compose version
          echo "💾 Disk Space:"
          df -h
          echo "🔧 Starting evaluation..."
          bash evals/scripts/run-remote-parameterized.sh "$MODE" "$EVAL_FILE" "$COMMIT_INDEX"

      # Runs only when an earlier step failed. Every command is suffixed with
      # `|| true` so a missing container or tool cannot mask the original
      # failure with a second one.
      - name: Dump logs on failure
        if: failure() && steps.check_commit.outputs.should_run_evals == 'true'
        run: |
          echo "❌ Evaluation failed - dumping diagnostic information"
          echo "🐳 Docker containers status:"
          docker ps -a || true
          echo "📋 Backend container logs:"
          docker compose -f evals/docker-compose.evals.yml logs backend --tail=200 || true
          echo "📋 Database container logs:"
          docker compose -f evals/docker-compose.evals.yml logs db --tail=100 || true
          echo "💾 Disk usage:"
          df -h || true
          echo "🧠 Memory usage:"
          free -h || true

      # `always()` keeps the upload running after a failed evaluation.
      - name: Upload evaluation logs
        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
        uses: actions/upload-artifact@v4
        with:
          name: remote-eval-logs-${{ github.sha }}
          path: |
            evals/test-repos/
            debug/
            ~/.cache/bun/
          retention-days: 7

      # Best-effort teardown of the compose stack (including volumes) and of
      # dangling Docker resources; failures here are ignored.
      - name: Cleanup containers
        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
        run: |
          echo "🧹 Final cleanup - removing all containers and volumes..."
          docker compose -f evals/docker-compose.evals.yml down -v || true
          docker system prune -f || true
          echo "✅ Cleanup completed"
# Optional: Matrix job to run multiple evaluations in parallel
  remote-evals-matrix:
    runs-on: ubuntu-latest
    timeout-minutes: 90
    # Runs when the head commit message contains "[remote-eval-all]", or on a
    # manual dispatch whose `mode` input equals 'matrix'.
    # NOTE(review): the `mode` choice input only offers 'bypass' and 'seed',
    # so the workflow_dispatch half of this condition looks unreachable -
    # confirm whether a 'matrix' option (or a separate input) was intended.
    if: >-
      contains(github.event.head_commit.message, '[remote-eval-all]') ||
      (github.event_name == 'workflow_dispatch' && inputs.mode == 'matrix')
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        # One entry per (eval file, commit index) pair to evaluate.
        eval:
          - file: 'eval-codebuff.json'
            index: '0'
          - file: 'eval-codebuff.json'
            index: '1'
          - file: 'eval-manifold.json'
            index: '0'
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: '1.2.12'

      - name: Install dependencies
        run: bun install --frozen-lockfile

      # The matrix values reach the script through the environment; the auth
      # mode is fixed to "bypass" for every matrix entry.
      - name: Run evaluation matrix
        env:
          EVAL_FILE: ${{ matrix.eval.file }}
          COMMIT_INDEX: ${{ matrix.eval.index }}
        run: |
          echo "🚀 Running matrix evaluation..."
          bash evals/scripts/run-remote-parameterized.sh "bypass" "$EVAL_FILE" "$COMMIT_INDEX"

      # The artifact name embeds file, index and SHA so that parallel matrix
      # entries do not collide on the same name.
      - name: Upload matrix evaluation results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: remote-eval-matrix-${{ matrix.eval.file }}-${{ matrix.eval.index }}-${{ github.sha }}
          path: |
            evals/test-repos/
            debug/
          retention-days: 7

      # Best-effort teardown; `|| true` keeps cleanup errors from failing the job.
      - name: Cleanup containers
        if: always()
        run: |
          docker compose -f evals/docker-compose.evals.yml down -v || true
          docker system prune -f || true