[remote-eval][buffbench] feat: add comprehensive logging to remote ev… #7

Workflow file for this run

.github/workflows/remote-evals.yml at 94be8b0

	name: Remote Evaluations

	on:
	push:
	branches: ['**']
	workflow_dispatch:
	inputs:
	eval_file:
	description: 'Eval file to run (e.g., eval-codebuff.json)'
	required: false
	default: 'eval-codebuff.json'
	type: string
	commit_index:
	description: 'Commit index to evaluate (0-based)'
	required: false
	default: '0'
	type: string
	mode:
	description: 'Auth mode (seed or bypass)'
	required: false
	default: 'bypass'
	type: choice
	options:
	- 'bypass'
	- 'seed'

	jobs:
	remote-evals:
	runs-on: ubuntu-latest
	timeout-minutes: 60

	steps:
	- name: Checkout repository
	uses: actions/checkout@v4

	- name: Check commit message
	id: check_commit
	env:
	COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
	run: \|
	shopt -s nocasematch
	if [[ "$COMMIT_MESSAGE" == "[remote-eval]" ]] \|\| [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
	echo "should_run_evals=true" >> $GITHUB_OUTPUT
	echo "Will run remote evaluations"
	else
	echo "should_run_evals=false" >> $GITHUB_OUTPUT
	echo "Skipping remote evaluations (add [remote-eval] to commit message to trigger)"
	fi

	- name: Set up Bun
	if: steps.check_commit.outputs.should_run_evals == 'true'
	uses: oven-sh/setup-bun@v2
	with:
	bun-version: '1.2.12'

	- name: Install dependencies
	if: steps.check_commit.outputs.should_run_evals == 'true'
	run: bun install --frozen-lockfile

	- name: Run remote evaluation
	if: steps.check_commit.outputs.should_run_evals == 'true'
	env:
	EVAL_FILE: ${{ inputs.eval_file \|\| 'eval-codebuff.json' }}
	COMMIT_INDEX: ${{ inputs.commit_index \|\| '0' }}
	MODE: ${{ inputs.mode \|\| 'bypass' }}
	run: \|
	echo "🚀 Remote Evaluation Starting"
	echo "📋 GitHub Actions Environment:"
	echo " Runner: ${{ runner.os }}"
	echo " SHA: ${{ github.sha }}"
	echo " Ref: ${{ github.ref }}"
	echo " Event: ${{ github.event_name }}"
	echo " Eval File: $EVAL_FILE"
	echo " Commit Index: $COMMIT_INDEX"
	echo " Mode: $MODE"
	echo "🐳 Docker Info:"
	docker --version
	docker compose version
	echo "💾 Disk Space:"
	df -h
	echo "🔧 Starting evaluation..."
	bash evals/scripts/run-remote-parameterized.sh "$MODE" "$EVAL_FILE" "$COMMIT_INDEX"

	- name: Dump logs on failure
	if: failure() && steps.check_commit.outputs.should_run_evals == 'true'
	run: \|
	echo "❌ Evaluation failed - dumping diagnostic information"
	echo "🐳 Docker containers status:"
	docker ps -a \|\| true
	echo "📋 Backend container logs:"
	docker compose -f evals/docker-compose.evals.yml logs backend --tail=200 \|\| true
	echo "📋 Database container logs:"
	docker compose -f evals/docker-compose.evals.yml logs db --tail=100 \|\| true
	echo "💾 Disk usage:"
	df -h \|\| true
	echo "🧠 Memory usage:"
	free -h \|\| true

	- name: Upload evaluation logs
	if: always() && steps.check_commit.outputs.should_run_evals == 'true'
	uses: actions/upload-artifact@v4
	with:
	name: remote-eval-logs-${{ github.sha }}
	path: \|
	evals/test-repos/
	debug/
	~/.cache/bun/
	retention-days: 7

	- name: Cleanup containers
	if: always() && steps.check_commit.outputs.should_run_evals == 'true'
	run: \|
	echo "🧹 Final cleanup - removing all containers and volumes..."
	docker compose -f evals/docker-compose.evals.yml down -v \|\| true
	docker system prune -f \|\| true
	echo "✅ Cleanup completed"

	# Optional: Matrix job to run multiple evaluations in parallel
	remote-evals-matrix:
	runs-on: ubuntu-latest
	timeout-minutes: 90
	if: contains(github.event.head_commit.message, '[remote-eval-all]') \|\| (github.event_name == 'workflow_dispatch' && inputs.mode == 'matrix')

	strategy:
	fail-fast: false
	matrix:
	eval:
	- { file: 'eval-codebuff.json', index: '0' }
	- { file: 'eval-codebuff.json', index: '1' }
	- { file: 'eval-manifold.json', index: '0' }

	steps:
	- name: Checkout repository
	uses: actions/checkout@v4

	- name: Set up Bun
	uses: oven-sh/setup-bun@v2
	with:
	bun-version: '1.2.12'

	- name: Install dependencies
	run: bun install --frozen-lockfile

	- name: Run evaluation matrix
	env:
	EVAL_FILE: ${{ matrix.eval.file }}
	COMMIT_INDEX: ${{ matrix.eval.index }}
	run: \|
	echo "🚀 Running matrix evaluation..."
	bash evals/scripts/run-remote-parameterized.sh "bypass" "$EVAL_FILE" "$COMMIT_INDEX"

	- name: Upload matrix evaluation results
	if: always()
	uses: actions/upload-artifact@v4
	with:
	name: remote-eval-matrix-${{ matrix.eval.file }}-${{ matrix.eval.index }}-${{ github.sha }}
	path: \|
	evals/test-repos/
	debug/
	retention-days: 7

	- name: Cleanup containers
	if: always()
	run: \|
	docker compose -f evals/docker-compose.evals.yml down -v \|\| true
	docker system prune -f \|\| true

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[remote-eval][buffbench] feat: add comprehensive logging to remote ev… #7

Workflow file

[remote-eval][buffbench] feat: add comprehensive logging to remote ev… #7

Uh oh!

Workflow file for this run