Run Evaluation #50

Workflow file for this run

.github/workflows/eval-runner.yml at 89406ba

	name: Run Evaluation

	on:
	pull_request:
	types: [labeled]
	schedule:
	- cron: "0 1 * * *" # Run daily at 1 AM UTC
	workflow_dispatch:
	inputs:
	reason:
	description: "Reason for manual trigger"
	required: true
	default: ""

	env:
	N_PROCESSES: 32 # Global configuration for number of parallel processes for evaluation

	jobs:
	run-evaluation:
	if: github.event.label.name == 'eval-this' \|\| github.event_name != 'pull_request'
	runs-on: ubuntu-latest
	permissions:
	contents: "read"
	id-token: "write"
	pull-requests: "write"
	issues: "write"
	strategy:
	matrix:
	python-version: ["3.12"]
	steps:
	- name: Checkout repository
	uses: actions/checkout@v4

	- name: Install poetry via pipx
	run: pipx install poetry

	- name: Set up Python
	uses: actions/setup-python@v5
	with:
	python-version: ${{ matrix.python-version }}
	cache: "poetry"

	- name: Comment on PR if 'eval-this' label is present
	if: github.event_name == 'pull_request' && github.event.label.name == 'eval-this'
	uses: KeisukeYamashita/create-comment@v1
	with:
	unique: false
	comment: \|
	Hi! I started running the evaluation on your PR. You will receive a comment with the results shortly.

	- name: Install Python dependencies using Poetry
	run: poetry install

	- name: Configure config.toml for evaluation
	env:
	DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }}
	run: \|
	echo "[llm.eval]" > config.toml
	echo "model = \"deepseek/deepseek-chat\"" >> config.toml
	echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
	echo "temperature = 0.0" >> config.toml

	- name: Run integration test evaluation
	env:
	ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
	RUNTIME: remote
	SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
	EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images

	run: \|
	poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES

	# get evaluation report
	REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f \| head -n 1)
	echo "REPORT_FILE: $REPORT_FILE"
	echo "INTEGRATION_TEST_REPORT<<EOF" >> $GITHUB_ENV
	cat $REPORT_FILE >> $GITHUB_ENV
	echo >> $GITHUB_ENV
	echo "EOF" >> $GITHUB_ENV

	- name: Run SWE-Bench evaluation
	env:
	ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
	RUNTIME: remote
	SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
	EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images

	run: \|
	poetry run ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
	OUTPUT_FOLDER=$(find evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent -name "deepseek-chat_maxiter_50_N_*-no-hint-run_1" -type d \| head -n 1)
	echo "OUTPUT_FOLDER for SWE-bench evaluation: $OUTPUT_FOLDER"
	poetry run ./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FOLDER/output.jsonl $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test

	poetry run ./evaluation/swe_bench/scripts/eval/summarize_outputs.py $OUTPUT_FOLDER/output.jsonl > summarize_outputs.log 2>&1
	echo "SWEBENCH_REPORT<<EOF" >> $GITHUB_ENV
	cat summarize_outputs.log >> $GITHUB_ENV
	echo "EOF" >> $GITHUB_ENV

	- name: Create tar.gz of evaluation outputs
	run: \|
	TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
	tar -czvf evaluation_outputs_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs

	- name: Upload evaluation results as artifact
	uses: actions/upload-artifact@v4
	id: upload_results_artifact
	with:
	name: evaluation-outputs
	path: evaluation_outputs_*.tar.gz

	- name: Get artifact URL
	run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV

	- name: Authenticate to Google Cloud
	uses: 'google-github-actions/auth@v2'
	with:
	credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }}

	- name: Set timestamp and trigger reason
	run: \|
	echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
	if [[ "${{ github.event_name }}" == "pull_request" ]]; then
	echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
	elif [[ "${{ github.event_name }}" == "schedule" ]]; then
	echo "TRIGGER_REASON=schedule" >> $GITHUB_ENV
	else
	echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
	fi

	- name: Upload evaluation results to Google Cloud Storage
	uses: 'google-github-actions/upload-cloud-storage@v2'
	with:
	path: 'evaluation/evaluation_outputs/outputs'
	destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}'

	- name: Comment with evaluation results and artifact link
	id: create_comment
	uses: KeisukeYamashita/create-comment@v1
	with:
	number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number \|\| 4504 }}
	unique: false
	comment: \|
	Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) \|\| github.event_name == 'schedule' && 'Daily Schedule' \|\| format('Manual Trigger: {0}', github.event.inputs.reason) }}
	Commit: ${{ github.sha }}
	SWE-Bench Evaluation Report
	${{ env.SWEBENCH_REPORT }}
	---
	Integration Tests Evaluation Report
	${{ env.INTEGRATION_TEST_REPORT }}
	---
	You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).

	- name: Post to a Slack channel
	id: slack
	uses: slackapi/slack-github-action@v1.27.0
	with:
	channel-id: 'C07SVQSCR6F'
	slack-message: "Evaluation Trigger: ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) \|\| github.event_name == 'schedule' && 'Daily Schedule' \|\| format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: [here](https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number \|\| 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }})"
	env:
	SLACK_BOT_TOKEN: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Run Evaluation #50

Workflow file

Run Evaluation #50

Jobs

Run details

Workflow file for this run