.github/workflows/eval-runner.yml

name: Run Evaluation

on:
  pull_request:
    types: [labeled]
  schedule:
    - cron: "0 1 * * *" # Run daily at 1 AM UTC
  workflow_dispatch:
    inputs:
      reason:
        description: "Reason for manual trigger"
        required: true
        default: ""

env:
  N_PROCESSES: 32 # Global configuration for number of parallel processes for evaluation

jobs:
  run-evaluation:
    if: github.event.label.name == 'eval-this' || github.event_name != 'pull_request'
    runs-on: ubuntu-latest
    permissions:
      contents: "read"
      id-token: "write"
      pull-requests: "write"
      issues: "write"
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install poetry via pipx
        run: pipx install poetry

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"

      - name: Comment on PR if 'eval-this' label is present
        if: github.event_name == 'pull_request' && github.event.label.name == 'eval-this'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the evaluation on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
        run: poetry install

      - name: Configure config.toml for evaluation
        env:
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }}
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"deepseek/deepseek-chat\"" >> config.toml
          echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Run integration test evaluation
        env:
          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
          RUNTIME: remote
          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images

        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES

          # get evaluation report
          REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE"
          echo "INTEGRATION_TEST_REPORT<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Run SWE-Bench evaluation
        env:
          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
          RUNTIME: remote
          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images

        run: |
          poetry run ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
          OUTPUT_FOLDER=$(find evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent -name "deepseek-chat_maxiter_50_N_*-no-hint-run_1" -type d | head -n 1)
          echo "OUTPUT_FOLDER for SWE-bench evaluation: $OUTPUT_FOLDER"
          poetry run ./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FOLDER/output.jsonl $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test

          poetry run ./evaluation/swe_bench/scripts/eval/summarize_outputs.py $OUTPUT_FOLDER/output.jsonl > summarize_outputs.log 2>&1
          echo "SWEBENCH_REPORT<<EOF" >> $GITHUB_ENV
          cat summarize_outputs.log >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Create tar.gz of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          tar -czvf evaluation_outputs_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
        id: upload_results_artifact
        with:
          name: evaluation-outputs
          path: evaluation_outputs_*.tar.gz

      - name: Get artifact URL
        run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV

      - name: Authenticate to Google Cloud
        uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }}

      - name: Set timestamp and trigger reason
        run: |
          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
            echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
          elif [[ "${{ github.event_name }}" == "schedule" ]]; then
            echo "TRIGGER_REASON=schedule" >> $GITHUB_ENV
          else
            echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
          fi

      - name: Upload evaluation results to Google Cloud Storage
        uses: 'google-github-actions/upload-cloud-storage@v2'
        with:
          path: 'evaluation/evaluation_outputs/outputs'
          destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}'

      - name: Comment with evaluation results and artifact link
        id: create_comment
        uses: KeisukeYamashita/create-comment@v1
        with:
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}
          unique: false
          comment: |
              Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}
              Commit: ${{ github.sha }}
              **SWE-Bench Evaluation Report**
              ${{ env.SWEBENCH_REPORT }}
              ---
              **Integration Tests Evaluation Report**
              ${{ env.INTEGRATION_TEST_REPORT }}
              ---
              You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).

      - name: Post to a Slack channel
        id: slack
        uses: slackapi/slack-github-action@v1.27.0
        with:
          channel-id: 'C07SVQSCR6F'
          slack-message: "*Evaluation Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: [here](https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }})"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }}