Skip to content

add tool correctness metric #82

add tool correctness metric

add tool correctness metric #82

Workflow file for this run

# Runs the DeepEval suite in ./evals against Claude Code (via Vertex AI) with
# the Upsun CLI and the skills from plugins/upsun/skills installed, then
# uploads the JUnit and HTML reports as a build artifact.
name: Run Evaluations

on:
  pull_request:
    types: [opened, synchronize, reopened]
  workflow_dispatch:

# Least privilege: the steps below only need to read the repository contents.
permissions:
  contents: read

jobs:
  run-evals:
    runs-on: ubuntu-latest
    # Only run on PRs from the same repository, not forks.
    if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is always read as a string, never a float.
          python-version: '3.11'

      - name: Install uv
        uses: astral-sh/setup-uv@v5

      # The `auth` id is referenced later to locate the generated credentials
      # file (steps.auth.outputs.credentials_file_path).
      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCP_CREDENTIALS }}

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2

      - name: Install Claude Code
        run: |
          curl -fsSL https://claude.ai/install.sh | bash
          # Make the installed binary available to subsequent steps.
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"

      - name: Install Upsun CLI
        run: |
          curl -fsSL https://raw.githubusercontent.com/platformsh/cli/main/installer.sh | VENDOR=upsun bash
          # Make the installed binary available to subsequent steps.
          echo "$HOME/.platformsh/bin" >> "$GITHUB_PATH"

      # Copies the contents of plugins/upsun/skills into the user-level
      # Claude skills directory.
      - name: Install using-upsun skill
        run: |
          mkdir -p "$HOME/.claude/skills/"
          cp -r plugins/upsun/skills/. "$HOME/.claude/skills/"

      - name: Install dependencies
        working-directory: ./evals
        run: |
          uv sync

      - name: Run evaluations
        working-directory: ./evals
        env:
          GOOGLE_APPLICATION_CREDENTIALS: ${{ steps.auth.outputs.credentials_file_path }}
          ANTHROPIC_VERTEX_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
          ANTHROPIC_VERTEX_REGION: "global"
          ANTHROPIC_DEFAULT_HAIKU_MODEL: "claude-haiku-4-5"
          ANTHROPIC_DEFAULT_SONNET_MODEL: "claude-sonnet-4-6"
          ANTHROPIC_DEFAULT_OPUS_MODEL: "claude-opus-4-6"
          CLAUDE_CODE_SUBAGENT_MODEL: "claude-sonnet-4-6"
          CLAUDE_CODE_USE_VERTEX: "1"
        run: |
          source .venv/bin/activate
          # The project ID is read from the environment (same secret as
          # ANTHROPIC_VERTEX_PROJECT_ID above) instead of being interpolated
          # into the script text by the expression engine.
          deepeval set-gemini \
            --model=gemini-3.1-pro-preview \
            --project="$ANTHROPIC_VERTEX_PROJECT_ID" \
            --location=global
          deepeval test run . \
            --junitxml=results.xml \
            --html=report.html \
            --self-contained-html

      # Runs even when an earlier step failed so the reports stay available.
      - name: Upload test results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: deepeval-results
          path: |
            evals/results.xml
            evals/report.html