add tool correctness metric #82

Workflow file for this run

.github/workflows/run-evals.yml at 0ab829c

	# CI workflow: installs the Claude Code and Upsun CLIs plus the repository's
	# Upsun skills, then runs the deepeval suite in ./evals and uploads the reports.
	name: Run Evaluations

	on:
	  pull_request:
	    types: [opened, synchronize, reopened]
	  # Also allow manual runs from the Actions tab.
	  workflow_dispatch:

	# Least privilege: the steps below only need to read the repository contents.
	permissions:
	  contents: read

	jobs:
	  run-evals:
	    runs-on: ubuntu-latest
	    # Only run on PRs from the same repository, not forks: the steps below
	    # depend on repository secrets (GCP credentials and project id).
	    # Non-PR triggers (workflow_dispatch) always pass this check.
	    if: >-
	      github.event_name != 'pull_request' ||
	      github.event.pull_request.head.repo.full_name == github.repository

	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      - name: Set up Python
	        uses: actions/setup-python@v5
	        with:
	          # Quoted so the version is read as a string, not a float.
	          python-version: '3.11'

	      - name: Install uv
	        uses: astral-sh/setup-uv@v5

	      # The `id` lets the evaluation step read the generated credentials
	      # file path from this step's outputs.
	      - name: Authenticate to Google Cloud
	        id: auth
	        uses: google-github-actions/auth@v2
	        with:
	          credentials_json: ${{ secrets.GCP_CREDENTIALS }}

	      - name: Set up Cloud SDK
	        uses: google-github-actions/setup-gcloud@v2

	      - name: Install Claude Code
	        run: |
	          curl -fsSL https://claude.ai/install.sh | bash
	          # Add the install directory to PATH for subsequent steps.
	          echo "$HOME/.local/bin" >> "$GITHUB_PATH"

	      - name: Install Upsun CLI
	        run: |
	          curl -fsSL https://raw.githubusercontent.com/platformsh/cli/main/installer.sh | VENDOR=upsun bash
	          # Add the install directory to PATH for subsequent steps.
	          echo "$HOME/.platformsh/bin" >> "$GITHUB_PATH"

	      - name: Install using-upsun skill
	        run: |
	          mkdir -p "$HOME/.claude/skills/"
	          # The trailing "/." copies the directory's contents, not the directory itself.
	          cp -r plugins/upsun/skills/. "$HOME/.claude/skills/"

	      - name: Install dependencies
	        working-directory: ./evals
	        run: |
	          uv sync

	      - name: Run evaluations
	        working-directory: ./evals
	        env:
	          GOOGLE_APPLICATION_CREDENTIALS: ${{ steps.auth.outputs.credentials_file_path }}
	          ANTHROPIC_VERTEX_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
	          ANTHROPIC_VERTEX_REGION: "global"
	          ANTHROPIC_DEFAULT_HAIKU_MODEL: "claude-haiku-4-5"
	          ANTHROPIC_DEFAULT_SONNET_MODEL: "claude-sonnet-4-6"
	          ANTHROPIC_DEFAULT_OPUS_MODEL: "claude-opus-4-6"
	          CLAUDE_CODE_SUBAGENT_MODEL: "claude-sonnet-4-6"
	          # Quoted: environment values must be strings.
	          CLAUDE_CODE_USE_VERTEX: "1"
	        run: |
	          source .venv/bin/activate
	          # The project id is read from the environment instead of being
	          # interpolated into the script text with ${{ }}; the variable holds
	          # the same secret (GCP_PROJECT_ID).
	          deepeval set-gemini \
	            --model=gemini-3.1-pro-preview \
	            --project="$ANTHROPIC_VERTEX_PROJECT_ID" \
	            --location=global
	          deepeval test run . \
	            --junitxml=results.xml \
	            --html=report.html \
	            --self-contained-html

	      - name: Upload test results
	        uses: actions/upload-artifact@v4
	        # Upload the reports even when the evaluation step failed.
	        if: always()
	        with:
	          name: deepeval-results
	          path: |
	            evals/results.xml
	            evals/report.html

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

add tool correctness metric #82

Workflow file

add tool correctness metric #82

Uh oh!

Workflow file for this run