Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 86 additions & 19 deletions .github/workflows/eval-pr.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: 'Evals: PR Evaluation & Regression'

on:
pull_request:
pull_request_target:
types: ['opened', 'synchronize', 'reopened', 'ready_for_review']
paths:
- 'packages/core/src/prompts/**'
Expand All @@ -23,13 +23,73 @@ permissions:
actions: 'read'

jobs:
detect-changes:
name: 'Detect Steering Changes'
runs-on: 'gemini-cli-ubuntu-16-core'
# Security: pull_request_target allows secrets, so we must gate carefully.
# Detection should not run code from the fork.
if: "github.repository == 'google-gemini/gemini-cli' && github.event.pull_request.draft == false"
outputs:
SHOULD_RUN: '${{ steps.detect.outputs.SHOULD_RUN }}'
STEERING_DETECTED: '${{ steps.detect.outputs.STEERING_DETECTED }}'
steps:
- name: 'Checkout'
uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5
with:
# Check out the trusted code from main for detection
fetch-depth: 0

- name: 'Detect Steering Changes'
id: 'detect'
env:
# Use the PR's head SHA for comparison without checking it out
PR_HEAD_SHA: '${{ github.event.pull_request.head.sha }}'
run: |
# Fetch the fork's PR branch for analysis
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-head

# Run the trusted script from main
SHOULD_RUN=$(node scripts/changed_prompt.js)
STEERING_DETECTED=$(node scripts/changed_prompt.js --steering-only)
echo "SHOULD_RUN=$SHOULD_RUN" >> "$GITHUB_OUTPUT"
echo "STEERING_DETECTED=$STEERING_DETECTED" >> "$GITHUB_OUTPUT"

- name: 'Notify Approval Required'
if: "steps.detect.outputs.SHOULD_RUN == 'true'"
env:
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
run: |
RUN_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
COMMENT_BODY="### 🛑 Action Required: Evaluation Approval

Steering changes have been detected in this PR. To prevent regressions, a maintainer must approve the evaluation run before this PR can be merged.

**Maintainers:**
1. Go to the [**Workflow Run Summary**]($RUN_URL).
2. Click the yellow **'Review deployments'** button.
3. Select the **'eval-gate'** environment and click **'Approve'**.

Once approved, the evaluation results will be posted here automatically.

<!-- eval-approval-notification -->"

# Check if comment already exists to avoid spamming
COMMENT_ID=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '.comments[] | select(.body | contains("<!-- eval-approval-notification -->")) | .url' | grep -oE "[0-9]+$" | head -n 1)

if [ -z "$COMMENT_ID" ]; then
gh pr comment ${{ github.event.pull_request.number }} --body "$COMMENT_BODY"
else
echo "Updating existing notification comment $COMMENT_ID..."
gh api -X PATCH "repos/${{ github.repository }}/issues/comments/$COMMENT_ID" -F body="$COMMENT_BODY"
fi

pr-evaluation:
name: 'Evaluate Steering & Regressions'
needs: 'detect-changes'
if: "needs.detect-changes.outputs.SHOULD_RUN == 'true'"
# Manual approval gate via environment
environment: 'eval-gate'
runs-on: 'gemini-cli-ubuntu-16-core'
if: "github.repository == 'google-gemini/gemini-cli' && (github.event_name != 'pull_request' || (github.event.pull_request.draft == false && github.event.pull_request.head.repo.full_name == github.repository))"
# External contributors' PRs will wait for approval in this environment
environment: |-
${{ (github.event.pull_request.head.repo.full_name == github.repository) && 'internal' || 'external-evals' }}
env:
# CENTRALIZED MODEL LIST
MODEL_LIST: 'gemini-3-flash-preview'
Expand All @@ -38,32 +98,40 @@ jobs:
- name: 'Checkout'
uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5
with:
# Check out the fork's PR code for the actual evaluation
# This only runs AFTER manual approval
ref: '${{ github.event.pull_request.head.sha }}'
fetch-depth: 0

- name: 'Remove Approval Notification'
# Run even if other steps fail, to ensure we clean up the "Action Required" message
if: 'always()'
env:
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
PR_NUMBER: '${{ github.event.pull_request.number }}'
run: |
echo "Debug: PR_NUMBER is '$PR_NUMBER'"
# Search for the notification comment by its hidden tag
COMMENT_ID=$(gh pr view "$PR_NUMBER" --json comments --jq '.comments[] | select(.body | contains("<!-- eval-approval-notification -->")) | .url' | grep -oE "[0-9]+$" | head -n 1)
if [ -n "$COMMENT_ID" ]; then
echo "Removing notification comment $COMMENT_ID now that run is approved..."
gh api -X DELETE "repos/${{ github.repository }}/issues/comments/$COMMENT_ID"
fi

- name: 'Set up Node.js'
uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4.4.0
with:
node-version-file: '.nvmrc'
cache: 'npm'

- name: 'Detect Steering Changes'
id: 'detect'
run: |
SHOULD_RUN=$(node scripts/changed_prompt.js)
STEERING_DETECTED=$(node scripts/changed_prompt.js --steering-only)
echo "SHOULD_RUN=$SHOULD_RUN" >> "$GITHUB_OUTPUT"
echo "STEERING_DETECTED=$STEERING_DETECTED" >> "$GITHUB_OUTPUT"

- name: 'Install dependencies'
if: "steps.detect.outputs.SHOULD_RUN == 'true'"
run: 'npm ci'

- name: 'Build project'
if: "steps.detect.outputs.SHOULD_RUN == 'true'"
run: 'npm run build'

- name: 'Analyze PR Content (Guidance)'
if: "steps.detect.outputs.STEERING_DETECTED == 'true'"
if: "needs.detect-changes.outputs.STEERING_DETECTED == 'true'"
id: 'analysis'
env:
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
Expand All @@ -81,7 +149,6 @@ jobs:
fi

- name: 'Execute Regression Check'
if: "steps.detect.outputs.SHOULD_RUN == 'true'"
env:
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
Expand All @@ -96,7 +163,7 @@ jobs:
fi

- name: 'Post or Update PR Comment'
if: "always() && (steps.detect.outputs.STEERING_DETECTED == 'true' || env.REPORT_FILE != '')"
if: "always() && (needs.detect-changes.outputs.STEERING_DETECTED == 'true' || env.REPORT_FILE != '')"
env:
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
run: |
Expand All @@ -107,7 +174,7 @@ jobs:
echo ""
fi

if [[ "${{ steps.detect.outputs.STEERING_DETECTED }}" == "true" ]]; then
if [[ "${{ needs.detect-changes.outputs.STEERING_DETECTED }}" == "true" ]]; then
echo "### 🧠 Model Steering Guidance"
echo ""
echo "This PR modifies files that affect the model's behavior (prompts, tools, or instructions)."
Expand Down
5 changes: 3 additions & 2 deletions scripts/changed_prompt.js
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ function main() {
});

// Get changed files using the triple-dot syntax which correctly handles merge commits
const changedFiles = execSync(`git diff --name-only FETCH_HEAD...HEAD`, {
const head = process.env.PR_HEAD_SHA || 'HEAD';
const changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, {
encoding: 'utf-8',
})
.split('\n')
Expand Down Expand Up @@ -70,7 +71,7 @@ function main() {
if (coreChanges.length > 0) {
// Get the actual diff content for core files
const diff = execSync(
`git diff -U0 FETCH_HEAD...HEAD -- packages/core/src/`,
`git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`,
{ encoding: 'utf-8' },
);
for (const sig of STEERING_SIGNATURES) {
Expand Down
Loading