feat(ci): performance explorations #805
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow header: triggers, concurrency and token permissions for the PR
# evaluation / regression workflow.
name: 'Evals: PR Evaluation & Regression'

on:
  # pull_request_target runs in the context of the base repository, so
  # secrets are available even for PRs coming from forks.
  pull_request_target:
    types: ['opened', 'synchronize', 'reopened', 'ready_for_review']
    # Only trigger for changes under these paths; test files are excluded.
    paths:
      - 'packages/core/src/prompts/**'
      - 'packages/core/src/tools/**'
      - 'packages/core/src/agents/**'
      - 'evals/**'
      - '!**/*.test.ts'
      - '!**/*.test.tsx'
  # NOTE(review): a manual dispatch carries no `github.event.pull_request`
  # payload, which the steps in this file read (PR number, head SHA).
  # Confirm manual runs are actually supported before relying on this trigger.
  workflow_dispatch:

# Prevents multiple runs for the same PR simultaneously (saves tokens).
# The group is keyed on the PR number rather than `github.head_ref`: the head
# ref is only a branch name, so PRs from different forks that share a branch
# name (e.g. `main`, `patch-1`) would otherwise cancel each other's runs.
# Falls back to `github.ref` for events without a pull request.
concurrency:
  group: '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'
  cancel-in-progress: true

# Token scopes requested for the GITHUB_TOKEN used by the jobs.
permissions:
  pull-requests: 'write'
  contents: 'read'
  actions: 'read'
jobs:
  # Job 1: decide, using only trusted code from the base branch, whether the
  # PR needs an evaluation run, and ask maintainers to approve it if so.
  detect-changes:
    name: 'Detect Steering Changes'
    runs-on: 'gemini-cli-ubuntu-16-core'
    # Security: pull_request_target allows secrets, so we must gate carefully.
    # Detection should not run code from the fork.
    if: "github.repository == 'google-gemini/gemini-cli' && github.event.pull_request.draft == false"
    outputs:
      SHOULD_RUN: '${{ steps.detect.outputs.SHOULD_RUN }}'
      STEERING_DETECTED: '${{ steps.detect.outputs.STEERING_DETECTED }}'
    steps:
      - name: 'Checkout'
        uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5
        with:
          # No `ref` is given, so under pull_request_target this checks out
          # the base branch (trusted code), not the fork's code.
          fetch-depth: 0

      - name: 'Detect Steering Changes'
        id: 'detect'
        env:
          # Use the PR's head SHA for comparison without checking it out
          PR_HEAD_SHA: '${{ github.event.pull_request.head.sha }}'
        run: |
          # Fetch the fork's PR branch for analysis (fetched as a ref only; it
          # is never checked out in this job). The PR number is an integer
          # supplied by GitHub, so interpolating it here is safe.
          git fetch origin "pull/${{ github.event.pull_request.number }}/head:pr-head"
          # Run the trusted script from main
          SHOULD_RUN=$(node scripts/changed_prompt.js)
          STEERING_DETECTED=$(node scripts/changed_prompt.js --steering-only)
          echo "SHOULD_RUN=$SHOULD_RUN" >> "$GITHUB_OUTPUT"
          echo "STEERING_DETECTED=$STEERING_DETECTED" >> "$GITHUB_OUTPUT"

      - name: 'Notify Approval Required'
        if: "steps.detect.outputs.SHOULD_RUN == 'true'"
        env:
          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
          PR_NUMBER: '${{ github.event.pull_request.number }}'
        run: |
          RUN_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          COMMENT_BODY="### 🛑 Action Required: Evaluation Approval
          Steering changes have been detected in this PR. To prevent regressions, a maintainer must approve the evaluation run before this PR can be merged.
          **Maintainers:**
          1. Go to the [**Workflow Run Summary**]($RUN_URL).
          2. Click the yellow **'Review deployments'** button.
          3. Select the **'eval-gate'** environment and click **'Approve'**.
          Once approved, the evaluation results will be posted here automatically.
          <!-- eval-approval-notification -->"
          # Check if comment already exists to avoid spamming.
          # The numeric comment ID is the trailing digits of the comment URL.
          COMMENT_ID=$(gh pr view "$PR_NUMBER" --json comments --jq '.comments[] | select(.body | contains("<!-- eval-approval-notification -->")) | .url' | grep -oE "[0-9]+$" | head -n 1)
          if [ -z "$COMMENT_ID" ]; then
            gh pr comment "$PR_NUMBER" --body "$COMMENT_BODY"
          else
            echo "Updating existing notification comment $COMMENT_ID..."
            gh api -X PATCH "repos/${{ github.repository }}/issues/comments/$COMMENT_ID" -F body="$COMMENT_BODY"
          fi

  # Job 2: after manual approval, build the PR's code, run the regression
  # check and publish a single (updated in place) report comment on the PR.
  pr-evaluation:
    name: 'Evaluate Steering & Regressions'
    needs: 'detect-changes'
    if: "needs.detect-changes.outputs.SHOULD_RUN == 'true'"
    # Manual approval gate via environment
    environment: 'eval-gate'
    runs-on: 'gemini-cli-ubuntu-16-core'
    env:
      # CENTRALIZED MODEL LIST
      MODEL_LIST: 'gemini-3-flash-preview'
    steps:
      - name: 'Checkout'
        uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5
        with:
          # Check out the fork's PR code for the actual evaluation
          # This only runs AFTER manual approval
          ref: '${{ github.event.pull_request.head.sha }}'
          fetch-depth: 0
          # The steps below install, build and run PR-controlled code. Do not
          # leave the workflow token in .git/config where that code could read
          # it; steps that need GitHub access receive GH_TOKEN explicitly.
          persist-credentials: false

      - name: 'Remove Approval Notification'
        # Run even if other steps fail, to ensure we clean up the "Action Required" message
        if: 'always()'
        env:
          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
          PR_NUMBER: '${{ github.event.pull_request.number }}'
        run: |
          echo "Debug: PR_NUMBER is '$PR_NUMBER'"
          # Search for the notification comment by its hidden tag
          COMMENT_ID=$(gh pr view "$PR_NUMBER" --json comments --jq '.comments[] | select(.body | contains("<!-- eval-approval-notification -->")) | .url' | grep -oE "[0-9]+$" | head -n 1)
          if [ -n "$COMMENT_ID" ]; then
            echo "Removing notification comment $COMMENT_ID now that run is approved..."
            gh api -X DELETE "repos/${{ github.repository }}/issues/comments/$COMMENT_ID"
          fi

      # NOTE(review): this job runs fork code in a pull_request_target
      # context; confirm that sharing the npm cache with it is acceptable.
      - name: 'Set up Node.js'
        uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4.4.0
        with:
          node-version-file: '.nvmrc'
          cache: 'npm'

      - name: 'Install dependencies'
        run: 'npm ci'

      - name: 'Build project'
        run: 'npm run build'

      - name: 'Analyze PR Content (Guidance)'
        if: "needs.detect-changes.outputs.STEERING_DETECTED == 'true'"
        id: 'analysis'
        env:
          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
          # Event-derived values are passed through the environment instead of
          # being spliced into the script text, so they are always treated as
          # data by the shell.
          BASE_REF: '${{ github.base_ref }}'
          ACTOR: '${{ github.actor }}'
        run: |
          # Check for behavioral eval changes
          EVAL_CHANGES=$(git diff --name-only "origin/${BASE_REF}...HEAD" | grep "^evals/" || true)
          if [ -z "$EVAL_CHANGES" ]; then
            echo "MISSING_EVALS=true" >> "$GITHUB_OUTPUT"
          fi
          # Check if user is a maintainer. This is advisory only, so a failed
          # lookup (e.g. a bot account) must not abort the job before the
          # regression check runs; it simply yields "not a maintainer".
          USER_PERMISSION=$(gh api "repos/${{ github.repository }}/collaborators/${ACTOR}/permission" --jq '.permission' || true)
          if [[ "$USER_PERMISSION" == "admin" || "$USER_PERMISSION" == "write" ]]; then
            echo "IS_MAINTAINER=true" >> "$GITHUB_OUTPUT"
          fi

      - name: 'Execute Regression Check'
        env:
          GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
          MODEL_LIST: '${{ env.MODEL_LIST }}'
        run: |
          # Run the regression check loop. The script saves the report to a file.
          node scripts/run_eval_regression.js
          # Use the generated report file if it exists
          if [[ -f eval_regression_report.md ]]; then
            echo "REPORT_FILE=eval_regression_report.md" >> "$GITHUB_ENV"
          fi

      - name: 'Post or Update PR Comment'
        if: "always() && (needs.detect-changes.outputs.STEERING_DETECTED == 'true' || env.REPORT_FILE != '')"
        env:
          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
          PR_NUMBER: '${{ github.event.pull_request.number }}'
          # Job/step outputs are read from the environment rather than being
          # interpolated into the script text.
          STEERING_DETECTED: '${{ needs.detect-changes.outputs.STEERING_DETECTED }}'
          MISSING_EVALS: '${{ steps.analysis.outputs.MISSING_EVALS }}'
          IS_MAINTAINER: '${{ steps.analysis.outputs.IS_MAINTAINER }}'
        run: |
          # 1. Build the full comment body
          {
            if [[ -f eval_regression_report.md ]]; then
              cat eval_regression_report.md
              echo ""
            fi
            if [[ "$STEERING_DETECTED" == "true" ]]; then
              echo "### 🧠 Model Steering Guidance"
              echo ""
              echo "This PR modifies files that affect the model's behavior (prompts, tools, or instructions)."
              echo ""
              if [[ "$MISSING_EVALS" == "true" ]]; then
                echo "- ⚠️ **Consider adding Evals:** No behavioral evaluations (\`evals/*.eval.ts\`) were added or updated in this PR. Consider [adding a test case](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#creating-an-evaluation) to verify the new behavior and prevent regressions."
              fi
              if [[ "$IS_MAINTAINER" == "true" ]]; then
                echo "- 🚀 **Maintainer Reminder:** Please ensure that these changes do not regress results on benchmark evals before merging."
              fi
            fi
            echo ""
            echo "---"
            echo "*This is an automated guidance message triggered by steering logic signatures.*"
            echo "<!-- eval-pr-report -->"
          } > full_comment.md
          # 2. Find if a comment with our unique tag already exists
          # We extract the numeric ID from the URL to ensure compatibility with the REST API
          COMMENT_ID=$(gh pr view "$PR_NUMBER" --json comments --jq '.comments[] | select(.body | contains("<!-- eval-pr-report -->")) | .url' | grep -oE "[0-9]+$" | head -n 1)
          # 3. Update or Create the comment
          if [ -n "$COMMENT_ID" ]; then
            echo "Updating existing comment $COMMENT_ID via API..."
            gh api -X PATCH "repos/${{ github.repository }}/issues/comments/$COMMENT_ID" -F body=@full_comment.md
          else
            echo "Creating new PR comment..."
            gh pr comment "$PR_NUMBER" --body-file full_comment.md
          fi