name: Tests & Evaluation

on:
  push:
    branches: [ main ]
  pull_request:
    # Run on all pull requests, regardless of target branch

# Add permissions to allow PR comments
permissions:
  contents: read
  pull-requests: write
  actions: read

jobs:
  test:
    name: Test
    runs-on: ubuntu-latest
    strategy:
      matrix:
        node-version: [18.x, 20.x]
    steps:
      - uses: actions/checkout@v4
      # Setup PNPM first - it must be installed before setup-node can use the pnpm cache
      - name: Setup PNPM
        uses: pnpm/action-setup@v2
      - name: Use Node.js ${{ matrix.node-version }}
        uses: actions/setup-node@v4
        with:
          node-version: ${{ matrix.node-version }}
          cache: 'pnpm'
      - name: Install dependencies
        run: pnpm install
      - name: Typecheck (entire codebase)
        run: pnpm typecheck
      - name: Run tests
        run: pnpm test
      - name: Run tests with coverage
        run: pnpm test:coverage
      - name: Build
        run: pnpm build
  # Job that runs after all test matrix jobs complete
  evaluate:
    name: Run Evaluations
    # This job only runs if every test matrix job succeeds
    needs: test
    runs-on: ubuntu-latest
    if: success()
    steps:
      - uses: actions/checkout@v4
      - name: Setup PNPM
        uses: pnpm/action-setup@v2
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '18'
          cache: 'pnpm'
      - name: Install dependencies
        run: pnpm install
      - name: Build project for evaluation
        run: pnpm run build
      - name: Configure MCP environment
        run: echo "Using environment variable-based configuration"
      # Verify the build file exists before running evals
      - name: Verify build file exists
        run: |
          mkdir -p eval/reports
          if [ ! -f "build/index.mjs" ]; then
            echo "ERROR: build/index.mjs does not exist after build step!"
            echo '<!DOCTYPE html>' > eval/reports/build-failed.html
            echo '<html><head><title>Build Failed</title></head>' >> eval/reports/build-failed.html
            echo '<body><h1>Evaluation Failed</h1>' >> eval/reports/build-failed.html
            echo '<p>The MCP build output file does not exist. Check the build step for errors.</p>' >> eval/reports/build-failed.html
            echo '</body></html>' >> eval/reports/build-failed.html
            exit 1
          else
            echo "Build file found, proceeding with evaluation"
          fi
      - name: Run evaluations
        id: run_evals
        run: |
          echo "Running evaluations..."
          if ! pnpm run eval; then
            echo "::error::Evaluation failed during execution"
            echo "EVAL_OUTCOME=failed" >> $GITHUB_ENV
            # Create a failure report but don't exit yet - we want to collect all artifacts
            mkdir -p eval/reports
            echo '<!DOCTYPE html>' > eval/reports/eval-failed.html
            echo '<html><head><title>Evaluation Failed</title></head>' >> eval/reports/eval-failed.html
            echo '<body><h1>Evaluation Failed</h1>' >> eval/reports/eval-failed.html
            echo '<p>The evaluation process encountered an error. Check the logs for details.</p>' >> eval/reports/eval-failed.html
            echo '<h2>Configuration Information</h2>' >> eval/reports/eval-failed.html
            echo '<pre>' >> eval/reports/eval-failed.html
            if [ -n "$HONEYCOMB_API_KEY" ]; then
              echo "Honeycomb API key is set (length: ${#HONEYCOMB_API_KEY})" >> eval/reports/eval-failed.html
            else
              echo "Honeycomb API key is not set!" >> eval/reports/eval-failed.html
              echo "Make sure HONEYCOMB_API_KEY is set in GitHub secrets and passed to the workflow" >> eval/reports/eval-failed.html
            fi
            echo '</pre>' >> eval/reports/eval-failed.html
            echo '</body></html>' >> eval/reports/eval-failed.html
            # Print environment variables (excluding secrets) for debugging
            echo "Environment variables for debugging:"
            env | grep -v -E "HONEYCOMB_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY" | sort
          else
            echo "EVAL_OUTCOME=success" >> $GITHUB_ENV
          fi
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          # Use Honeycomb API key for environment variable-based config
          HONEYCOMB_API_KEY: ${{ secrets.HONEYCOMB_API_KEY }}
          # Use only limited models for CI to save costs
          EVAL_MODELS: '{"openai":"gpt-4o-mini","anthropic":"claude-3-5-haiku-latest"}'
          EVAL_CONCURRENCY: 2
          EVAL_JUDGE_PROVIDER: "anthropic"
          EVAL_JUDGE_MODEL: "claude-3-5-haiku-latest"
          MCP_SERVER_COMMAND: "node build/index.mjs"
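      # Note: the env block above applies only to the "Run evaluations" step;
      # MCP_SERVER_COMMAND launches the build output verified in "Verify build file exists".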
      - name: Ensure reports directory exists
        run: mkdir -p eval/reports
      - name: Create placeholder if no reports were generated
        run: |
          # Check if any HTML reports exist
          if [ -z "$(find eval/reports -name '*.html' 2>/dev/null)" ]; then
            echo "No reports were generated, creating a placeholder"
            echo '<!DOCTYPE html>' > eval/reports/no-reports.html
            echo '<html><head><title>No Reports</title></head>' >> eval/reports/no-reports.html
            echo '<body><h1>No evaluation reports generated</h1>' >> eval/reports/no-reports.html
            echo '<p>This could be due to missing API keys or configuration.</p>' >> eval/reports/no-reports.html
            echo '</body></html>' >> eval/reports/no-reports.html
          fi
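      # Pick the most recently modified HTML report (ls -t sorts newest first);
      # the placeholder created above guarantees at least one file exists.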
      - name: Find latest report
        id: find-report
        run: |
          LATEST_REPORT=$(ls -t eval/reports/*.html 2>/dev/null | head -1)
          LATEST_REPORT=${LATEST_REPORT:-eval/reports/no-reports.html}
          echo "latest_report=$LATEST_REPORT" >> $GITHUB_OUTPUT
      - name: Post report summary
        run: |
          if [ "$EVAL_OUTCOME" == "failed" ]; then
            echo "## ❌ Evaluation Failed" > $GITHUB_STEP_SUMMARY
            echo "The evaluation process encountered errors. See logs for details." >> $GITHUB_STEP_SUMMARY
            echo "" >> $GITHUB_STEP_SUMMARY
            echo "Error report: $(basename ${{ steps.find-report.outputs.latest_report }})" >> $GITHUB_STEP_SUMMARY
            echo "" >> $GITHUB_STEP_SUMMARY
            echo "The error report is available as a workflow artifact." >> $GITHUB_STEP_SUMMARY
          else
            echo "## ✅ Evaluation Results" > $GITHUB_STEP_SUMMARY
            echo "Ran evaluations with OpenAI and Anthropic models." >> $GITHUB_STEP_SUMMARY
            echo "" >> $GITHUB_STEP_SUMMARY
            echo "### Summary" >> $GITHUB_STEP_SUMMARY
            echo "Latest report: $(basename ${{ steps.find-report.outputs.latest_report }})" >> $GITHUB_STEP_SUMMARY
            echo "" >> $GITHUB_STEP_SUMMARY
            echo "The full report is available as a workflow artifact." >> $GITHUB_STEP_SUMMARY
          fi

          # Add a PR comment if we're on a PR
          if [ "${{ github.event_name }}" == "pull_request" ]; then
            # Start with basic PR comment header
            echo "## Honeycomb MCP Evaluation Results" > pr_comment.txt
            echo "" >> pr_comment.txt
            if [ "$EVAL_OUTCOME" == "failed" ]; then
              echo "❌ Evaluation process failed" >> pr_comment.txt
              echo "" >> pr_comment.txt
              echo "The evaluation process encountered errors. See workflow logs for details." >> pr_comment.txt
            else
              # Find the latest summary file
              LATEST_SUMMARY=$(find eval/results -name "summary-*.json" -type f | sort -r | head -1)
              if [ -n "$LATEST_SUMMARY" ] && [ -f "$LATEST_SUMMARY" ]; then
                echo "Found summary file: $LATEST_SUMMARY"
                # Extract key metrics (default to 0 if successRate is missing or null)
                RATE=$(jq -r '.successRate // 0' "$LATEST_SUMMARY" 2>/dev/null || echo "0")
                # Calculate percentage with bc (more reliable than jq for math)
                SUCCESS_RATE=$(echo "$RATE * 100" | bc -l | awk '{printf "%.1f", $0}')
                PASSED=$(jq -r '.passed' "$LATEST_SUMMARY" 2>/dev/null || echo "N/A")
                TOTAL=$(jq -r '.totalTests' "$LATEST_SUMMARY" 2>/dev/null || echo "N/A")
                # Use bc for reliable floating point comparison
                if (( $(echo "$RATE >= 0.75" | bc -l) )); then
                  echo "✅ Evaluations completed successfully: **${SUCCESS_RATE}%** pass rate (${PASSED}/${TOTAL} tests)" >> pr_comment.txt
                elif (( $(echo "$RATE >= 0.5" | bc -l) )); then
                  echo "⚠️ Evaluations completed with mixed results: **${SUCCESS_RATE}%** pass rate (${PASSED}/${TOTAL} tests)" >> pr_comment.txt
                else
                  echo "❌ Evaluations completed with poor results: **${SUCCESS_RATE}%** pass rate (${PASSED}/${TOTAL} tests)" >> pr_comment.txt
                fi
                echo "" >> pr_comment.txt
                # Basic metrics table
                echo "### Evaluation Summary" >> pr_comment.txt
                echo "" >> pr_comment.txt
                echo "| Metric | Value |" >> pr_comment.txt
                echo "|--------|-------|" >> pr_comment.txt
                echo "| Success Rate | ${SUCCESS_RATE}% |" >> pr_comment.txt
                echo "| Tests Passed | $PASSED / $TOTAL |" >> pr_comment.txt
                # Add latency if available
                AVG_LATENCY=$(jq -r '.averageLatency' "$LATEST_SUMMARY" 2>/dev/null || echo "N/A")
                if [ "$AVG_LATENCY" != "N/A" ] && [ "$AVG_LATENCY" != "null" ]; then
                  AVG_LATENCY_INT=$(echo "$AVG_LATENCY" | awk '{printf "%.0f", $0}')
                  echo "| Avg Latency | ${AVG_LATENCY_INT}ms |" >> pr_comment.txt
                fi
                # Add basic model information
                echo "" >> pr_comment.txt
                echo "### Models Tested" >> pr_comment.txt
                echo "" >> pr_comment.txt
                # Extract providers directly
                echo "| Provider | Model |" >> pr_comment.txt
                echo "|----------|-------|" >> pr_comment.txt
                # OpenAI models
                OPENAI_MODELS=$(jq -r '.results[] | select(.provider == "openai") | .model' "$LATEST_SUMMARY" 2>/dev/null | sort -u)
                if [ -n "$OPENAI_MODELS" ]; then
                  while read -r model; do
                    if [ -n "$model" ]; then
                      echo "| OpenAI | $model |" >> pr_comment.txt
                    fi
                  done <<< "$OPENAI_MODELS"
                fi
                # Anthropic models
                ANTHROPIC_MODELS=$(jq -r '.results[] | select(.provider == "anthropic") | .model' "$LATEST_SUMMARY" 2>/dev/null | sort -u)
                if [ -n "$ANTHROPIC_MODELS" ]; then
                  while read -r model; do
                    if [ -n "$model" ]; then
                      echo "| Anthropic | $model |" >> pr_comment.txt
                    fi
                  done <<< "$ANTHROPIC_MODELS"
                fi
              else
                echo "✅ Evaluations completed successfully" >> pr_comment.txt
                echo "" >> pr_comment.txt
                echo "No detailed metrics available" >> pr_comment.txt
              fi
              # Always add a link to the artifacts
              echo "" >> pr_comment.txt
              echo "📊 [View full report in workflow artifacts](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})" >> pr_comment.txt
            fi
            # Post the comment to the PR
            gh pr comment ${{ github.event.pull_request.number }} --body-file pr_comment.txt
          fi
        env:
          GH_TOKEN: ${{ github.token }}
      # Create report index if it doesn't exist
      - name: Generate report index if needed
        run: |
          if [ ! -f "eval/reports/index.html" ]; then
            echo "Generating index.html for reports using the update-index script"
            pnpm run eval:update-index
          fi
      # Upload evaluation reports as artifacts
      - name: Upload evaluation reports
        uses: actions/upload-artifact@v4
        with:
          name: evaluation-reports
          path: eval/reports/
          retention-days: 30
      # Final step to fail the job if evaluations failed
      - name: Check final evaluation status
        if: env.EVAL_OUTCOME == 'failed'
        run: |
          echo "::error::Evaluation failed - see artifacts for error report"
          exit 1