name: Tests & Evaluation

on:
  push:
    branches: [ main ]
  pull_request:
    # Run on all pull requests, regardless of target branch

# Add permissions to allow PR comments
permissions:
  contents: read
  pull-requests: write
  actions: read

jobs:
  test:
    name: Test
    runs-on: ubuntu-latest
    strategy:
      matrix:
        node-version: [18.x, 20.x]
    steps:
      - uses: actions/checkout@v4
      # Setup PNPM first - it must be installed before setup-node can use the pnpm cache
      - name: Setup PNPM
        uses: pnpm/action-setup@v2
      - name: Use Node.js ${{ matrix.node-version }}
        uses: actions/setup-node@v4
        with:
          node-version: ${{ matrix.node-version }}
          cache: 'pnpm'
      - name: Install dependencies
        run: pnpm install
      - name: Typecheck (entire codebase)
        run: pnpm typecheck
      - name: Run tests
        run: pnpm test
      - name: Run tests with coverage
        run: pnpm test:coverage
      - name: Build
        run: pnpm build
  # Job that runs after all test matrix jobs complete
  evaluate:
    name: Run Evaluations
    # This job only runs if every test matrix job succeeds
    needs: test
    runs-on: ubuntu-latest
    if: success()
    steps:
      - uses: actions/checkout@v4
      - name: Setup PNPM
        uses: pnpm/action-setup@v2
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '18'
          cache: 'pnpm'
      - name: Install dependencies
        run: pnpm install
      - name: Build project for evaluation
        run: pnpm run build
      - name: Configure MCP environment
        run: echo "Using environment variable-based configuration"
      # Verify the build file exists before running evals
      - name: Verify build file exists
        run: |
          mkdir -p eval/reports
          if [ ! -f "build/index.mjs" ]; then
            echo "ERROR: build/index.mjs does not exist after build step!"
            echo '<!DOCTYPE html>' > eval/reports/build-failed.html
            echo '<html><head><title>Build Failed</title></head>' >> eval/reports/build-failed.html
            echo '<body><h1>Evaluation Failed</h1>' >> eval/reports/build-failed.html
            echo '<p>The MCP build output file does not exist. Check the build step for errors.</p>' >> eval/reports/build-failed.html
            echo '</body></html>' >> eval/reports/build-failed.html
            exit 1
          else
            echo "Build file found, proceeding with evaluation"
          fi
      - name: Run evaluations
        id: run_evals
        run: |
          echo "Running evaluations..."
          if ! pnpm run eval; then
            echo "::error::Evaluation failed during execution"
            echo "EVAL_OUTCOME=failed" >> $GITHUB_ENV
            # Create a failure report but don't exit yet - we want to collect all artifacts
            mkdir -p eval/reports
            echo '<!DOCTYPE html>' > eval/reports/eval-failed.html
            echo '<html><head><title>Evaluation Failed</title></head>' >> eval/reports/eval-failed.html
            echo '<body><h1>Evaluation Failed</h1>' >> eval/reports/eval-failed.html
            echo '<p>The evaluation process encountered an error. Check the logs for details.</p>' >> eval/reports/eval-failed.html
            echo '<h2>Configuration Information</h2>' >> eval/reports/eval-failed.html
            echo '<pre>' >> eval/reports/eval-failed.html
            if [ -n "$HONEYCOMB_API_KEY" ]; then
              echo "Honeycomb API key is set (length: ${#HONEYCOMB_API_KEY})" >> eval/reports/eval-failed.html
            else
              echo "Honeycomb API key is not set!" >> eval/reports/eval-failed.html
              echo "Make sure HONEYCOMB_API_KEY is set in GitHub secrets and passed to the workflow" >> eval/reports/eval-failed.html
            fi
            echo '</pre>' >> eval/reports/eval-failed.html
            echo '</body></html>' >> eval/reports/eval-failed.html
            # Print environment variables (excluding secrets) for debugging
            echo "Environment variables for debugging:"
            env | grep -v -E "HONEYCOMB_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY" | sort
          else
            echo "EVAL_OUTCOME=success" >> $GITHUB_ENV
          fi
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          # Use Honeycomb API key for environment variable-based config
          HONEYCOMB_API_KEY: ${{ secrets.HONEYCOMB_API_KEY }}
          # Use only limited models for CI to save costs
          EVAL_MODELS: '{"openai":"gpt-4o-mini","anthropic":"claude-3-5-haiku-latest"}'
          EVAL_CONCURRENCY: 2
          EVAL_JUDGE_PROVIDER: "anthropic"
          EVAL_JUDGE_MODEL: "claude-3-5-haiku-latest"
          MCP_SERVER_COMMAND: "node build/index.mjs"
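      # Note: the env block above applies only to the "Run evaluations" step;
      # MCP_SERVER_COMMAND launches the build output verified in "Verify build file exists".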
      - name: Ensure reports directory exists
        run: mkdir -p eval/reports
      - name: Create placeholder if no reports were generated
        run: |
          # Check if any HTML reports exist
          if [ -z "$(find eval/reports -name '*.html' 2>/dev/null)" ]; then
            echo "No reports were generated, creating a placeholder"
            echo '<!DOCTYPE html>' > eval/reports/no-reports.html
            echo '<html><head><title>No Reports</title></head>' >> eval/reports/no-reports.html
            echo '<body><h1>No evaluation reports generated</h1>' >> eval/reports/no-reports.html
            echo '<p>This could be due to missing API keys or configuration.</p>' >> eval/reports/no-reports.html
            echo '</body></html>' >> eval/reports/no-reports.html
          fi
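      # Pick the most recently modified HTML report (ls -t sorts newest first);
      # the placeholder created above guarantees at least one file exists.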
      - name: Find latest report
        id: find-report
        run: |
          LATEST_REPORT=$(ls -t eval/reports/*.html 2>/dev/null | head -1)
          LATEST_REPORT=${LATEST_REPORT:-eval/reports/no-reports.html}
          echo "latest_report=$LATEST_REPORT" >> $GITHUB_OUTPUT
      - name: Post report summary
        run: |
          if [ "$EVAL_OUTCOME" == "failed" ]; then
            echo "## ❌ Evaluation Failed" > $GITHUB_STEP_SUMMARY
            echo "The evaluation process encountered errors. See logs for details." >> $GITHUB_STEP_SUMMARY
            echo "" >> $GITHUB_STEP_SUMMARY
            echo "Error report: $(basename ${{ steps.find-report.outputs.latest_report }})" >> $GITHUB_STEP_SUMMARY
            echo "" >> $GITHUB_STEP_SUMMARY
            echo "The error report is available as a workflow artifact." >> $GITHUB_STEP_SUMMARY
          else
            echo "## ✅ Evaluation Results" > $GITHUB_STEP_SUMMARY
            echo "Ran evaluations with OpenAI and Anthropic models." >> $GITHUB_STEP_SUMMARY
            echo "" >> $GITHUB_STEP_SUMMARY
            echo "### Summary" >> $GITHUB_STEP_SUMMARY
            echo "Latest report: $(basename ${{ steps.find-report.outputs.latest_report }})" >> $GITHUB_STEP_SUMMARY
            echo "" >> $GITHUB_STEP_SUMMARY
            echo "The full report is available as a workflow artifact." >> $GITHUB_STEP_SUMMARY
          fi

          # Add a PR comment if we're on a PR
          if [ "${{ github.event_name }}" == "pull_request" ]; then
            # Start with basic PR comment header
            echo "## Honeycomb MCP Evaluation Results" > pr_comment.txt
            echo "" >> pr_comment.txt
            if [ "$EVAL_OUTCOME" == "failed" ]; then
              echo "❌ Evaluation process failed" >> pr_comment.txt
              echo "" >> pr_comment.txt
              echo "The evaluation process encountered errors. See workflow logs for details." >> pr_comment.txt
            else
              # Find the latest summary file
              LATEST_SUMMARY=$(find eval/results -name "summary-*.json" -type f | sort -r | head -1)
              if [ -n "$LATEST_SUMMARY" ] && [ -f "$LATEST_SUMMARY" ]; then
                echo "Found summary file: $LATEST_SUMMARY"
                # Extract key metrics (default to 0 if successRate is missing or null)
                RATE=$(jq -r '.successRate // 0' "$LATEST_SUMMARY" 2>/dev/null || echo "0")
                # Calculate percentage with bc (more reliable than jq for math)
                SUCCESS_RATE=$(echo "$RATE * 100" | bc -l | awk '{printf "%.1f", $0}')
                PASSED=$(jq -r '.passed' "$LATEST_SUMMARY" 2>/dev/null || echo "N/A")
                TOTAL=$(jq -r '.totalTests' "$LATEST_SUMMARY" 2>/dev/null || echo "N/A")
                # Use bc for reliable floating point comparison
                if (( $(echo "$RATE >= 0.75" | bc -l) )); then
                  echo "✅ Evaluations completed successfully: **${SUCCESS_RATE}%** pass rate (${PASSED}/${TOTAL} tests)" >> pr_comment.txt
                elif (( $(echo "$RATE >= 0.5" | bc -l) )); then
                  echo "⚠️ Evaluations completed with mixed results: **${SUCCESS_RATE}%** pass rate (${PASSED}/${TOTAL} tests)" >> pr_comment.txt
                else
                  echo "❌ Evaluations completed with poor results: **${SUCCESS_RATE}%** pass rate (${PASSED}/${TOTAL} tests)" >> pr_comment.txt
                fi
                echo "" >> pr_comment.txt
                # Basic metrics table
                echo "### Evaluation Summary" >> pr_comment.txt
                echo "" >> pr_comment.txt
                echo "| Metric | Value |" >> pr_comment.txt
                echo "|--------|-------|" >> pr_comment.txt
                echo "| Success Rate | ${SUCCESS_RATE}% |" >> pr_comment.txt
                echo "| Tests Passed | $PASSED / $TOTAL |" >> pr_comment.txt
                # Add latency if available
                AVG_LATENCY=$(jq -r '.averageLatency' "$LATEST_SUMMARY" 2>/dev/null || echo "N/A")
                if [ "$AVG_LATENCY" != "N/A" ] && [ "$AVG_LATENCY" != "null" ]; then
                  AVG_LATENCY_INT=$(echo "$AVG_LATENCY" | awk '{printf "%.0f", $0}')
                  echo "| Avg Latency | ${AVG_LATENCY_INT}ms |" >> pr_comment.txt
                fi
                # Add basic model information
                echo "" >> pr_comment.txt
                echo "### Models Tested" >> pr_comment.txt
                echo "" >> pr_comment.txt
                # Extract providers directly
                echo "| Provider | Model |" >> pr_comment.txt
                echo "|----------|-------|" >> pr_comment.txt
                # OpenAI models
                OPENAI_MODELS=$(jq -r '.results[] | select(.provider == "openai") | .model' "$LATEST_SUMMARY" 2>/dev/null | sort -u)
                if [ -n "$OPENAI_MODELS" ]; then
                  while read -r model; do
                    if [ -n "$model" ]; then
                      echo "| OpenAI | $model |" >> pr_comment.txt
                    fi
                  done <<< "$OPENAI_MODELS"
                fi
                # Anthropic models
                ANTHROPIC_MODELS=$(jq -r '.results[] | select(.provider == "anthropic") | .model' "$LATEST_SUMMARY" 2>/dev/null | sort -u)
                if [ -n "$ANTHROPIC_MODELS" ]; then
                  while read -r model; do
                    if [ -n "$model" ]; then
                      echo "| Anthropic | $model |" >> pr_comment.txt
                    fi
                  done <<< "$ANTHROPIC_MODELS"
                fi
              else
                echo "✅ Evaluations completed successfully" >> pr_comment.txt
                echo "" >> pr_comment.txt
                echo "No detailed metrics available" >> pr_comment.txt
              fi
              # Always add a link to the artifacts
              echo "" >> pr_comment.txt
              echo "📊 [View full report in workflow artifacts](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})" >> pr_comment.txt
            fi
            # Post the comment to the PR
            gh pr comment ${{ github.event.pull_request.number }} --body-file pr_comment.txt
          fi
        env:
          GH_TOKEN: ${{ github.token }}
      # Create report index if it doesn't exist
      - name: Generate report index if needed
        run: |
          if [ ! -f "eval/reports/index.html" ]; then
            echo "Generating index.html for reports using the update-index script"
            pnpm run eval:update-index
          fi
      # Upload evaluation reports as artifacts
      - name: Upload evaluation reports
        uses: actions/upload-artifact@v4
        with:
          name: evaluation-reports
          path: eval/reports/
          retention-days: 30
      # Final step to fail the job if evaluations failed
      - name: Check final evaluation status
        if: env.EVAL_OUTCOME == 'failed'
        run: |
          echo "::error::Evaluation failed - see artifacts for error report"
          exit 1