Showcase: Smoke Monitor #137

name: "Showcase: Smoke Monitor"
on:
schedule:
- cron: "*/15 * * * *"
workflow_dispatch: {}
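# Runs every 15 minutes (plus manual dispatch). Each run restores persisted state
# from the Actions cache, probes every showcase deployment's /api/smoke endpoint,
# posts failure/recovery alerts to Slack, checks GHCR image tags for drift against
# the last showcase-related commits, triggers rebuilds for stale services, and
# saves the updated state back to the cache.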
jobs:
smoke-check:
name: Smoke Check
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
timeout-minutes: 5
permissions:
packages: read
actions: write
steps:
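# State persistence pattern: the restore key is intentionally unmatchable, so the
# restore always falls through to the `smoke-monitor-state-` prefix and picks up
# the most recently saved entry. Cache entries are immutable, so the final step
# saves under a fresh `smoke-monitor-state-<run_id>` key on every run.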
- name: Restore state from cache
id: cache-restore
uses: actions/cache/restore@v4
with:
path: smoke-state.json
key: smoke-monitor-state-impossible-match
restore-keys: |
smoke-monitor-state-
- name: Initialize state if missing
run: |
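# Seed an all-ok baseline on the first run (or whenever no cached state was restored).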
if [ ! -f smoke-state.json ]; then
cat > smoke-state.json <<'INIT'
{
"langgraph-python": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"langgraph-typescript": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"langgraph-fastapi": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"mastra": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"crewai-crews": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"pydantic-ai": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"google-adk": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"agno": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"ag2": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"llamaindex": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"strands": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"ms-agent-python": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"ms-agent-dotnet": { "status": "ok", "fail_count": 0, "first_failure_at": "" }
}
INIT
fi
- name: Run smoke checks
id: smoke
run: |
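# Map each showcase service slug to its Railway deployment; /api/smoke is appended per check.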
declare -A URLS
URLS[langgraph-python]="https://showcase-langgraph-python-production.up.railway.app"
URLS[langgraph-typescript]="https://showcase-langgraph-typescript-production.up.railway.app"
URLS[langgraph-fastapi]="https://showcase-langgraph-fastapi-production.up.railway.app"
URLS[mastra]="https://showcase-mastra-production.up.railway.app"
URLS[crewai-crews]="https://showcase-crewai-crews-production.up.railway.app"
URLS[pydantic-ai]="https://showcase-pydantic-ai-production.up.railway.app"
URLS[google-adk]="https://showcase-google-adk-production.up.railway.app"
URLS[agno]="https://showcase-agno-production.up.railway.app"
URLS[ag2]="https://showcase-ag2-production.up.railway.app"
URLS[llamaindex]="https://showcase-llamaindex-production.up.railway.app"
URLS[strands]="https://showcase-strands-production.up.railway.app"
URLS[ms-agent-python]="https://showcase-ms-agent-python-production.up.railway.app"
URLS[ms-agent-dotnet]="https://showcase-ms-agent-dotnet-production.up.railway.app"
NOW="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
ALERTS=""
STATE="$(cat smoke-state.json)"
RUN_URL="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
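# For each service: read its previous state, probe the smoke endpoint, and build
# Slack alert lines for new failures, escalations, and recoveries.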
for SLUG in "${!URLS[@]}"; do
URL="${URLS[$SLUG]}/api/smoke"
PREV_STATUS=$(echo "$STATE" | jq -r --arg s "$SLUG" '.[$s].status // "ok"')
PREV_FAIL_COUNT=$(echo "$STATE" | jq -r --arg s "$SLUG" '.[$s].fail_count // 0')
PREV_FIRST_FAILURE=$(echo "$STATE" | jq -r --arg s "$SLUG" '.[$s].first_failure_at // ""')
# Hit the endpoint
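# -w "\n%{http_code}" appends the status code on its own line; head/tail split body from code below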
HTTP_RESPONSE=$(curl -s -w "\n%{http_code}" --max-time 45 "$URL" 2>&1) || true
HTTP_BODY=$(echo "$HTTP_RESPONSE" | head -n -1)
HTTP_CODE=$(echo "$HTTP_RESPONSE" | tail -n 1)
# Determine if healthy
HEALTHY=false
if [[ "$HTTP_CODE" =~ ^2[0-9][0-9]$ ]]; then
# Try to parse JSON and check status field
SMOKE_STATUS=$(echo "$HTTP_BODY" | jq -r '.status // empty' 2>/dev/null) || true
if [ "$SMOKE_STATUS" = "ok" ] || [ "$SMOKE_STATUS" = "healthy" ]; then
HEALTHY=true
elif [ -z "$SMOKE_STATUS" ]; then
# No status field but 2xx — treat as healthy
HEALTHY=true
fi
fi
if [ "$HEALTHY" = true ]; then
# Recovery case
if [ "$PREV_STATUS" = "failing" ]; then
ALERTS="${ALERTS}:white_check_mark: *${SLUG}* recovered (was down since ${PREV_FIRST_FAILURE})\n"
fi
STATE=$(echo "$STATE" | jq --arg s "$SLUG" '.[$s] = {"status":"ok","fail_count":0,"first_failure_at":""}')
else
# Failure case
NEW_FAIL_COUNT=$((PREV_FAIL_COUNT + 1))
FIRST_FAILURE="$PREV_FIRST_FAILURE"
if [ -z "$FIRST_FAILURE" ] || [ "$FIRST_FAILURE" = "" ]; then
FIRST_FAILURE="$NOW"
fi
# Build error description
if [[ "$HTTP_CODE" =~ ^[0-9]+$ ]] && [ "$HTTP_CODE" -gt 0 ] 2>/dev/null; then
ERROR_DESC="HTTP ${HTTP_CODE}"
else
ERROR_DESC="connection failed"
fi
SVC_URL="${URLS[$SLUG]}"
ALERTS="${ALERTS}:red_circle: *${SLUG}* — attempt: ${NEW_FAIL_COUNT}, error: ${ERROR_DESC} (<${SVC_URL}/api/smoke|smoke> · <${SVC_URL}/api/health|health>)\n"
# Escalation at 4 consecutive failures (1 hour at 15-min intervals)
if [ "$NEW_FAIL_COUNT" -eq 4 ]; then
ALERTS="${ALERTS}<!channel> :rotating_light: *${SLUG}* has been failing for 1 hour (since ${FIRST_FAILURE})\n"
fi
STATE=$(echo "$STATE" | jq \
--arg s "$SLUG" \
--argjson fc "$NEW_FAIL_COUNT" \
--arg ff "$FIRST_FAILURE" \
'.[$s] = {"status":"failing","fail_count":$fc,"first_failure_at":$ff}')
fi
done
# Write updated state
echo "$STATE" | jq '.' > smoke-state.json
# Write alerts for the Slack step
if [ -n "$ALERTS" ]; then
{
echo "has_alerts=true"
echo "run_url=${RUN_URL}"
} >> "$GITHUB_OUTPUT"
# Write alerts to a file to avoid escaping issues
printf "%b" "$ALERTS" > alerts.txt
echo "" >> alerts.txt
echo "<https://showcase.copilotkit.ai|Showcase> · <${RUN_URL}|Workflow run>" >> alerts.txt
else
echo "has_alerts=false" >> "$GITHUB_OUTPUT"
fi
- name: Build Slack payload
if: steps.smoke.outputs.has_alerts == 'true'
run: |
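# jq --rawfile JSON-escapes the multi-line alert text, avoiding manual quoting issues.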
jq -n --rawfile text alerts.txt '{"text": $text}' > slack-payload.json
- name: Post to Slack
if: steps.smoke.outputs.has_alerts == 'true'
uses: slackapi/slack-github-action@v2.1.0
with:
webhook: ${{ secrets.SLACK_WEBHOOK_OSS_ALERTS }}
webhook-type: incoming-webhook
payload-file-path: slack-payload.json
- name: Check image drift
id: image_drift
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Skip if the latest main commit is <20 min old (deploy is probably still building)
COMMIT_DATE=$(gh api "/repos/${{ github.repository }}/commits/main" --jq '.commit.committer.date' 2>/dev/null) || true
if [ -n "$COMMIT_DATE" ]; then
COMMIT_TS=$(date -d "$COMMIT_DATE" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$COMMIT_DATE" +%s 2>/dev/null) || true
NOW_TS=$(date +%s)
if [ -z "$COMMIT_TS" ]; then COMMIT_TS=$NOW_TS; fi
AGE_MIN=$(( (NOW_TS - COMMIT_TS) / 60 ))
echo "Latest main commit is ${AGE_MIN}m old"
if [ "$AGE_MIN" -lt 20 ]; then
echo "Skipping drift check — deploy likely still in progress"
echo "has_stale=false" >> "$GITHUB_OUTPUT"
exit 0
fi
fi
STALE=""
STALE_LIST=""
SERVICES=(
shell langgraph-python langgraph-typescript langgraph-fastapi
mastra crewai-crews pydantic-ai google-adk ag2 agno llamaindex
strands ms-agent-python ms-agent-dotnet claude-sdk-python
claude-sdk-typescript langroid spring-ai aimock
)
# Compare against the last commits that touched showcase-related paths,
# NOT main HEAD. Deploys only trigger on showcase/ and examples/integrations/
# changes, so non-showcase commits shouldn't make images appear stale.
SHOWCASE_SHA=$(gh api "repos/${{ github.repository }}/commits?sha=main&path=showcase&per_page=1" --jq '.[0].sha // empty') || {
echo "::warning::Failed to fetch showcase/ commit SHA"
SHOWCASE_SHA=""
}
EXAMPLES_SHA=$(gh api "repos/${{ github.repository }}/commits?sha=main&path=examples/integrations&per_page=1" --jq '.[0].sha // empty') || {
echo "::warning::Failed to fetch examples/integrations/ commit SHA"
EXAMPLES_SHA=""
}
echo "Last showcase/ SHA: ${SHOWCASE_SHA:0:8}, Last examples/integrations/ SHA: ${EXAMPLES_SHA:0:8}"
if [ -z "$SHOWCASE_SHA" ] && [ -z "$EXAMPLES_SHA" ]; then
echo "::warning::Could not resolve any path-specific SHAs — skipping drift check"
echo "has_stale=false" >> "$GITHUB_OUTPUT"
else
for SVC in "${SERVICES[@]}"; do
PKG="showcase-${SVC}"
# Get tags for the latest version via GitHub Packages API
TAGS=$(gh api "/orgs/copilotkit/packages/container/${PKG}/versions?per_page=1" \
--jq '.[0].metadata.container.tags | join(" ")' 2>/dev/null) || true
if [ -z "$TAGS" ]; then
continue # No package versions — new service, not yet built
fi
# Image is up to date if it matches EITHER the last showcase/ or
# examples/integrations/ commit (deploy triggers on both paths)
UP_TO_DATE=false
if [ -n "$SHOWCASE_SHA" ] && echo "$TAGS" | grep -q "$SHOWCASE_SHA"; then
UP_TO_DATE=true
fi
if [ -n "$EXAMPLES_SHA" ] && echo "$TAGS" | grep -q "$EXAMPLES_SHA"; then
UP_TO_DATE=true
fi
if [ "$UP_TO_DATE" = true ]; then
continue
fi
echo " ${SVC}: stale (tags: ${TAGS:0:60})"
STALE="${STALE}:warning: *${SVC}* — image stale\n"
STALE_LIST="${STALE_LIST} ${SVC}"
done
# Compare against previous drift state to avoid alerting repeatedly
PREV_STALE=$(jq -r '.image_drift_services // ""' smoke-state.json 2>/dev/null) || true
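# Normalize ordering so the set comparison is independent of discovery order.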
SORTED_STALE=$(echo "$STALE_LIST" | tr ' ' '\n' | sort | tr '\n' ' ' | xargs)
if [ -n "$STALE_LIST" ]; then
echo "Image drift detected:${STALE_LIST}"
echo "stale_services=${STALE_LIST}" >> "$GITHUB_OUTPUT"
printf "%b" "$STALE" > image-drift.txt
if [ "$SORTED_STALE" != "$PREV_STALE" ]; then
echo "Stale set changed (was: '${PREV_STALE}', now: '${SORTED_STALE}') — alerting"
echo "has_stale=true" >> "$GITHUB_OUTPUT"
else
echo "Same stale set as last run — suppressing alert"
echo "has_stale=false" >> "$GITHUB_OUTPUT"
fi
# Persist current stale set
STATE=$(cat smoke-state.json)
echo "$STATE" | jq --arg ds "$SORTED_STALE" '. + {"image_drift_services": $ds}' > smoke-state.json
else
echo "All images up to date"
echo "has_stale=false" >> "$GITHUB_OUTPUT"
# Clear drift state
STATE=$(cat smoke-state.json)
echo "$STATE" | jq 'del(.image_drift_services)' > smoke-state.json
fi
fi # end SHA guard
- name: Trigger rebuild for stale services
id: rebuild
if: steps.image_drift.outputs.has_stale == 'true'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
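# Dispatch showcase_deploy.yml once per stale service; record any dispatch failures for the Slack summary.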
FAILED=""
FAILED_COUNT=0
: > rebuild-failures.txt
COUNT=0
for SVC in ${{ steps.image_drift.outputs.stale_services }}; do
echo "Triggering rebuild for ${SVC}..."
RC=0
ERR_OUTPUT=$(gh workflow run showcase_deploy.yml --repo "${{ github.repository }}" -f service="${SVC}" 2>&1) || RC=$?
if [ "$RC" -eq 0 ]; then
COUNT=$((COUNT + 1))
else
echo "::warning::Failed to trigger rebuild for ${SVC}: ${ERR_OUTPUT}"
# Collapse whitespace/newlines in the error so it fits on one Slack line
REASON=$(echo "$ERR_OUTPUT" | tr '\n' ' ' | sed 's/ */ /g' | sed 's/^ *//;s/ *$//')
if [ -z "$REASON" ]; then
REASON="unknown error"
fi
FAILED="${FAILED} ${SVC}"
FAILED_COUNT=$((FAILED_COUNT + 1))
printf ":x: *%s* — %s\n" "$SVC" "$REASON" >> rebuild-failures.txt
fi
done
echo "triggered_count=${COUNT}" >> "$GITHUB_OUTPUT"
echo "failed_count=${FAILED_COUNT}" >> "$GITHUB_OUTPUT"
if [ "$FAILED_COUNT" -gt 0 ]; then
# Surface failures via annotation and fail the job so the run shows
# RED in GitHub Actions UI / `gh run list`. Slack dedup is handled
# by guarding the failure() notifier with
# `steps.image_drift.outputs.has_stale != 'true'`, so this exit 1
# does NOT cause double-posting — the detailed drift-alert step
# covers the drift path.
echo "::error::Failed to trigger rebuilds for:${FAILED}"
echo "has_failures=true" >> "$GITHUB_OUTPUT"
exit 1
else
echo "has_failures=false" >> "$GITHUB_OUTPUT"
fi
- name: Alert image drift to Slack
if: always() && steps.image_drift.outputs.has_stale == 'true'
run: |
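# Compose the drift alert; `always()` keeps this step running even when the rebuild step exits 1.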
RUN_URL="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
COUNT="${{ steps.rebuild.outputs.triggered_count }}"
FAILED_COUNT="${{ steps.rebuild.outputs.failed_count }}"
if [ -z "$COUNT" ]; then
# Fallback: rebuild step didn't run or didn't set the output; count from stale_services
COUNT=$(echo "${{ steps.image_drift.outputs.stale_services }}" | wc -w | tr -d ' ')
fi
if [ -z "$FAILED_COUNT" ]; then
FAILED_COUNT=0
fi
NOUN="rebuilds"
if [ "$COUNT" = "1" ]; then NOUN="rebuild"; fi
if [ "${{ steps.rebuild.outputs.has_failures }}" = "true" ] && [ -s rebuild-failures.txt ]; then
# Failure case: list only the services that failed to rebuild, with reasons.
# COUNT reflects only successfully-triggered rebuilds; FAILED_COUNT is the rest.
{
printf ":package: *Image drift detected — %s %s triggered, %s failed:*\n" "$COUNT" "$NOUN" "$FAILED_COUNT"
cat rebuild-failures.txt
printf "<%s|Workflow run>\n" "$RUN_URL"
} > drift-message.txt
else
# Success case: just summarize the count with a link to the run
printf ":package: Image drift detected — %s %s triggered (<%s|run>)\n" "$COUNT" "$NOUN" "$RUN_URL" > drift-message.txt
fi
jq -n --rawfile text drift-message.txt '{"text": $text}' > drift-payload.json
- name: Post image drift to Slack
if: always() && steps.image_drift.outputs.has_stale == 'true'
uses: slackapi/slack-github-action@v2.1.0
with:
webhook: ${{ secrets.SLACK_WEBHOOK_OSS_ALERTS }}
webhook-type: incoming-webhook
payload-file-path: drift-payload.json
- name: Notify Slack (workflow failure)
# Suppress when the drift-alert path is active — that step already
# posts a detailed Slack message covering per-service rebuild results.
# This generic notifier still fires for non-drift failures (e.g., the
# smoke-check step itself fails before drift detection runs).
if: failure() && steps.image_drift.outputs.has_stale != 'true'
uses: slackapi/slack-github-action@v2.1.0
with:
webhook: ${{ secrets.SLACK_WEBHOOK_OSS_ALERTS }}
webhook-type: incoming-webhook
payload: |
{ "text": ":x: *Smoke monitor*: workflow failed | <https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>" }
- name: Save state to cache
if: always()
uses: actions/cache/save@v4
with:
path: smoke-state.json
key: smoke-monitor-state-${{ github.run_id }}