Showcase: Smoke Monitor #137

name: "Showcase: Smoke Monitor"
on:
schedule:
- cron: "*/15 * * * *"
workflow_dispatch: {}
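# Runs every 15 minutes (plus manual dispatch). Each run restores persisted state
# from the Actions cache, probes every showcase deployment's /api/smoke endpoint,
# posts failure/recovery alerts to Slack, checks GHCR image tags for drift against
# the last showcase-related commits, triggers rebuilds for stale services, and
# saves the updated state back to the cache.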
jobs:
smoke-check:
name: Smoke Check
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
timeout-minutes: 5
permissions:
packages: read
actions: write
steps:
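# State persistence pattern: the restore key is intentionally unmatchable, so the
# restore always falls through to the `smoke-monitor-state-` prefix and picks up
# the most recently saved entry. Cache entries are immutable, so the final step
# saves under a fresh `smoke-monitor-state-<run_id>` key on every run.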
- name: Restore state from cache
id: cache-restore
uses: actions/cache/restore@v4
with:
path: smoke-state.json
key: smoke-monitor-state-impossible-match
restore-keys: |
smoke-monitor-state-
- name: Initialize state if missing
run: |
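# Seed an all-ok baseline on the first run (or whenever no cached state was restored).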
if [ ! -f smoke-state.json ]; then
cat > smoke-state.json <<'INIT'
{
"langgraph-python": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"langgraph-typescript": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"langgraph-fastapi": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"mastra": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"crewai-crews": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"pydantic-ai": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"google-adk": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"agno": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"ag2": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"llamaindex": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"strands": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"ms-agent-python": { "status": "ok", "fail_count": 0, "first_failure_at": "" },
"ms-agent-dotnet": { "status": "ok", "fail_count": 0, "first_failure_at": "" }
}
INIT
fi
- name: Run smoke checks
id: smoke
run: |
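# Map each showcase service slug to its Railway deployment; /api/smoke is appended per check.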
declare -A URLS
URLS[langgraph-python]="https://showcase-langgraph-python-production.up.railway.app"
URLS[langgraph-typescript]="https://showcase-langgraph-typescript-production.up.railway.app"
URLS[langgraph-fastapi]="https://showcase-langgraph-fastapi-production.up.railway.app"
URLS[mastra]="https://showcase-mastra-production.up.railway.app"
URLS[crewai-crews]="https://showcase-crewai-crews-production.up.railway.app"
URLS[pydantic-ai]="https://showcase-pydantic-ai-production.up.railway.app"
URLS[google-adk]="https://showcase-google-adk-production.up.railway.app"
URLS[agno]="https://showcase-agno-production.up.railway.app"
URLS[ag2]="https://showcase-ag2-production.up.railway.app"
URLS[llamaindex]="https://showcase-llamaindex-production.up.railway.app"
URLS[strands]="https://showcase-strands-production.up.railway.app"
URLS[ms-agent-python]="https://showcase-ms-agent-python-production.up.railway.app"
URLS[ms-agent-dotnet]="https://showcase-ms-agent-dotnet-production.up.railway.app"
NOW="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
ALERTS=""
STATE="$(cat smoke-state.json)"
RUN_URL="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
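# For each service: read its previous state, probe the smoke endpoint, and build
# Slack alert lines for new failures, escalations, and recoveries.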
for SLUG in "${!URLS[@]}"; do
URL="${URLS[$SLUG]}/api/smoke"
PREV_STATUS=$(echo "$STATE" | jq -r --arg s "$SLUG" '.[$s].status // "ok"')
PREV_FAIL_COUNT=$(echo "$STATE" | jq -r --arg s "$SLUG" '.[$s].fail_count // 0')
PREV_FIRST_FAILURE=$(echo "$STATE" | jq -r --arg s "$SLUG" '.[$s].first_failure_at // ""')
# Hit the endpoint
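# -w "\n%{http_code}" appends the status code on its own line; head/tail split body from code below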
HTTP_RESPONSE=$(curl -s -w "\n%{http_code}" --max-time 45 "$URL" 2>&1) || true
HTTP_BODY=$(echo "$HTTP_RESPONSE" | head -n -1)
HTTP_CODE=$(echo "$HTTP_RESPONSE" | tail -n 1)
# Determine if healthy
HEALTHY=false
if [[ "$HTTP_CODE" =~ ^2[0-9][0-9]$ ]]; then
# Try to parse JSON and check status field
SMOKE_STATUS=$(echo "$HTTP_BODY" | jq -r '.status // empty' 2>/dev/null) || true
if [ "$SMOKE_STATUS" = "ok" ] || [ "$SMOKE_STATUS" = "healthy" ]; then
HEALTHY=true
elif [ -z "$SMOKE_STATUS" ]; then
# No status field but 2xx — treat as healthy
HEALTHY=true
fi
fi
if [ "$HEALTHY" = true ]; then
# Recovery case
if [ "$PREV_STATUS" = "failing" ]; then
ALERTS="${ALERTS}:white_check_mark: *${SLUG}* recovered (was down since ${PREV_FIRST_FAILURE})\n"
fi
STATE=$(echo "$STATE" | jq --arg s "$SLUG" '.[$s] = {"status":"ok","fail_count":0,"first_failure_at":""}')
else
# Failure case
NEW_FAIL_COUNT=$((PREV_FAIL_COUNT + 1))
FIRST_FAILURE="$PREV_FIRST_FAILURE"
if [ -z "$FIRST_FAILURE" ] || [ "$FIRST_FAILURE" = "" ]; then
FIRST_FAILURE="$NOW"
fi
# Build error description
if [[ "$HTTP_CODE" =~ ^[0-9]+$ ]] && [ "$HTTP_CODE" -gt 0 ] 2>/dev/null; then
ERROR_DESC="HTTP ${HTTP_CODE}"
else
ERROR_DESC="connection failed"
fi
SVC_URL="${URLS[$SLUG]}"
ALERTS="${ALERTS}:red_circle: *${SLUG}* — attempt: ${NEW_FAIL_COUNT}, error: ${ERROR_DESC} (<${SVC_URL}/api/smoke|smoke> · <${SVC_URL}/api/health|health>)\n"
# Escalation at 4 consecutive failures (1 hour at 15-min intervals)
if [ "$NEW_FAIL_COUNT" -eq 4 ]; then
ALERTS="${ALERTS}<!channel> :rotating_light: *${SLUG}* has been failing for 1 hour (since ${FIRST_FAILURE})\n"
fi
STATE=$(echo "$STATE" | jq \
--arg s "$SLUG" \
--argjson fc "$NEW_FAIL_COUNT" \
--arg ff "$FIRST_FAILURE" \
'.[$s] = {"status":"failing","fail_count":$fc,"first_failure_at":$ff}')
fi
done
# Write updated state
echo "$STATE" | jq '.' > smoke-state.json
# Write alerts for the Slack step
if [ -n "$ALERTS" ]; then
{
echo "has_alerts=true"
echo "run_url=${RUN_URL}"
} >> "$GITHUB_OUTPUT"
# Write alerts to a file to avoid escaping issues
printf "%b" "$ALERTS" > alerts.txt
echo "" >> alerts.txt
echo "<https://showcase.copilotkit.ai|Showcase> · <${RUN_URL}|Workflow run>" >> alerts.txt
else
echo "has_alerts=false" >> "$GITHUB_OUTPUT"
fi
- name: Build Slack payload
if: steps.smoke.outputs.has_alerts == 'true'
run: |
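# jq --rawfile JSON-escapes the multi-line alert text, avoiding manual quoting issues.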
jq -n --rawfile text alerts.txt '{"text": $text}' > slack-payload.json
- name: Post to Slack
if: steps.smoke.outputs.has_alerts == 'true'
uses: slackapi/slack-github-action@v2.1.0
with:
webhook: ${{ secrets.SLACK_WEBHOOK_OSS_ALERTS }}
webhook-type: incoming-webhook
payload-file-path: slack-payload.json
- name: Check image drift
id: image_drift
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Skip if the latest main commit is <20 min old (deploy is probably still building)
COMMIT_DATE=$(gh api "/repos/${{ github.repository }}/commits/main" --jq '.commit.committer.date' 2>/dev/null) || true
if [ -n "$COMMIT_DATE" ]; then
COMMIT_TS=$(date -d "$COMMIT_DATE" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$COMMIT_DATE" +%s 2>/dev/null) || true
NOW_TS=$(date +%s)
if [ -z "$COMMIT_TS" ]; then COMMIT_TS=$NOW_TS; fi
AGE_MIN=$(( (NOW_TS - COMMIT_TS) / 60 ))
echo "Latest main commit is ${AGE_MIN}m old"
if [ "$AGE_MIN" -lt 20 ]; then
echo "Skipping drift check — deploy likely still in progress"
echo "has_stale=false" >> "$GITHUB_OUTPUT"
exit 0
fi
fi
STALE=""
STALE_LIST=""
SERVICES=(
shell langgraph-python langgraph-typescript langgraph-fastapi
mastra crewai-crews pydantic-ai google-adk ag2 agno llamaindex
strands ms-agent-python ms-agent-dotnet claude-sdk-python
claude-sdk-typescript langroid spring-ai aimock
)
# Compare against the last commits that touched showcase-related paths,
# NOT main HEAD. Deploys only trigger on showcase/ and examples/integrations/
# changes, so non-showcase commits shouldn't make images appear stale.
SHOWCASE_SHA=$(gh api "repos/${{ github.repository }}/commits?sha=main&path=showcase&per_page=1" --jq '.[0].sha // empty') || {
echo "::warning::Failed to fetch showcase/ commit SHA"
SHOWCASE_SHA=""
}
EXAMPLES_SHA=$(gh api "repos/${{ github.repository }}/commits?sha=main&path=examples/integrations&per_page=1" --jq '.[0].sha // empty') || {
echo "::warning::Failed to fetch examples/integrations/ commit SHA"
EXAMPLES_SHA=""
}
echo "Last showcase/ SHA: ${SHOWCASE_SHA:0:8}, Last examples/integrations/ SHA: ${EXAMPLES_SHA:0:8}"
if [ -z "$SHOWCASE_SHA" ] && [ -z "$EXAMPLES_SHA" ]; then
echo "::warning::Could not resolve any path-specific SHAs — skipping drift check"
echo "has_stale=false" >> "$GITHUB_OUTPUT"
else
for SVC in "${SERVICES[@]}"; do
PKG="showcase-${SVC}"
# Get tags for the latest version via GitHub Packages API
TAGS=$(gh api "/orgs/copilotkit/packages/container/${PKG}/versions?per_page=1" \
--jq '.[0].metadata.container.tags | join(" ")' 2>/dev/null) || true
if [ -z "$TAGS" ]; then
continue # No package versions — new service, not yet built
fi
# Image is up to date if it matches EITHER the last showcase/ or
# examples/integrations/ commit (deploy triggers on both paths)
UP_TO_DATE=false
if [ -n "$SHOWCASE_SHA" ] && echo "$TAGS" | grep -q "$SHOWCASE_SHA"; then
UP_TO_DATE=true
fi
if [ -n "$EXAMPLES_SHA" ] && echo "$TAGS" | grep -q "$EXAMPLES_SHA"; then
UP_TO_DATE=true
fi
if [ "$UP_TO_DATE" = true ]; then
continue
fi
echo " ${SVC}: stale (tags: ${TAGS:0:60})"
STALE="${STALE}:warning: *${SVC}* — image stale\n"
STALE_LIST="${STALE_LIST} ${SVC}"
done
# Compare against previous drift state to avoid alerting repeatedly
PREV_STALE=$(jq -r '.image_drift_services // ""' smoke-state.json 2>/dev/null) || true
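# Normalize ordering so the set comparison is independent of discovery order.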
SORTED_STALE=$(echo "$STALE_LIST" | tr ' ' '\n' | sort | tr '\n' ' ' | xargs)
if [ -n "$STALE_LIST" ]; then
echo "Image drift detected:${STALE_LIST}"
echo "stale_services=${STALE_LIST}" >> "$GITHUB_OUTPUT"
printf "%b" "$STALE" > image-drift.txt
if [ "$SORTED_STALE" != "$PREV_STALE" ]; then
echo "Stale set changed (was: '${PREV_STALE}', now: '${SORTED_STALE}') — alerting"
echo "has_stale=true" >> "$GITHUB_OUTPUT"
else
echo "Same stale set as last run — suppressing alert"
echo "has_stale=false" >> "$GITHUB_OUTPUT"
fi
# Persist current stale set
STATE=$(cat smoke-state.json)
echo "$STATE" | jq --arg ds "$SORTED_STALE" '. + {"image_drift_services": $ds}' > smoke-state.json
else
echo "All images up to date"
echo "has_stale=false" >> "$GITHUB_OUTPUT"
# Clear drift state
STATE=$(cat smoke-state.json)
echo "$STATE" | jq 'del(.image_drift_services)' > smoke-state.json
fi
fi # end SHA guard
- name: Trigger rebuild for stale services
id: rebuild
if: steps.image_drift.outputs.has_stale == 'true'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
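# Dispatch showcase_deploy.yml once per stale service; record any dispatch failures for the Slack summary.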
FAILED=""
FAILED_COUNT=0
: > rebuild-failures.txt
COUNT=0
for SVC in ${{ steps.image_drift.outputs.stale_services }}; do
echo "Triggering rebuild for ${SVC}..."
RC=0
ERR_OUTPUT=$(gh workflow run showcase_deploy.yml --repo "${{ github.repository }}" -f service="${SVC}" 2>&1) || RC=$?
if [ "$RC" -eq 0 ]; then
COUNT=$((COUNT + 1))
else
echo "::warning::Failed to trigger rebuild for ${SVC}: ${ERR_OUTPUT}"
# Collapse whitespace/newlines in the error so it fits on one Slack line
REASON=$(echo "$ERR_OUTPUT" | tr '\n' ' ' | sed 's/ */ /g' | sed 's/^ *//;s/ *$//')
if [ -z "$REASON" ]; then
REASON="unknown error"
fi
FAILED="${FAILED} ${SVC}"
FAILED_COUNT=$((FAILED_COUNT + 1))
printf ":x: *%s* — %s\n" "$SVC" "$REASON" >> rebuild-failures.txt
fi
done
echo "triggered_count=${COUNT}" >> "$GITHUB_OUTPUT"
echo "failed_count=${FAILED_COUNT}" >> "$GITHUB_OUTPUT"
if [ "$FAILED_COUNT" -gt 0 ]; then
# Surface failures via annotation and fail the job so the run shows
# RED in GitHub Actions UI / `gh run list`. Slack dedup is handled
# by guarding the failure() notifier with
# `steps.image_drift.outputs.has_stale != 'true'`, so this exit 1
# does NOT cause double-posting — the detailed drift-alert step
# covers the drift path.
echo "::error::Failed to trigger rebuilds for:${FAILED}"
echo "has_failures=true" >> "$GITHUB_OUTPUT"
exit 1
else
echo "has_failures=false" >> "$GITHUB_OUTPUT"
fi
- name: Alert image drift to Slack
if: always() && steps.image_drift.outputs.has_stale == 'true'
run: |
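# Compose the drift alert; `always()` keeps this step running even when the rebuild step exits 1.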
RUN_URL="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
COUNT="${{ steps.rebuild.outputs.triggered_count }}"
FAILED_COUNT="${{ steps.rebuild.outputs.failed_count }}"
if [ -z "$COUNT" ]; then
# Fallback: rebuild step didn't run or didn't set the output; count from stale_services
COUNT=$(echo "${{ steps.image_drift.outputs.stale_services }}" | wc -w | tr -d ' ')
fi
if [ -z "$FAILED_COUNT" ]; then
FAILED_COUNT=0
fi
NOUN="rebuilds"
if [ "$COUNT" = "1" ]; then NOUN="rebuild"; fi
if [ "${{ steps.rebuild.outputs.has_failures }}" = "true" ] && [ -s rebuild-failures.txt ]; then
# Failure case: list only the services that failed to rebuild, with reasons.
# COUNT reflects only successfully-triggered rebuilds; FAILED_COUNT is the rest.
{
printf ":package: *Image drift detected — %s %s triggered, %s failed:*\n" "$COUNT" "$NOUN" "$FAILED_COUNT"
cat rebuild-failures.txt
printf "<%s|Workflow run>\n" "$RUN_URL"
} > drift-message.txt
else
# Success case: just summarize the count with a link to the run
printf ":package: Image drift detected — %s %s triggered (<%s|run>)\n" "$COUNT" "$NOUN" "$RUN_URL" > drift-message.txt
fi
jq -n --rawfile text drift-message.txt '{"text": $text}' > drift-payload.json
- name: Post image drift to Slack
if: always() && steps.image_drift.outputs.has_stale == 'true'
uses: slackapi/slack-github-action@v2.1.0
with:
webhook: ${{ secrets.SLACK_WEBHOOK_OSS_ALERTS }}
webhook-type: incoming-webhook
payload-file-path: drift-payload.json
- name: Notify Slack (workflow failure)
# Suppress when the drift-alert path is active — that step already
# posts a detailed Slack message covering per-service rebuild results.
# This generic notifier still fires for non-drift failures (e.g., the
# smoke-check step itself fails before drift detection runs).
if: failure() && steps.image_drift.outputs.has_stale != 'true'
uses: slackapi/slack-github-action@v2.1.0
with:
webhook: ${{ secrets.SLACK_WEBHOOK_OSS_ALERTS }}
webhook-type: incoming-webhook
payload: |
{ "text": ":x: *Smoke monitor*: workflow failed | <https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>" }
- name: Save state to cache
if: always()
uses: actions/cache/save@v4
with:
path: smoke-state.json
key: smoke-monitor-state-${{ github.run_id }}