Skip to content

Commit 35e80fe

Browse files
authored
Merge branch 'staging' into feat/add-native-bridges
2 parents 9d8bd01 + ae56c9b commit 35e80fe

File tree

155 files changed

+24865
-3965
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

155 files changed

+24865
-3965
lines changed
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
# Composite action metadata: verifies ECS services after a Terraform deploy.
name: 'Post-Deploy Health Check'
description: 'Verify ECS services are healthy after Terraform deployment'

inputs:
  # --- Target selection -------------------------------------------------
  aws-region:
    description: 'AWS region where the services are deployed'
    required: true
  ecs-cluster:
    description: 'ECS cluster name'
    required: true
  services:
    # Split on commas by the stability step; surrounding whitespace is trimmed.
    description: 'Comma-separated list of ECS service names to check'
    required: true

  # --- Credentials (forwarded to the credentials step) ------------------
  aws-access-key-id:
    description: 'AWS Access Key ID'
    required: true
  aws-secret-access-key:
    description: 'AWS Secret Access Key'
    required: true

  # --- Optional tuning --------------------------------------------------
  stability-timeout:
    # Kept as a quoted string; it is exported to the shell as TIMEOUT.
    description: 'Max seconds to wait for service stability'
    required: false
    default: '300'
  health-endpoint:
    # Empty string (the default) disables the HTTP endpoint check step.
    description: 'Optional HTTP health endpoint URL to check'
    required: false
    default: ''
28+
29+
runs:
30+
using: 'composite'
31+
steps:
32+
# Export AWS credentials for the aws CLI calls made by the following steps.
# Uses the v4 major of the official action: v1 is several majors behind and
# was built for a Node.js runtime that GitHub-hosted runners have retired.
# The three inputs used here are accepted unchanged by v4.
- name: Configure AWS Credentials
  uses: aws-actions/configure-aws-credentials@v4
  with:
    aws-region: ${{ inputs.aws-region }}
    aws-access-key-id: ${{ inputs.aws-access-key-id }}
    aws-secret-access-key: ${{ inputs.aws-secret-access-key }}
38+
39+
# Checks every ECS service named in the `services` input and fails the step
# if any of them is unhealthy.
#
# Per service:
#   1. describe it (missing services are skipped, API errors fail the check)
#   2. inspect the PRIMARY deployment's rollout state
#   3. wait for stability, bounded by the `stability-timeout` input
#   4. compare running vs desired task counts
#   5. print recent events and stop reasons of recently stopped tasks
#
# Requires bash 4+ (mapfile), jq, the aws CLI and GNU `timeout` on the runner.
- name: Wait for ECS Services Stability
  shell: bash
  env:
    CLUSTER: ${{ inputs.ecs-cluster }}
    SERVICES: ${{ inputs.services }}
    REGION: ${{ inputs.aws-region }}
    TIMEOUT: ${{ inputs.stability-timeout }}
  run: |
    echo "============================================"
    echo "🔍 Post-Deploy Health Check"
    echo "============================================"
    echo "Cluster : $CLUSTER"
    echo "Region : $REGION"
    echo "Services: $SERVICES"
    echo "Timeout : ${TIMEOUT}s"
    echo "============================================"

    IFS=',' read -ra SERVICE_LIST <<< "$SERVICES"

    # Set to 1 as soon as any service is found unhealthy; evaluated after the loop
    # so that every service is reported before the step fails.
    FAILED=0

    for SERVICE in "${SERVICE_LIST[@]}"; do
      SERVICE=$(echo "$SERVICE" | xargs) # trim whitespace
      # Tolerate empty entries (e.g. a trailing comma in the list).
      if [[ -z "$SERVICE" ]]; then
        continue
      fi
      echo ""
      echo "▶ Checking service: $SERVICE"

      # ------------------------------------------------
      # Step 1: Verify the service exists
      # ------------------------------------------------
      # stderr is deliberately NOT merged into the captured output: the value is
      # parsed by jq below, and any CLI warning text would corrupt the JSON.
      # A non-zero exit here means the API call itself failed (credentials,
      # throttling, unknown cluster); an unknown service still exits 0 and is
      # handled by the MISSING branch further down.
      if ! SERVICE_INFO=$(aws ecs describe-services \
        --cluster "$CLUSTER" \
        --services "$SERVICE" \
        --region "$REGION" \
        --query 'services[0]' \
        --output json); then
        echo " ❌ Could not describe service '$SERVICE' (AWS API call failed)"
        FAILED=1
        continue
      fi

      STATUS=$(echo "$SERVICE_INFO" | jq -r '.status // "MISSING"')
      if [[ "$STATUS" == "MISSING" || "$STATUS" == "null" ]]; then
        echo " ⏭️ Service '$SERVICE' does not exist, skipping"
        continue
      elif [[ "$STATUS" != "ACTIVE" ]]; then
        echo " ❌ Service status is '$STATUS' (expected ACTIVE)"
        FAILED=1
        continue
      fi

      # ------------------------------------------------
      # Step 2: Check deployment status
      # ------------------------------------------------
      DEPLOYMENT_COUNT=$(echo "$SERVICE_INFO" | jq '.deployments | length')
      PRIMARY_STATUS=$(echo "$SERVICE_INFO" | jq -r '.deployments[] | select(.status == "PRIMARY") | .rolloutState // "UNKNOWN"')

      echo " Deployments in progress: $DEPLOYMENT_COUNT"
      echo " Primary deployment state: $PRIMARY_STATUS"

      if [[ "$PRIMARY_STATUS" == "FAILED" ]]; then
        echo " ❌ Primary deployment has FAILED status"
        FAILED_REASON=$(echo "$SERVICE_INFO" | jq -r '.deployments[] | select(.status == "PRIMARY") | .rolloutStateReason // "unknown"')
        echo " Reason: $FAILED_REASON"
        FAILED=1
        continue
      fi

      # ------------------------------------------------
      # Step 3: Wait for service stability
      # ------------------------------------------------
      # The waiter polls on its own fixed schedule and has no timeout option,
      # so the configured limit is enforced externally with `timeout`.
      echo " ⏳ Waiting for service stability (max ${TIMEOUT}s)..."
      if timeout "$TIMEOUT" aws ecs wait services-stable \
        --cluster "$CLUSTER" \
        --services "$SERVICE" \
        --region "$REGION" 2>&1; then
        echo " ✅ Service '$SERVICE' is stable"
      else
        echo " ❌ Service '$SERVICE' did not stabilize within timeout"
        FAILED=1
        continue
      fi

      # ------------------------------------------------
      # Step 4: Verify running vs desired count
      # ------------------------------------------------
      # Re-describe the service: the counts captured in step 1 predate the wait.
      REFRESHED=$(aws ecs describe-services \
        --cluster "$CLUSTER" \
        --services "$SERVICE" \
        --region "$REGION" \
        --query 'services[0]' \
        --output json)

      RUNNING=$(echo "$REFRESHED" | jq '.runningCount')
      DESIRED=$(echo "$REFRESHED" | jq '.desiredCount')
      PENDING=$(echo "$REFRESHED" | jq '.pendingCount')

      echo " Running: $RUNNING | Desired: $DESIRED | Pending: $PENDING"

      if [[ "$RUNNING" -lt "$DESIRED" ]]; then
        echo " ⚠️ Running count ($RUNNING) < Desired count ($DESIRED)"
        FAILED=1
      elif [[ "$RUNNING" -eq "$DESIRED" && "$PENDING" -eq 0 ]]; then
        echo " ✅ Task counts healthy"
      fi

      # ------------------------------------------------
      # Step 5: Check for recent task failures (last 5 events)
      # ------------------------------------------------
      # Informational only: nothing in this section changes FAILED.
      echo " 📋 Recent service events:"
      echo "$REFRESHED" | jq -r '.events[:5][] | " \(.createdAt): \(.message)"'

      STOPPED_TASKS=$(aws ecs list-tasks \
        --cluster "$CLUSTER" \
        --service-name "$SERVICE" \
        --desired-status STOPPED \
        --region "$REGION" \
        --query 'taskArns' \
        --output json)

      STOPPED_COUNT=$(echo "$STOPPED_TASKS" | jq 'length')
      if [[ "$STOPPED_COUNT" -gt 0 ]]; then
        echo " ⚠️ $STOPPED_COUNT recently stopped tasks detected"

        # Get stop reasons for the most recent stopped tasks (up to 3).
        # An array keeps each ARN a separate argument without relying on
        # unquoted word splitting.
        mapfile -t TASK_ARNS < <(echo "$STOPPED_TASKS" | jq -r '.[:3][]')
        if [[ "${#TASK_ARNS[@]}" -gt 0 ]]; then
          TASK_DETAILS=$(aws ecs describe-tasks \
            --cluster "$CLUSTER" \
            --tasks "${TASK_ARNS[@]}" \
            --region "$REGION" \
            --query 'tasks[].{taskArn:taskArn,stopCode:stopCode,stoppedReason:stoppedReason,lastStatus:lastStatus}' \
            --output json)
          echo " Recent stopped task details:"
          echo "$TASK_DETAILS" | jq -r '.[] | " Status: \(.lastStatus) | Code: \(.stopCode) | Reason: \(.stoppedReason)"'
        fi
      fi

      echo " ────────────────────────────────────"
    done

    echo ""
    echo "============================================"
    if [[ "$FAILED" -eq 1 ]]; then
      echo "❌ HEALTH CHECK FAILED — one or more services unhealthy"
      echo "============================================"
      exit 1
    else
      echo "✅ ALL SERVICES HEALTHY"
      echo "============================================"
    fi
188+
189+
# Optional HTTP probe: runs only when the `health-endpoint` input is non-empty.
# Requests the URL up to MAX_RETRIES times, RETRY_DELAY seconds apart, and
# succeeds on the first 2xx response; otherwise the step fails.
- name: HTTP Health Endpoint Check
  if: inputs.health-endpoint != ''
  shell: bash
  env:
    HEALTH_URL: ${{ inputs.health-endpoint }}
  run: |
    echo "🌐 Checking HTTP health endpoint: $HEALTH_URL"

    MAX_RETRIES=6
    RETRY_DELAY=10

    for i in $(seq 1 "$MAX_RETRIES"); do
      # curl still emits the -w output ("000") when the transfer fails, so the
      # fallback must replace the captured value rather than append to it;
      # otherwise the numeric comparison below receives two "000" tokens.
      HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$HEALTH_URL" 2>/dev/null) || HTTP_CODE="000"

      if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then
        echo " ✅ Health endpoint returned HTTP $HTTP_CODE"
        exit 0
      fi

      # Only announce a retry and sleep when another attempt will follow.
      if [[ "$i" -lt "$MAX_RETRIES" ]]; then
        echo " Attempt $i/$MAX_RETRIES: HTTP $HTTP_CODE — retrying in ${RETRY_DELAY}s..."
        sleep "$RETRY_DELAY"
      else
        echo " Attempt $i/$MAX_RETRIES: HTTP $HTTP_CODE"
      fi
    done

    echo " ❌ Health endpoint failed after $MAX_RETRIES attempts"
    exit 1

.github/workflows/ci.yml

Lines changed: 96 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ jobs:
3030
- name: Enable Corepack for Yarn 3
3131
run: corepack enable
3232

33+
# Activate the pinned Yarn release (3.3.1) through Corepack ahead of the Yarn version check that follows.
- name: Prepare Yarn version
34+
run: corepack prepare yarn@3.3.1 --activate
35+
3336
- name: Check Yarn version
3437
run: yarn --version
3538

@@ -73,6 +76,9 @@ jobs:
7376
- name: Enable Corepack for Yarn 3
7477
run: corepack enable
7578

79+
# Activate the pinned Yarn release (3.3.1) through Corepack ahead of the Yarn version check that follows.
- name: Prepare Yarn version
80+
run: corepack prepare yarn@3.3.1 --activate
81+
7682
- name: Check Yarn version
7783
run: yarn --version
7884

@@ -103,6 +109,8 @@ jobs:
103109
REGISTRY: 679752396206.dkr.ecr.${{ matrix.environment.region }}.amazonaws.com
104110
POLLER_REPOSITORY: mark-poller
105111
POLLER_IMAGE_TAG: mark-poller-${{ github.sha }}
112+
HANDLER_REPOSITORY: mark-handler
113+
HANDLER_IMAGE_TAG: mark-handler-${{ github.sha }}
106114
ADMIN_REPOSITORY: mark-admin
107115
ADMIN_IMAGE_TAG: mark-admin-${{ github.sha }}
108116
permissions:
@@ -126,16 +134,35 @@ jobs:
126134
with:
127135
mask-password: 'true'
128136

137+
# Idempotent bootstrap of the ECR repositories the image pushes below rely on.
- name: Ensure ECR repositories exist
  run: |
    # For each repository: look it up first and create it only when the lookup
    # fails. Order matches the original: admin, handler, poller.
    for repo in $ADMIN_REPOSITORY $HANDLER_REPOSITORY $POLLER_REPOSITORY; do
      aws ecr describe-repositories --repository-names $repo --region $AWS_REGION || \
        aws ecr create-repository --repository-name $repo --region $AWS_REGION --image-scanning-configuration scanOnPush=true --image-tag-mutability MUTABLE
    done
145+
129146
- name: Build and push Admin Docker image
130147
run: |
131-
docker build -f docker/admin/Dockerfile -t $REGISTRY/$ADMIN_REPOSITORY:$ADMIN_IMAGE_TAG .
148+
docker build --provenance=false --sbom=false -f docker/admin/Dockerfile -t $REGISTRY/$ADMIN_REPOSITORY:$ADMIN_IMAGE_TAG .
132149
docker push $REGISTRY/$ADMIN_REPOSITORY:$ADMIN_IMAGE_TAG
133150
134151
- name: Build and push Poller Docker image
135152
run: |
136-
docker build -f docker/poller/Dockerfile -t $REGISTRY/$POLLER_REPOSITORY:$POLLER_IMAGE_TAG .
153+
docker build --provenance=false --sbom=false -f docker/poller/Dockerfile -t $REGISTRY/$POLLER_REPOSITORY:$POLLER_IMAGE_TAG .
137154
docker push $REGISTRY/$POLLER_REPOSITORY:$POLLER_IMAGE_TAG
138155
156+
# Build the invoice handler image and push it to ECR under the commit-specific tag.
- name: Build and push Invoice Handler Docker image
  run: |
    # Provenance/SBOM attestations are disabled to match the Admin and Poller
    # builds in this job, so all three images are pushed in the same form.
    docker build --provenance=false --sbom=false -f docker/handler/Dockerfile -t $REGISTRY/$HANDLER_REPOSITORY:$HANDLER_IMAGE_TAG .
    docker push $REGISTRY/$HANDLER_REPOSITORY:$HANDLER_IMAGE_TAG
160+
161+
# Remove the main poller Lambda before deploying the invoice handler (prevents duplicate intent creation).
# The script is invoked with: the matrix environment name, "mainnet", "prod", and the value of $AWS_REGION.
# NOTE(review): the staging job labels its equivalent step as temporary (to be dropped once the
# poller migration is complete) — confirm whether the same applies here.
162+
- name: Remove Main Poller Lambda Function
163+
run: |
164+
bash ops/scripts/remove-poller-lambda.sh ${{ matrix.environment.name }} mainnet prod $AWS_REGION
165+
139166
- name: Use Node.js
140167
uses: actions/setup-node@v4
141168
with:
@@ -178,6 +205,7 @@ jobs:
178205
AWS_PROFILE: aws-deployer-connext
179206
run: |
180207
terraform apply \
208+
-var "handler_image_uri=${REGISTRY}/${HANDLER_REPOSITORY}:${HANDLER_IMAGE_TAG}" \
181209
-var "image_uri=${REGISTRY}/${POLLER_REPOSITORY}:${POLLER_IMAGE_TAG}" \
182210
-var "admin_image_uri=${REGISTRY}/${ADMIN_REPOSITORY}:${ADMIN_IMAGE_TAG}" \
183211
-auto-approve > /dev/null 2>&1
@@ -189,6 +217,23 @@ jobs:
189217
echo "Admin API Endpoint URL for ${{ matrix.environment.name }}:"
190218
terraform output -raw admin_api_endpoint
191219
220+
# Print a label followed by the `invoice_handler_url` Terraform output, run from the
# matrix environment's Terraform directory. Runs only when the preceding steps succeeded.
- name: Show Invoice Handler URL
221+
if: success()
222+
working-directory: ${{ matrix.environment.terraform_dir }}
223+
run: |
224+
echo "Invoice Handler URL for ${{ matrix.environment.name }}:"
225+
terraform output -raw invoice_handler_url
226+
227+
# Run the repository-local health-check composite action against this environment's
# "<name>-ecs-mainnet-prod" cluster and the five listed services.
# `stability-timeout` and `health-endpoint` are not passed, so the action's defaults apply.
- name: Post-Deploy Health Check
228+
if: success()
229+
uses: ./.github/actions/post-deploy-health-check
230+
with:
231+
aws-region: ${{ matrix.environment.region }}
232+
ecs-cluster: ${{ matrix.environment.name }}-ecs-mainnet-prod
233+
services: "${{ matrix.environment.name }}-web3signer-mainnet-prod,${{ matrix.environment.name }}-fillservice-web3signer-mainnet-prod,${{ matrix.environment.name }}-handler-mainnet-prod,${{ matrix.environment.name }}-prometheus-mainnet-prod,${{ matrix.environment.name }}-pushgateway-mainnet-prod"
234+
aws-access-key-id: ${{ secrets.DEPLOYER_AWS_ACCESS_KEY_ID }}
235+
aws-secret-access-key: ${{ secrets.DEPLOYER_AWS_SECRET_ACCESS_KEY }}
236+
192237
# Staging deployment (mason) - triggered on staging branch
193238
build-and-deploy-staging:
194239
if: github.ref == 'refs/heads/staging'
@@ -198,6 +243,8 @@ jobs:
198243
REGISTRY: 679752396206.dkr.ecr.sa-east-1.amazonaws.com
199244
POLLER_REPOSITORY: mark-poller
200245
POLLER_IMAGE_TAG: mark-poller-${{ github.sha }}
246+
HANDLER_REPOSITORY: mark-handler
247+
HANDLER_IMAGE_TAG: mark-handler-${{ github.sha }}
201248
ADMIN_REPOSITORY: mark-admin
202249
ADMIN_IMAGE_TAG: mark-admin-${{ github.sha }}
203250
permissions:
@@ -221,16 +268,43 @@ jobs:
221268
with:
222269
mask-password: 'true'
223270

271+
# Idempotent bootstrap of the ECR repositories the image pushes below rely on.
- name: Ensure ECR repositories exist
  run: |
    # Create repositories if they don't exist: look each one up first and
    # create it only when the lookup fails. Order: admin, handler, poller.
    for repo in $ADMIN_REPOSITORY $HANDLER_REPOSITORY $POLLER_REPOSITORY; do
      aws ecr describe-repositories --repository-names $repo --region $AWS_REGION || \
        aws ecr create-repository --repository-name $repo --region $AWS_REGION --image-scanning-configuration scanOnPush=true --image-tag-mutability MUTABLE
    done
280+
224281
- name: Build and push Admin Docker image
225282
run: |
226-
docker build -f docker/admin/Dockerfile -t $REGISTRY/$ADMIN_REPOSITORY:$ADMIN_IMAGE_TAG .
283+
docker build --provenance=false --sbom=false -f docker/admin/Dockerfile -t $REGISTRY/$ADMIN_REPOSITORY:$ADMIN_IMAGE_TAG .
227284
docker push $REGISTRY/$ADMIN_REPOSITORY:$ADMIN_IMAGE_TAG
228285
286+
# Build the invoice handler image and push it to ECR under the commit-specific tag.
- name: Build and push Invoice Handler Docker image
  run: |
    # Provenance/SBOM attestations are disabled to match the Admin and Poller
    # builds in this job, so all three images are pushed in the same form.
    docker build --provenance=false --sbom=false -f docker/handler/Dockerfile -t $REGISTRY/$HANDLER_REPOSITORY:$HANDLER_IMAGE_TAG .
    docker push $REGISTRY/$HANDLER_REPOSITORY:$HANDLER_IMAGE_TAG
290+
229291
- name: Build and push Poller Docker image
230292
run: |
231-
docker build -f docker/poller/Dockerfile -t $REGISTRY/$POLLER_REPOSITORY:$POLLER_IMAGE_TAG .
293+
docker build --provenance=false --sbom=false -f docker/poller/Dockerfile -t $REGISTRY/$POLLER_REPOSITORY:$POLLER_IMAGE_TAG .
232294
docker push $REGISTRY/$POLLER_REPOSITORY:$POLLER_IMAGE_TAG
233295
296+
# ============================================================================
297+
# POLLER REMOVAL - TEMPORARY
298+
# ============================================================================
299+
# Remove only the main poller Lambda function (mark_poller) before deploying the invoice
300+
# handler to prevent duplicate intent creation. Other poller Lambdas remain active.
301+
#
302+
# TODO: Remove this step once poller migration is complete
# The script is invoked with: "mason", "mainnet", "staging", and the value of $AWS_REGION.
303+
# ============================================================================
304+
- name: Remove Main Poller Lambda Function
305+
run: |
306+
bash ops/scripts/remove-poller-lambda.sh mason mainnet staging $AWS_REGION
307+
234308
- name: Use Node.js
235309
uses: actions/setup-node@v4
236310
with:
@@ -273,6 +347,7 @@ jobs:
273347
AWS_PROFILE: aws-deployer-connext
274348
run: |
275349
terraform apply \
350+
-var "handler_image_uri=${REGISTRY}/${HANDLER_REPOSITORY}:${HANDLER_IMAGE_TAG}" \
276351
-var "image_uri=${REGISTRY}/${POLLER_REPOSITORY}:${POLLER_IMAGE_TAG}" \
277352
-var "admin_image_uri=${REGISTRY}/${ADMIN_REPOSITORY}:${ADMIN_IMAGE_TAG}" \
278353
-auto-approve > /dev/null 2>&1
@@ -283,3 +358,20 @@ jobs:
283358
run: |
284359
echo "Admin API Endpoint URL for mason (staging):"
285360
terraform output -raw admin_api_endpoint
361+
362+
# Print a label followed by the `invoice_handler_url` Terraform output, run from
# ./ops/mainnet/mason. Runs only when the preceding steps succeeded.
- name: Show Invoice Handler URL
363+
if: success()
364+
working-directory: ./ops/mainnet/mason
365+
run: |
366+
echo "Invoice Handler URL for mason (staging):"
367+
terraform output -raw invoice_handler_url
368+
369+
# Run the repository-local health-check composite action against the
# mason-ecs-mainnet-staging cluster in sa-east-1 and the five listed services.
# `stability-timeout` and `health-endpoint` are not passed, so the action's defaults apply.
- name: Post-Deploy Health Check
370+
if: success()
371+
uses: ./.github/actions/post-deploy-health-check
372+
with:
373+
aws-region: sa-east-1
374+
ecs-cluster: mason-ecs-mainnet-staging
375+
services: "mason-web3signer-mainnet-staging,mason-fillservice-web3signer-mainnet-staging,mason-handler-mainnet-staging,mason-prometheus-mainnet-staging,mason-pushgateway-mainnet-staging"
376+
aws-access-key-id: ${{ secrets.DEPLOYER_AWS_ACCESS_KEY_ID }}
377+
aws-secret-access-key: ${{ secrets.DEPLOYER_AWS_SECRET_ACCESS_KEY }}

0 commit comments

Comments
 (0)