Skip to content
This repository was archived by the owner on Oct 23, 2025. It is now read-only.

Commit f56e9f9

Browse files
authored
Add support for rolling update of stateful Elastigroups (#572)
1 parent f673f7d commit f56e9f9

File tree

5 files changed

+345
-12
lines changed

5 files changed

+345
-12
lines changed

senza/cli.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1933,11 +1933,16 @@ def patch_spotinst_elastigroup(properties, elastigroup_id, region, stack_name):
19331933
metavar="PERCENTAGE",
19341934
type=click.IntRange(0, 100, clamp=True),
19351935
help="Percentage (int value) of the ElastiGroup cluster that is respawned in each step."
1936-
" Valid only for ElastiGroups. The default value for this of 20.",
1936+
" Valid only for stateless ElastiGroups. The default value for this of 20.",
1937+
)
1938+
@click.option(
1939+
"--batch-per-subnet",
1940+
is_flag=True,
1941+
help="Recycle ElastiGroup instances in batches per subnet. Valid only for stateful ElastiGroups.",
19371942
)
19381943
@region_option
19391944
@stacktrace_visible_option
1940-
def respawn_instances(stack_ref, inplace, force, batch_size_percentage, region):
1945+
def respawn_instances(stack_ref, inplace, force, batch_size_percentage, batch_per_subnet, region):
19411946
"""Replace all EC2 instances in Auto Scaling Group(s)
19421947
19431948
Performs a rolling update to prevent downtimes."""
@@ -1954,7 +1959,7 @@ def respawn_instances(stack_ref, inplace, force, batch_size_percentage, region):
19541959
)
19551960
elif group["type"] == ELASTIGROUP_RESOURCE_TYPE:
19561961
respawn.respawn_elastigroup(
1957-
group["resource_id"], group["stack_name"], region, batch_size_percentage
1962+
group["resource_id"], group["stack_name"], region, batch_size_percentage, batch_per_subnet
19581963
)
19591964

19601965

senza/respawn.py

Lines changed: 104 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
Functions to scale and respawn Auto Scale Groups
33
"""
4+
import collections
45
import time
56

67
from clickclick import Action, info
@@ -14,6 +15,7 @@
1415
ELASTIGROUP_TERMINATED_DEPLOY_STATUS = ["stopped", "failed"]
1516

1617
DEFAULT_BATCH_SIZE = 20
18+
# Seconds passed to time.sleep() while respawning stateful ElastiGroups:
# between status polls and before each recycle request in per-subnet mode.
WAIT_FOR_ELASTIGROUP_SEC = 10
1719

1820

1921
def get_auto_scaling_group(asg, asg_name: str):
@@ -177,19 +179,117 @@ def respawn_auto_scaling_group(
177179

178180

179181
def respawn_elastigroup(
    elastigroup_id: str, stack_name: str, region: str, batch_size: int, batch_per_subnet: bool
):
    """
    Respawn every instance of the ElastiGroup.

    The group is handled as stateful when the stateful-instance listing for it
    is non-empty, and as stateless otherwise. ``batch_size`` is rejected for
    stateful groups and ``batch_per_subnet`` is rejected for stateless ones;
    both rejections raise ``Exception``.
    """
    account = elastigroup_api.get_spotinst_account_data(region, stack_name)
    instances = elastigroup_api.get_stateful_instances(elastigroup_id, account)

    # Stateless path: nothing was reported by the stateful-instance listing.
    if not instances:
        if batch_per_subnet:
            raise Exception("Batch per subnet is not supported when respawning stateless ElastiGroups")
        respawn_stateless_elastigroup(elastigroup_id, stack_name, batch_size, account)
        return

    # Stateful path: instances are recycled individually, so a batch size has no meaning.
    if batch_size is not None:
        raise Exception("Batch size is not supported when respawning stateful ElastiGroups")
    respawn_stateful_elastigroup(
        elastigroup_id, stack_name, region, batch_per_subnet, instances, account
    )
201+
202+
203+
def respawn_stateful_elastigroup(
    elastigroup_id: str,
    stack_name: str,
    region: str,
    batch_per_subnet: bool,
    stateful_instances: list,
    spotinst_account,
):
    """
    Recycle the stateful instances of the ElastiGroup.

    Raises ``Exception`` up front when the readiness check on
    ``stateful_instances`` fails. Without ``batch_per_subnet`` the instances
    are recycled one at a time, waiting for the group to be ready after each
    one. With ``batch_per_subnet`` all instances of a subnet are recycled
    (pausing before each request) and the group is waited for once per subnet.
    Instances are processed in ``privateIp`` order, subnets in label order.
    """

    def private_ip(instance):
        return instance['privateIp']

    if not stateful_elastigroup_ready(stateful_instances):
        not_ready = "Stateful ElastiGroup {} is not ready: some instances are not in the ACTIVE state"
        raise Exception(not_ready.format(elastigroup_id))

    info(
        "Recycling {} stateful instances for ElastiGroup {} (ID {})".format(
            len(stateful_instances), stack_name, elastigroup_id
        )
    )

    if not batch_per_subnet:
        for instance in sorted(stateful_instances, key=private_ip):
            recycle_stateful_elastigroup_instance(elastigroup_id, instance, spotinst_account)
            wait_for_stateful_elastigroup(elastigroup_id, spotinst_account)
        return

    grouped = stateful_elastigroup_instances_by_subnet(region, stateful_instances)
    for subnet in sorted(grouped):
        info("Recycling ALL stateful instances in subnet: {}".format(subnet))

        for instance in sorted(grouped[subnet], key=private_ip):
            time.sleep(WAIT_FOR_ELASTIGROUP_SEC)
            recycle_stateful_elastigroup_instance(elastigroup_id, instance, spotinst_account)

        wait_for_stateful_elastigroup(elastigroup_id, spotinst_account)
236+
237+
238+
def stateful_elastigroup_instances_by_subnet(region: str, stateful_instances: list):
    """
    Group stateful instances by the subnet of their EC2 instance.

    Each entry of ``stateful_instances`` is a dict with an ``instanceId`` key
    holding the EC2 instance ID. The EC2 instances are described in ``region``
    and every stateful instance is filed under a label of the form
    ``"<availability zone> | <subnet id>"``.

    Returns a ``collections.defaultdict(list)`` mapping each label to the list
    of stateful-instance dicts in that subnet; it is empty when
    ``stateful_instances`` is empty.
    """
    instances_by_subnet = collections.defaultdict(list)
    if not stateful_instances:
        # Without any instance IDs the describe call is not restricted to this
        # group: it would return unrelated instances and the lookup below
        # would fail with KeyError. Nothing to group, so skip the call.
        return instances_by_subnet

    instances_by_ec2_id = {i['instanceId']: i for i in stateful_instances}

    ec2 = BotoClientProxy("ec2", region)
    ec2_instances = ec2.describe_instances(InstanceIds=list(instances_by_ec2_id))
    for reservation in ec2_instances['Reservations']:
        for ec2_instance in reservation['Instances']:
            subnet = "{} | {}".format(
                ec2_instance['Placement']['AvailabilityZone'], ec2_instance['SubnetId']
            )
            instance = instances_by_ec2_id[ec2_instance['InstanceId']]
            instances_by_subnet[subnet].append(instance)

    return instances_by_subnet
253+
254+
255+
def stateful_elastigroup_ready(stateful_instances: list):
    """
    Tell whether every given stateful instance is in the ACTIVE state.

    Returns ``True`` when each item's ``state`` equals
    ``elastigroup_api.STATEFUL_STATE_ACTIVE`` (also for an empty list) and
    ``False`` as soon as one item differs.
    """
    for instance in stateful_instances:
        if instance['state'] != elastigroup_api.STATEFUL_STATE_ACTIVE:
            return False
    return True
257+
258+
259+
def wait_for_stateful_elastigroup(elastigroup_id: str, spotinst_account):
    """
    Block until every stateful instance of the ElastiGroup is ACTIVE.

    Sleeps ``WAIT_FOR_ELASTIGROUP_SEC`` seconds, reports progress, re-reads
    the stateful instances and repeats until the readiness check passes.
    There is no upper bound on the number of iterations.
    """
    with Action("Waiting for all stateful instances to be in the ACTIVE state") as act:
        all_active = False
        while not all_active:
            time.sleep(WAIT_FOR_ELASTIGROUP_SEC)
            act.progress()
            current = elastigroup_api.get_stateful_instances(elastigroup_id, spotinst_account)
            all_active = stateful_elastigroup_ready(current)
270+
271+
272+
def recycle_stateful_elastigroup_instance(elastigroup_id: str, instance: dict, spotinst_account):
    """
    Announce and trigger the recycling of one stateful instance.

    ``instance`` must provide the keys ``id`` (stateful instance ID),
    ``instanceId`` and ``privateIp``; the latter two are only used in the
    log message. The status returned by the API call is not inspected.
    """
    stateful_id = instance['id']
    announcement = "Recycling stateful instance {} ({} | {})".format(
        stateful_id, instance['instanceId'], instance['privateIp']
    )
    info(announcement)
    elastigroup_api.recycle_stateful_instance(elastigroup_id, stateful_id, spotinst_account)
279+
280+
281+
def respawn_stateless_elastigroup(
282+
elastigroup_id: str, stack_name: str, batch_size: int, spotinst_account
283+
):
284+
"""
285+
Start a deployment of the ElastiGroup and wait for it to complete.
286+
"""
287+
186288
if batch_size is None or batch_size < 1:
187289
batch_size = DEFAULT_BATCH_SIZE
188290

189-
spotinst_account = elastigroup_api.get_spotinst_account_data(region, stack_name)
190-
191291
info(
192-
"Redeploying the cluster for ElastiGroup {} (ID {})".format(
292+
"Redeploying instances for ElastiGroup {} (ID {})".format(
193293
stack_name, elastigroup_id
194294
)
195295
)

senza/spotinst/components/elastigroup_api.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
DEPLOY_STRATEGY_REPLACE = 'REPLACE_SERVER'
1515
DEFAULT_CONNECT_TIMEOUT = 9
1616
DEFAULT_READ_TIMEOUT = 30
17+
# Value of a stateful instance's 'state' field that marks it as active.
STATEFUL_STATE_ACTIVE = 'ACTIVE'
1718

1819

1920
class SpotInstAccountData:
@@ -112,6 +113,56 @@ def get_elastigroup(elastigroup_id, spotinst_account_data):
112113
return groups
113114

114115

116+
def get_stateful_instances(elastigroup_id, spotinst_account_data):
    '''
    Fetch the descriptions of an ElastiGroup's stateful instances.

    Returns the ``items`` list found under ``response`` in the JSON reply,
    or an empty list when either key is absent.
    HTTP errors are raised as exceptions.

    https://docs.spot.io/spotinst-api/elastigroup/amazon-web-services/stateful-api/list-stateful-instances/
    '''
    auth_headers = {
        "Authorization": "Bearer {}".format(spotinst_account_data.access_token),
        "Content-Type": "application/json"
    }
    url = '{}/aws/ec2/group/{}/statefulInstance?accountId={}'.format(
        SPOTINST_API_URL, elastigroup_id, spotinst_account_data.account_id
    )

    reply = requests.get(url, headers=auth_headers, timeout=(DEFAULT_CONNECT_TIMEOUT, DEFAULT_READ_TIMEOUT))
    reply.raise_for_status()

    envelope = reply.json().get("response", {})
    return envelope.get("items", [])
138+
139+
140+
def recycle_stateful_instance(elastigroup_id, stateful_instance_id, spotinst_account_data):
    '''
    Ask the API to recycle one stateful instance of an ElastiGroup.

    Returns the ``status`` object found under ``response`` in the JSON reply,
    or an empty dict when either key is absent.
    HTTP errors are raised as exceptions.

    https://docs.spot.io/spotinst-api/elastigroup/amazon-web-services/stateful-api/recycle-stateful-instance/
    '''
    auth_headers = {
        "Authorization": "Bearer {}".format(spotinst_account_data.access_token),
        "Content-Type": "application/json"
    }
    url = '{}/aws/ec2/group/{}/statefulInstance/{}/recycle?accountId={}'.format(
        SPOTINST_API_URL, elastigroup_id, stateful_instance_id, spotinst_account_data.account_id
    )

    reply = requests.put(url, headers=auth_headers, timeout=(DEFAULT_CONNECT_TIMEOUT, DEFAULT_READ_TIMEOUT))
    reply.raise_for_status()

    envelope = reply.json().get("response", {})
    return envelope.get("status", {})
164+
165+
115166
def patch_elastigroup(properties, elastigroup_id, spotinst_account_data):
116167
'''
117168
Patch specific properties of the ElastiGroup.

tests/test_elastigroup_api.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
from pytest import raises
55

66
from senza.components.elastigroup import ELASTIGROUP_RESOURCE_TYPE
7-
from senza.spotinst.components.elastigroup_api import update_capacity, get_elastigroup, patch_elastigroup, deploy, \
8-
deploy_status, SPOTINST_API_URL, SpotInstAccountData, get_spotinst_account_data
7+
from senza.spotinst.components.elastigroup_api import update_capacity, get_elastigroup, get_stateful_instances, \
8+
recycle_stateful_instance, patch_elastigroup, deploy, deploy_status, SPOTINST_API_URL, SpotInstAccountData, \
9+
get_spotinst_account_data
910

1011

1112
def test_update_capacity():
@@ -65,6 +66,57 @@ def test_get_elastigroup():
6566
assert group['name'] == 'my-app-1'
6667

6768

69+
def test_get_stateful_instances():
    """get_stateful_instances returns the items of the mocked listing, in order."""
    api_payload = {
        'response': {
            'items': [
                {'id': 'ssi-abc123', 'instanceId': 'i-321cba'},
                {'id': 'ssi-def456', 'instanceId': 'i-456def'},
            ]
        }
    }

    group_id = 'sig-xfy'
    account = SpotInstAccountData('act-zwk', 'fake-token')
    listing_url = '{}/aws/ec2/group/{}/statefulInstance?accountId={}'.format(
        SPOTINST_API_URL, group_id, account.account_id
    )

    with responses.RequestsMock() as mocked:
        mocked.add(mocked.GET, listing_url, status=200, json=api_payload)

        found = get_stateful_instances(group_id, account)
        assert found[0]['id'] == 'ssi-abc123'
        assert found[1]['id'] == 'ssi-def456'
95+
96+
97+
def test_recycle_stateful_instance():
    """recycle_stateful_instance returns the status object of the mocked reply."""
    api_payload = {'response': {'status': {'code': 200}}}

    group_id = 'sig-xfy'
    instance_id = 'ssi-abcdef1'
    account = SpotInstAccountData('act-zwk', 'fake-token')
    recycle_url = '{}/aws/ec2/group/{}/statefulInstance/{}/recycle?accountId={}'.format(
        SPOTINST_API_URL, group_id, instance_id, account.account_id
    )

    with responses.RequestsMock() as mocked:
        mocked.add(mocked.PUT, recycle_url, status=200, json=api_payload)

        status = recycle_stateful_instance(group_id, instance_id, account)
        assert status['code'] == 200
118+
119+
68120
def test_patch_elastigroup():
69121
patch = {
70122
'ImageId': 'image-foo',

0 commit comments

Comments
 (0)