Skip to content

Commit 2970bd6

Browse files
authored
fix: update deployment function (#110)
* fix: update deployment function * fix: code cleanup * fix: retry for updating capacity provider service * chore: update gin * fix: removed timeout parameter
1 parent e58b683 commit 2970bd6

File tree

4 files changed

+104
-33
lines changed

4 files changed

+104
-33
lines changed

src/emd/cfn/codepipeline/template.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,6 @@ Resources:
356356
TemplateConfiguration: BuildOutput::parameters.json
357357
Capabilities: CAPABILITY_IAM,CAPABILITY_NAMED_IAM
358358
RoleArn: !GetAtt CloudFormationServiceRole.Arn
359-
TimeoutInMinutes: 45
360359
InputArtifacts:
361360
- Name: BuildOutput
362361
RunOrder: 1

src/emd/cfn/ecs/template.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,14 @@ Resources:
386386
- Type: forward
387387
TargetGroupArn: !Ref ServiceTargetGroup
388388

389+
ForceAPIRouterDeployment:
390+
Type: Custom::ForceAPIRouterDeployment
391+
DependsOn: Service
392+
Properties:
393+
ServiceToken: !Ref LambdaDeploymentHelperArn
394+
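# NOTE: despite its name, Timestamp is set to the stack name, which stays constant across stack updates, so this property alone will not re-trigger the custom resource on update (a value that changes per deployment would be needed for that).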
395+
Timestamp: !Ref "AWS::StackName"
396+
389397
Outputs:
390398
Model:
391399
Description: Model ID used to generate the response.

src/emd/cfn/shared/ecs_cluster.yaml

Lines changed: 95 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,8 @@ Resources:
140140
Action:
141141
- ecs:PutClusterCapacityProviders
142142
- ecs:DescribeClusters
143+
- ecs:UpdateService
144+
- ecs:DescribeServices
143145
- logs:CreateLogGroup
144146
- logs:CreateLogStream
145147
- logs:PutLogEvents
@@ -162,40 +164,70 @@ Resources:
162164
ecs_client = boto3.client('ecs')
163165
cluster_name = os.environ['ECS_CLUSTER_NAME']
164166
capacity_provider_name = event['ResourceProperties']['CapacityProvider']
167+
168+
def try_update_with_retry():
    """Run ``_do_update``, retrying once after 30 seconds if the cluster is busy.

    Only an error whose text contains ``UpdateInProgressException`` triggers
    the retry. Any other exception propagates unchanged, and so does a
    failure of the second attempt.

    Returns:
        Whatever ``_do_update`` returns on the attempt that succeeds.
    """
    try:
        return _do_update()
    except Exception as first_error:
        if 'UpdateInProgressException' not in str(first_error):
            # Not the busy-cluster condition this helper retries on.
            raise
        print("Cluster busy, waiting 30 seconds before retry...")
        import time
        time.sleep(30)
        # Second and final attempt; errors from it are not caught here.
        return _do_update()
180+
181+
def _do_update():
    """Attach the capacity provider to the cluster and make it the default.

    Reads the cluster's current capacity providers, appends
    ``capacity_provider_name`` when it is not already listed, then writes the
    list back together with a default strategy that points only at that
    provider (weight 1, base 0).

    Returns:
        The response of ``put_cluster_capacity_providers``.
    """
    described = ecs_client.describe_clusters(clusters=[cluster_name])
    providers = described['clusters'][0].get('capacityProviders', [])
    if capacity_provider_name not in providers:
        providers.append(capacity_provider_name)

    default_strategy = [
        {
            'capacityProvider': capacity_provider_name,
            'weight': 1,
            'base': 0,
        },
    ]
    return ecs_client.put_cluster_capacity_providers(
        cluster=cluster_name,
        capacityProviders=providers,
        defaultCapacityProviderStrategy=default_strategy,
    )
199+
165200
try:
166201
if event['RequestType'] in ['Create', 'Update']:
167-
cluster_info = ecs_client.describe_clusters(clusters=[cluster_name])['clusters'][0]
168-
current_capacity_providers = cluster_info.get('capacityProviders', [])
169-
170-
if capacity_provider_name not in current_capacity_providers:
171-
current_capacity_providers.append(capacity_provider_name)
172-
173-
ecs_client.put_cluster_capacity_providers(
174-
cluster=cluster_name,
175-
capacityProviders=current_capacity_providers,
176-
defaultCapacityProviderStrategy=[
177-
{
178-
'capacityProvider': capacity_provider_name,
179-
'weight': 1,
180-
'base': 0
181-
}
182-
]
183-
)
202+
try_update_with_retry()
184203
elif event['RequestType'] == 'Delete':
185-
# Retrieve current capacity providers
186-
cluster_info = ecs_client.describe_clusters(clusters=[cluster_name])['clusters'][0]
187-
current_capacity_providers = cluster_info.get('capacityProviders', [])
188-
189-
# Remove only the specific capacity provider
190-
updated_capacity_providers = [
191-
cp for cp in current_capacity_providers if cp != capacity_provider_name
192-
]
193-
194-
ecs_client.put_cluster_capacity_providers(
195-
cluster=cluster_name,
196-
capacityProviders=updated_capacity_providers,
197-
defaultCapacityProviderStrategy=[]
198-
)
204+
def _do_delete():
    """Detach ``capacity_provider_name`` from the cluster.

    Reads the cluster's current capacity providers, drops only the one named
    by ``capacity_provider_name``, and writes the remaining list back with an
    empty default capacity provider strategy.

    Returns:
        The response of ``put_cluster_capacity_providers``, or ``None`` when
        ``describe_clusters`` returns no entry for the cluster.
    """
    described = ecs_client.describe_clusters(clusters=[cluster_name])
    clusters = described.get('clusters') or []
    if not clusters:
        # The cluster is already gone, so there is nothing to detach.
        # Indexing [0] here would raise IndexError and make the Delete
        # request report FAILED.
        print(f"Cluster {cluster_name} not found, nothing to remove")
        return None

    current_capacity_providers = clusters[0].get('capacityProviders', [])

    # Remove only the specific capacity provider; keep every other one.
    updated_capacity_providers = [
        cp for cp in current_capacity_providers if cp != capacity_provider_name
    ]

    return ecs_client.put_cluster_capacity_providers(
        cluster=cluster_name,
        capacityProviders=updated_capacity_providers,
        defaultCapacityProviderStrategy=[]
    )
219+
220+
# Simple retry for delete operation too
221+
try:
222+
_do_delete()
223+
except Exception as e:
224+
if 'UpdateInProgressException' in str(e):
225+
print("Cluster busy during delete, waiting 30 seconds before retry...")
226+
import time
227+
time.sleep(30)
228+
_do_delete() # Try once more
229+
else:
230+
raise
199231
cfnresponse.send(event, context, cfnresponse.SUCCESS, {})
200232
except Exception as e:
201233
cfnresponse.send(event, context, cfnresponse.FAILED, {'Error': str(e)})
@@ -240,6 +272,38 @@ Resources:
240272
response_data = {'DnsName': dns_name}
241273
cfnresponse.send(event, context, cfnresponse.SUCCESS, response_data)
242274
275+
def force_api_router_deployment(event, context):
    """
    Force a new deployment of the API router ECS service.

    Looks up the service named "EMD-API-Router" in the cluster given by the
    ECS_CLUSTER_NAME environment variable and calls ``update_service`` with
    ``forceNewDeployment=True``. The outcome is reported to CloudFormation
    through ``cfnresponse``.

    Args:
        event: Custom resource request; ``RequestType`` is read from it and
            the whole event is passed on to ``cfnresponse.send``.
        context: Lambda context, passed through to ``cfnresponse.send``.

    Sends SUCCESS after the deployment is forced, or when the request is a
    Delete and the service is missing or inactive. Sends FAILED with the
    error text for any other failure.
    """
    ecs_client = boto3.client('ecs')
    cluster_name = os.environ['ECS_CLUSTER_NAME']
    service_name = "EMD-API-Router"

    try:
        # Check if the service exists
        response = ecs_client.describe_services(
            cluster=cluster_name,
            services=[service_name]
        )
        services = response['services']
        service_active = bool(services) and services[0]['status'] == 'ACTIVE'

        if not service_active:
            if event.get('RequestType') == 'Delete':
                # Nothing to redeploy during teardown. Reporting FAILED here
                # would block deletion of the custom resource.
                message = f"Service {service_name} not active, skipped deployment on delete"
                print(message)
                cfnresponse.send(event, context, cfnresponse.SUCCESS, {'Message': message})
                return
            raise Exception(f"Service {service_name} not found or not active in cluster {cluster_name}")

        # Force a new deployment
        ecs_client.update_service(
            cluster=cluster_name,
            service=service_name,
            forceNewDeployment=True
        )

        response_data = {'Message': f"Forced new deployment for {service_name}"}
        cfnresponse.send(event, context, cfnresponse.SUCCESS, response_data)
    except Exception as e:
        print(f"Error forcing deployment: {str(e)}")
        cfnresponse.send(event, context, cfnresponse.FAILED, {'Error': str(e)})
306+
243307
def handler(event, context):
244308
print(event)
245309
print(context)

src/emd/cfn/shared/openai_router/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ toolchain go1.24.2
66

77
require (
88
github.com/aws/aws-sdk-go v1.54.0
9-
github.com/gin-gonic/gin v1.8.1
9+
github.com/gin-gonic/gin v1.9.1
1010
)
1111

1212
require (

0 commit comments

Comments
 (0)