11"""This is called to run a trial by worker nodes (local / remote)."""
22
33
4+ def assign_delete_permissions (
5+ aci_client ,
6+ auth_client ,
7+ max_undead_containers ,
8+ credential ,
9+ subscription_id ,
10+ client_id ,
11+ resource_group_name ,
12+ container_groups ,
13+ ):
14+ from heapq import heappush , heappop
15+ from datetime import datetime
16+ import time
17+ import uuid
18+ from azure .mgmt .containerinstance import ContainerInstanceManagementClient
19+ from azure .mgmt .authorization import AuthorizationManagementClient
20+ from azure .mgmt .authorization .models import RoleAssignmentCreateParameters
21+ from azure .core .exceptions import HttpResponseError
22+
23+ # Contributor Role
24+ role_definition_id = f"/subscriptions/{ subscription_id } /providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
25+
26+ while max_undead_containers < len (container_groups ):
27+ _ , container_group_name , started = heappop (container_groups )
28+ try :
29+ if started is not None :
30+ if not started .done ():
31+ heappush (
32+ container_groups ,
33+ (datetime .now (), container_group_name , started ),
34+ )
35+ time .sleep (1 )
36+ continue
37+ started = None
38+
39+ if aci_client is None :
40+ aci_client = ContainerInstanceManagementClient (
41+ credential , subscription_id
42+ )
43+
44+ container_group = aci_client .container_groups .get (
45+ resource_group_name , container_group_name
46+ )
47+
48+ role_assignment_params1 = RoleAssignmentCreateParameters (
49+ role_definition_id = role_definition_id ,
50+ principal_id = container_group .identity .principal_id ,
51+ principal_type = "ServicePrincipal" ,
52+ )
53+ role_assignment_params2 = RoleAssignmentCreateParameters (
54+ role_definition_id = role_definition_id ,
55+ principal_id = client_id ,
56+ principal_type = "User" ,
57+ )
58+ scope = f"/subscriptions/{ subscription_id } /resourceGroups/{ resource_group_name } /providers/Microsoft.ContainerInstance/containerGroups/{ container_group_name } "
59+
60+ if auth_client is None :
61+ auth_client = AuthorizationManagementClient (credential , subscription_id )
62+
63+ auth_client .role_assignments .create (
64+ scope , str (uuid .uuid4 ()), role_assignment_params1
65+ )
66+ auth_client .role_assignments .create (
67+ scope , str (uuid .uuid4 ()), role_assignment_params2
68+ )
69+ except HttpResponseError :
70+ aci_client = None
71+ auth_client = None
72+ heappush (container_groups , (datetime .now (), container_group_name , started ))
73+ time .sleep (1 )
74+
75+ return aci_client , auth_client
76+
77+
478def run_azure_process (
579 experiment_id ,
680 n_runners ,
@@ -16,46 +90,33 @@ def run_azure_process(
1690):
1791 startup_script = """
1892 self_delete() {
19- echo "Attempt to self-delete this container group. Exit code was $1."
93+ echo "Attempt to self-delete this container group. Exit code was: $1"
94+
95+ if [ $1 -ne 0 ]; then
96+ echo "Waiting 10 mintues to allow inspection of the logs..."
97+ sleep 600
98+ fi
99+
100+ SUBSCRIPTION_ID=${SUBSCRIPTION_ID}
101+ RESOURCE_GROUP_NAME=${RESOURCE_GROUP_NAME}
102+ CONTAINER_GROUP_NAME=${CONTAINER_GROUP_NAME}
20103
21104 retry_count=0
22105 while true; do
23106 echo "Downloading azure tools."
24107
25108 curl -sL https://aka.ms/InstallAzureCLIDeb -o install_script.sh
26109 exit_code=$?
27- if [ $exit_code -eq 0 ]; then
28- break
110+ if [ $exit_code -ne 0 ]; then
111+ echo "curl failed with exit code $exit_code."
29112 fi
30113
31- echo "curl failed with exit code $exit_code."
32- if [ $retry_count -ge 300 ]; then
33- echo "Maximum number of retries reached. Command failed."
34- exit 62
114+ bash install_script.sh
115+ exit_code=$?
116+ if [ $exit_code -ne 0 ]; then
117+ echo "Failed to install azure tools with exit code $exit_code. Attempting to delete anyway."
35118 fi
36- retry_count=$((retry_count + 1))
37- echo "Sleeping."
38- sleep 300
39- echo "Retrying."
40- done
41-
42- bash install_script.sh
43- exit_code=$?
44- if [ $exit_code -ne 0 ]; then
45- echo "Failed to install azure tools with exit code $exit_code. Attempting to delete anyway."
46- fi
47-
48- SUBSCRIPTION_ID=${SUBSCRIPTION_ID}
49- RESOURCE_GROUP_NAME=${RESOURCE_GROUP_NAME}
50- CONTAINER_GROUP_NAME=${CONTAINER_GROUP_NAME}
51119
52- if [ $1 -ne 0 ]; then
53- echo "Waiting 10 mintues to allow inspection of the logs..."
54- sleep 600
55- fi
56-
57- retry_count=0
58- while true; do
59120 echo "Logging into azure to delete this container."
60121 az login --identity
61122 exit_code=$?
@@ -78,8 +139,10 @@ def run_azure_process(
78139 break
79140 fi
80141 retry_count=$((retry_count + 1))
81- done
82142
143+ echo "Retrying."
144+ done
145+
83146 exit $1 # failed to self-kill the container we are running this on.
84147 }
85148
@@ -298,11 +361,10 @@ def run_azure_process(
298361 import time
299362 import uuid
300363 from multiprocessing .pool import MaybeEncodingError
301-
364+ from heapq import heappush
365+ from datetime import datetime
302366 from azure .core .exceptions import HttpResponseError
303367 from azure .identity import ClientSecretCredential
304- from azure .mgmt .authorization import AuthorizationManagementClient
305- from azure .mgmt .authorization .models import RoleAssignmentCreateParameters
306368 from azure .mgmt .containerinstance import ContainerInstanceManagementClient
307369 from azure .mgmt .containerinstance .models import (
308370 Container ,
@@ -315,6 +377,8 @@ def run_azure_process(
315377 )
316378 from azure .mgmt .resource import ResourceManagementClient
317379
380+ max_undead_containers = 5
381+
318382 client_id = azure_json ["client_id" ]
319383
320384 if credential is None :
@@ -327,11 +391,15 @@ def run_azure_process(
327391 resource_group_name = azure_json ["resource_group" ]
328392 subscription_id = azure_json ["subscription_id" ]
329393
330- aci_client = ContainerInstanceManagementClient (credential , subscription_id )
394+ aci_client = None
395+ auth_client = None
396+ container_groups = []
331397 res_client = ResourceManagementClient (credential , subscription_id )
332398
333399 # If this first call fails, then allow the Exception to propagate.
334- resource_group = res_client .resource_groups .get (resource_group_name )
400+ resource_group_location = res_client .resource_groups .get (
401+ resource_group_name
402+ ).location
335403
336404 container_resource_requests = ResourceRequests (
337405 cpu = num_cores ,
@@ -342,7 +410,6 @@ def run_azure_process(
342410 )
343411
344412 container_group_names = set ()
345- starts = []
346413 for runner_id in range (n_runners ):
347414 container_group_name = f"powerlift-container-group-{ batch_id } -{ runner_id :04} "
348415
@@ -366,85 +433,52 @@ def run_azure_process(
366433 environment_variables = env_vars ,
367434 )
368435 container_group = ContainerGroup (
369- location = resource_group . location ,
436+ location = resource_group_location ,
370437 containers = [container ],
371438 os_type = OperatingSystemTypes .linux ,
372439 restart_policy = ContainerGroupRestartPolicy .never ,
373440 identity = {"type" : "SystemAssigned" },
374441 )
375442
443+ if aci_client is None :
444+ aci_client = ContainerInstanceManagementClient (credential , subscription_id )
445+
376446 while True :
377447 try :
378448 # begin_create_or_update returns LROPoller,
379449 # but this is only indicates when the containter is started
380450 started = aci_client .container_groups .begin_create_or_update (
381- resource_group . name , container_group_name , container_group
451+ resource_group_name , container_group_name , container_group
382452 )
383453 break
384454 except HttpResponseError :
385455 time .sleep (1 )
386456
387- starts .append (started )
388-
389457 container_group_names .add (container_group_name )
458+ heappush (container_groups , (datetime .now (), container_group_name , started ))
459+ aci_client , auth_client = assign_delete_permissions (
460+ aci_client ,
461+ auth_client ,
462+ max_undead_containers ,
463+ credential ,
464+ subscription_id ,
465+ client_id ,
466+ resource_group_name ,
467+ container_groups ,
468+ )
390469
391- # make sure they have all started before exiting the process
392- for started in starts :
393- while True :
394- try :
395- while not started .done ():
396- time .sleep (1 )
397- break
398- except HttpResponseError :
399- time .sleep (1 )
470+ assign_delete_permissions (
471+ aci_client ,
472+ auth_client ,
473+ 0 ,
474+ credential ,
475+ subscription_id ,
476+ client_id ,
477+ resource_group_name ,
478+ container_groups ,
479+ )
400480
401481 if delete_group_container_on_complete :
402- auth_client = AuthorizationManagementClient (credential , subscription_id )
403-
404- # Contributor Role
405- role_definition_id = f"/subscriptions/{ subscription_id } /providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
406-
407- for container_group_name in container_group_names :
408- while True :
409- try :
410- container_group = aci_client .container_groups .get (
411- resource_group_name , container_group_name
412- )
413- break
414- except HttpResponseError :
415- time .sleep (1 )
416-
417- scope = f"/subscriptions/{ subscription_id } /resourceGroups/{ resource_group_name } /providers/Microsoft.ContainerInstance/containerGroups/{ container_group_name } "
418- role_assignment_params = RoleAssignmentCreateParameters (
419- role_definition_id = role_definition_id ,
420- principal_id = container_group .identity .principal_id ,
421- principal_type = "ServicePrincipal" ,
422- )
423-
424- while True :
425- try :
426- auth_client .role_assignments .create (
427- scope , str (uuid .uuid4 ()), role_assignment_params
428- )
429- break
430- except HttpResponseError :
431- time .sleep (1 )
432-
433- role_assignment_params = RoleAssignmentCreateParameters (
434- role_definition_id = role_definition_id ,
435- principal_id = client_id ,
436- principal_type = "User" ,
437- )
438-
439- while True :
440- try :
441- auth_client .role_assignments .create (
442- scope , str (uuid .uuid4 ()), role_assignment_params
443- )
444- break
445- except HttpResponseError :
446- time .sleep (1 )
447-
448482 deletes = []
449483 while len (container_group_names ) != 0 :
450484 remove_after = []
0 commit comments