Skip to content

Commit 6302d43

Browse files
author
allegroai
committed
Add support for skipping container apt installs using CLEARML_AGENT_SKIP_CONTAINER_APT env var in k8s
Add runtime callback support for setting runtime properties per task in k8s
Fix: remove the task from the pending queue and set it to failed when kubectl apply fails
1 parent 760bbca commit 6302d43

File tree

1 file changed

+101
-19
lines changed

1 file changed

+101
-19
lines changed

clearml_agent/glue/k8s.py

Lines changed: 101 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -69,16 +69,23 @@ class K8sIntegration(Worker):
6969
'echo "ldconfig" >> /etc/profile',
7070
"/usr/sbin/sshd -p {port}"]
7171

72-
CONTAINER_BASH_SCRIPT = [
72+
_CONTAINER_APT_SCRIPT_SECTION = [
7373
"export DEBIAN_FRONTEND='noninteractive'",
7474
"echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
7575
"chown -R root /root/.cache/pip",
7676
"apt-get update",
7777
"apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
78+
]
79+
80+
CONTAINER_BASH_SCRIPT = [
81+
*(
82+
'[ ! -z "$CLEARML_AGENT_SKIP_CONTAINER_APT" ] || {}'.format(line)
83+
for line in _CONTAINER_APT_SCRIPT_SECTION
84+
),
7885
"declare LOCAL_PYTHON",
7986
"[ ! -z $LOCAL_PYTHON ] || for i in {{15..5}}; do which python3.$i && python3.$i -m pip --version && "
8087
"export LOCAL_PYTHON=$(which python3.$i) && break ; done",
81-
"[ ! -z $LOCAL_PYTHON ] || apt-get install -y python3-pip",
88+
'[ ! -z "$CLEARML_AGENT_SKIP_CONTAINER_APT" ] || [ ! -z "$LOCAL_PYTHON" ] || apt-get install -y python3-pip',
8289
"[ ! -z $LOCAL_PYTHON ] || export LOCAL_PYTHON=python3",
8390
"{extra_bash_init_cmd}",
8491
"[ ! -z $CLEARML_AGENT_NO_UPDATE ] || $LOCAL_PYTHON -m pip install clearml-agent{agent_install_args}",
@@ -100,6 +107,7 @@ def __init__(
100107
num_of_services=20,
101108
base_pod_num=1,
102109
user_props_cb=None,
110+
runtime_cb=None,
103111
overrides_yaml=None,
104112
template_yaml=None,
105113
clearml_conf_file=None,
@@ -127,6 +135,7 @@ def __init__(
127135
:param callable user_props_cb: An Optional callable allowing additional user properties to be specified
128136
when scheduling a task to run in a pod. Callable can receive an optional pod number and should return
129137
a dictionary of user properties (name and value). Signature is [[Optional[int]], Dict[str,str]]
138+
:param callable runtime_cb: An Optional callable allowing additional task runtime to be specified (see user_props_cb)
130139
:param str overrides_yaml: YAML file containing the overrides for the pod (optional)
131140
:param str template_yaml: YAML file containing the template for the pod (optional).
132141
If provided the pod is scheduled with kubectl apply and overrides are ignored, otherwise with kubectl run.
@@ -161,6 +170,7 @@ def __init__(
161170
self.base_pod_num = base_pod_num
162171
self._edit_hyperparams_support = None
163172
self._user_props_cb = user_props_cb
173+
self._runtime_cb = runtime_cb
164174
self.conf_file_content = None
165175
self.overrides_json_string = None
166176
self.template_dict = None
@@ -198,6 +208,10 @@ def __init__(
198208
self._session.feature_set != "basic" and self._session.check_min_server_version("3.22.3")
199209
)
200210

211+
@property
def agent_label(self):
    """Return the agent label, as produced by `_get_agent_label()`."""
    label = self._get_agent_label()
    return label
214+
201215
def _create_daemon_instance(self, cls_, **kwargs):
202216
return cls_(agent=self, **kwargs)
203217

@@ -430,6 +444,9 @@ def resource_applied(self, resource_name: str, namespace: str, task_id: str, ses
430444
""" Called when a resource (pod/job) was applied """
431445
pass
432446

447+
def ports_mode_supported_for_task(self, task_id: str, task_data):
    """
    Tell whether ports mode should be used when scheduling the given task.

    This implementation ignores both arguments and simply reports the
    agent-wide `ports_mode` setting.

    :param str task_id: ID of the task about to be scheduled (unused here)
    :param task_data: data of the task about to be scheduled (unused here)
    :return: the value of `self.ports_mode`
    """
    supported = self.ports_mode
    return supported
449+
433450
def run_one_task(self, queue: Text, task_id: Text, worker_args=None, task_session=None, **_):
434451
print('Pulling task {} launching on kubernetes cluster'.format(task_id))
435452
session = task_session or self._session
@@ -501,8 +518,10 @@ def run_one_task(self, queue: Text, task_id: Text, worker_args=None, task_sessio
501518
)
502519
)
503520

504-
if self.ports_mode:
521+
ports_mode = False
522+
if self.ports_mode_supported_for_task(task_id, task_data):
505523
print("Kubernetes looking for available pod to use")
524+
ports_mode = True
506525

507526
# noinspection PyBroadException
508527
try:
@@ -513,12 +532,12 @@ def run_one_task(self, queue: Text, task_id: Text, worker_args=None, task_sessio
513532
# Search for a free pod number
514533
pod_count = 0
515534
pod_number = self.base_pod_num
516-
while self.ports_mode or self.max_pods_limit:
535+
while ports_mode or self.max_pods_limit:
517536
pod_number = self.base_pod_num + pod_count
518537

519538
try:
520539
items_count = self._get_pod_count(
521-
extra_labels=[self.limit_pod_label.format(pod_number=pod_number)] if self.ports_mode else None,
540+
extra_labels=[self.limit_pod_label.format(pod_number=pod_number)] if ports_mode else None,
522541
msg="Looking for a free pod/port"
523542
)
524543
except GetPodCountError:
@@ -568,11 +587,11 @@ def run_one_task(self, queue: Text, task_id: Text, worker_args=None, task_sessio
568587
break
569588
pod_count += 1
570589

571-
labels = self._get_pod_labels(queue, queue_name)
572-
if self.ports_mode:
590+
labels = self._get_pod_labels(queue, queue_name, task_data)
591+
if ports_mode:
573592
labels.append(self.limit_pod_label.format(pod_number=pod_number))
574593

575-
if self.ports_mode:
594+
if ports_mode:
576595
print("Kubernetes scheduling task id={} on pod={} (pod_count={})".format(task_id, pod_number, pod_count))
577596
else:
578597
print("Kubernetes scheduling task id={}".format(task_id))
@@ -611,40 +630,95 @@ def run_one_task(self, queue: Text, task_id: Text, worker_args=None, task_sessio
611630
send_log = "Running kubectl encountered an error: {}".format(error)
612631
self.log.error(send_log)
613632
self.send_logs(task_id, send_log.splitlines())
633+
634+
# Make sure to remove the task from our k8s pending queue
635+
self._session.api_client.queues.remove_task(
636+
task=task_id,
637+
queue=self.k8s_pending_queue_id,
638+
)
639+
# Set task as failed
640+
session.api_client.tasks.failed(task_id, force=True)
614641
return
615642

616643
if pod_name:
617644
self.resource_applied(
618645
resource_name=pod_name, namespace=namespace, task_id=task_id, session=session
619646
)
620647

648+
self.set_task_info(
649+
task_id=task_id, task_session=task_session, queue_name=queue_name, ports_mode=ports_mode,
650+
pod_number=pod_number, pod_count=pod_count, task_data=task_data
651+
)
652+
653+
def set_task_info(
    self, task_id: str, task_session, task_data, queue_name: str, ports_mode: bool, pod_number, pod_count
):
    """
    Record scheduling information on the task as user properties and, when the
    runtime callback yields anything, as runtime properties.

    Callback failures and runtime update failures are reported with a printed
    warning and do not propagate.

    :param str task_id: ID of the scheduled task
    :param task_session: session forwarded to `_set_task_user_properties` and used for the
        runtime update request; when it is None the agent session (`self._session`) is used
        for the runtime update
    :param task_data: data of the scheduled task (not used by this implementation)
    :param str queue_name: stored (as a string) in the "k8s-queue" user property
    :param bool ports_mode: when True, the pod number, pod count and agent label are stored as
        well, and the callbacks are called with the pod number; otherwise they are called
        without arguments
    :param pod_number: pod number selected for the task
    :param pod_count: pod counter value reached while looking for a free pod
    """
    user_props = {"k8s-queue": str(queue_name)}
    runtime = {}
    if ports_mode:
        agent_label = self._get_agent_label()
        user_props.update({
            "k8s-pod-number": pod_number,
            "k8s-pod-label": agent_label,  # backwards-compatibility / legacy
            "k8s-internal-pod-count": pod_count,
            "k8s-agent": agent_label,
        })

    # Both callbacks follow the same calling convention, so they are handled uniformly.
    # They stay best-effort (a broken callback must not fail scheduling), but the failure
    # is reported instead of being silently dropped.
    for callback, target, description in (
        (self._user_props_cb, user_props, "user properties"),
        (self._runtime_cb, runtime, "runtime properties"),
    ):
        if not callback:
            continue
        # noinspection PyBroadException
        try:
            target.update(callback(pod_number) if ports_mode else callback())
        except Exception as ex:
            print(
                "WARNING: failed getting custom {} for task '{}': {}".format(description, task_id, ex)
            )

    if user_props:
        self._set_task_user_properties(
            task_id=task_id,
            task_session=task_session,
            **user_props
        )

    if not runtime:
        return

    # Merge into the existing runtime properties so the edit request does not drop them
    task_runtime = self._get_task_runtime(task_id) or {}
    task_runtime.update(runtime)

    # run_one_task forwards its optional task_session as-is, so it may be None here;
    # fall back to the agent session instead of failing on every such call
    session = task_session or self._session

    try:
        res = session.send_request(
            service='tasks', action='edit', method=Request.def_method,
            json={
                "task": task_id, "force": True, "runtime": task_runtime
            },
        )
        if not res.ok:
            raise Exception("failed setting runtime property")
    except Exception as ex:
        print("WARNING: failed setting custom runtime properties for task '{}': {}".format(task_id, ex))
705+
706+
def _get_task_runtime(self, task_id) -> Optional[dict]:
    """
    Fetch the runtime properties currently stored on a task.

    The request is sent on the agent session (`self._session`). Any failure is
    printed as an error and reported to the caller as None rather than raised.

    :param task_id: ID of the task to query
    :return: the "runtime" entry of the returned task (an empty dict when the task has no
        such entry), or None when the request failed or returned no task data
    """
    try:
        res = self._session.send_request(
            service='tasks', action='get_by_id', method=Request.def_method,
            json={"task": task_id, "only_fields": ["runtime"]},
        )
        if not res.ok:
            raise ValueError(f"request returned {res.status_code}")
        data = res.json().get("data")
        if not data or "task" not in data:
            raise ValueError("empty data in result")
        return data["task"].get("runtime", {})
    except Exception as ex:
        print(f"ERROR: Failed getting runtime properties for task {task_id}: {ex}")
    # Explicit failure result, matching the Optional[dict] annotation
    return None
720+
721+
def _get_pod_labels(self, queue, queue_name, task_data):
648722
return [
649723
self._get_agent_label(),
650724
"{}={}".format(self.QUEUE_LABEL, self._safe_k8s_label_value(queue)),
@@ -1012,6 +1086,9 @@ def _cleanup_old_pods(self, namespaces, extra_msg=None):
10121086

10131087
return deleted_pods
10141088

1089+
def check_if_suspended(self) -> bool:
    """
    Report whether the agent is currently suspended.

    The tasks loop calls this before pulling the next task; on a truthy result
    it sleeps for the polling interval instead. This implementation never
    reports a suspended agent.

    :return: always False
    """
    return False
1091+
10151092
def run_tasks_loop(self, queues: List[Text], worker_params, **kwargs):
10161093
"""
10171094
:summary: Pull and run tasks from queues.
@@ -1061,6 +1138,11 @@ def run_tasks_loop(self, queues: List[Text], worker_params, **kwargs):
10611138
# delete old completed / failed pods
10621139
self._cleanup_old_pods(namespaces, extra_msg="Cleanup cycle {cmd}")
10631140

1141+
if self.check_if_suspended():
1142+
print("Agent is suspended, sleeping for {:.1f} seconds".format(self._polling_interval))
1143+
sleep(self._polling_interval)
1144+
break
1145+
10641146
# get next task in queue
10651147
try:
10661148
# print(f"debug> getting tasks for queue {queue}")

0 commit comments

Comments
 (0)