Skip to content

feature: retry decorator improvements #2390

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions metaflow/plugins/argo/argo_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from metaflow.plugins.kubernetes.kube_utils import qos_requests_and_limits

from metaflow.plugins.kubernetes.kubernetes_jobsets import KubernetesArgoJobSet
from metaflow.plugins.retry_decorator import PLATFORM_EVICTED_EXITCODE, RetryEvents
from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
from metaflow.user_configs.config_options import ConfigInput
from metaflow.util import (
Expand Down Expand Up @@ -1520,11 +1521,13 @@ def _container_templates(self):
max_user_code_retries = 0
max_error_retries = 0
minutes_between_retries = "2"
retry_conditions = []
for decorator in node.decorators:
if decorator.name == "retry":
minutes_between_retries = decorator.attributes.get(
"minutes_between_retries", minutes_between_retries
)
retry_conditions = decorator.attributes["only_on"]
user_code_retries, error_retries = decorator.step_task_retry_count()
max_user_code_retries = max(max_user_code_retries, user_code_retries)
max_error_retries = max(max_error_retries, error_retries)
Expand All @@ -1546,6 +1549,21 @@ def _container_templates(self):

minutes_between_retries = int(minutes_between_retries)

# Translate RetryEvents to expressions for Argo
event_to_expr = {
RetryEvents.STEP: "asInt(lastRetry.exitCode) == 1",
RetryEvents.PREEMPT: "asInt(lastRetry.exitCode) == %s"
% PLATFORM_EVICTED_EXITCODE,
}
retry_expr = None
if retry_conditions:
retry_expressions = [
expr
for event, expr in event_to_expr.items()
if event.value in retry_conditions
]
retry_expr = "||".join(retry_expressions)

# Configure log capture.
mflog_expr = export_mflog_env_vars(
datastore_type=self.flow_datastore.TYPE,
Expand Down Expand Up @@ -2137,6 +2155,7 @@ def _container_templates(self):
.retry_strategy(
times=total_retries,
minutes_between_retries=minutes_between_retries,
expression=retry_expr,
)
)
else:
Expand All @@ -2156,6 +2175,7 @@ def _container_templates(self):
.retry_strategy(
times=total_retries,
minutes_between_retries=minutes_between_retries,
expression=retry_expr,
)
.metadata(
ObjectMeta()
Expand Down Expand Up @@ -3661,13 +3681,17 @@ def service_account_name(self, service_account_name):
self.payload["serviceAccountName"] = service_account_name
return self

def retry_strategy(self, times, minutes_between_retries):
def retry_strategy(self, times, minutes_between_retries, expression=None):
    """Attach an Argo Workflows retryStrategy to this template.

    Parameters
    ----------
    times : int
        Retry limit. If 0 or negative, no retryStrategy is added.
    minutes_between_retries : int or str
        Backoff duration between retries, rendered as "<value>m".
    expression : str, optional
        An Argo retry expression (e.g. on lastRetry.exitCode). When
        provided it is added alongside retryPolicy "Always".
        NOTE(review): Argo evaluates retryPolicy AND expression together
        when both are set — confirm this is the intended gating.

    Returns
    -------
    self, for fluent chaining.
    """
    if times > 0:
        # Build the strategy once; the original assigned retryPolicy
        # "Always" here and then redundantly re-assigned it when no
        # expression was given.
        strategy = {
            "retryPolicy": "Always",
            "limit": times,
            "backoff": {"duration": "%sm" % minutes_between_retries},
        }
        if expression is not None:
            strategy["expression"] = expression
        self.payload["retryStrategy"] = strategy
    return self

def empty_dir_volume(self, name, medium=None, size_limit=None):
Expand Down
9 changes: 9 additions & 0 deletions metaflow/plugins/aws/batch/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ class BatchKilledException(MetaflowException):
headline = "AWS Batch task killed"


class BatchSpotInstanceTerminated(MetaflowException):
    """Raised when a crashed AWS Batch task reports status code 234,
    which this plugin treats as a spot-instance termination so the CLI
    can decide retry eligibility separately from ordinary failures."""

    headline = "Spot Instance has been terminated"


class Batch(object):
def __init__(self, metadata, environment):
self.metadata = metadata
Expand Down Expand Up @@ -482,6 +486,11 @@ def wait_for_launch(job, child_jobs):
# to Amazon S3.

if self.job.is_crashed:

# Custom exception for spot instance terminations
if self.job.status_code == 234:
raise BatchSpotInstanceTerminated()

msg = next(
msg
for msg in [
Expand Down
25 changes: 23 additions & 2 deletions metaflow/plugins/aws/batch/batch_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,22 @@

from metaflow import util
from metaflow import R
from metaflow.exception import CommandException, METAFLOW_EXIT_DISALLOW_RETRY
from metaflow.exception import (
METAFLOW_EXIT_ALLOW_RETRY,
CommandException,
METAFLOW_EXIT_DISALLOW_RETRY,
)
from metaflow.metadata_provider.util import sync_local_metadata_from_datastore
from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
from metaflow.mflog import TASK_LOG_SOURCE
from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
from .batch import Batch, BatchKilledException
from .batch import (
Batch,
BatchException,
BatchKilledException,
BatchSpotInstanceTerminated,
)
from metaflow.plugins.retry_decorator import RetryEvents


@click.group()
Expand Down Expand Up @@ -283,6 +293,7 @@ def echo(msg, stream="stderr", batch_id=None, **kwargs):
if split_vars:
env.update(split_vars)

retry_conditions = retry_deco[0].attributes["only_on"] if retry_deco else []
if retry_count:
ctx.obj.echo_always(
"Sleeping %d minutes before the next AWS Batch retry"
Expand Down Expand Up @@ -356,5 +367,15 @@ def _sync_metadata():
# don't retry killed tasks
traceback.print_exc()
sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
except BatchSpotInstanceTerminated:
traceback.print_exc()
if not retry_conditions or RetryEvents.PREEMPT.value in retry_conditions:
sys.exit(METAFLOW_EXIT_ALLOW_RETRY)
else:
sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
except BatchException:
if not retry_conditions or RetryEvents.STEP.value in retry_conditions:
raise
sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
finally:
_sync_metadata()
7 changes: 6 additions & 1 deletion metaflow/plugins/aws/batch/batch_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import random
import time
import hashlib
from typing import Dict, List, Optional

try:
unicode
Expand Down Expand Up @@ -630,8 +631,12 @@ def parameter(self, key, value):
self.payload["parameters"][key] = str(value)
return self

def attempts(self, attempts):
def attempts(self, attempts, evaluate_on_exit: Optional[List[Dict]] = None):
    """Set the AWS Batch retryStrategy attempt count and, optionally,
    custom EvaluateOnExit conditions.

    evaluate_on_exit is required when specifying custom retry strategies.
    ref: https://docs.aws.amazon.com/batch/latest/APIReference/API_EvaluateOnExit.html

    Returns self, for fluent chaining.
    """
    strategy = self.payload["retryStrategy"]
    strategy["attempts"] = attempts
    if evaluate_on_exit is not None:
        strategy["evaluateOnExit"] = evaluate_on_exit
    return self


Expand Down
2 changes: 2 additions & 0 deletions metaflow/plugins/aws/batch/batch_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,8 @@ def task_pre_step(
self._save_logs_sidecar.start()

# Start spot termination monitor sidecar.
# TODO: A nicer way to pass the main process id to a Sidecar, in order to allow sidecars to send signals back to the main process.
os.environ["MF_MAIN_PID"] = str(os.getpid())
current._update_env(
{"spot_termination_notice": "/tmp/spot_termination_notice"}
)
Expand Down
32 changes: 31 additions & 1 deletion metaflow/plugins/aws/step_functions/step_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH,
)
from metaflow.parameters import deploy_time_eval
from metaflow.plugins.retry_decorator import PLATFORM_EVICTED_EXITCODE, RetryEvents
from metaflow.user_configs.config_options import ConfigInput
from metaflow.util import dict_to_cli_options, to_pascalcase

Expand Down Expand Up @@ -824,9 +825,38 @@ def _batch(self, node):
batch_deco = [deco for deco in node.decorators if deco.name == "batch"][0]
resources = {}
resources.update(batch_deco.attributes)

# Resolve retry strategy.
user_code_retries, total_retries = self._get_retries(node)

# retry conditions mapping
retry_deco = next(
(deco for deco in node.decorators if deco.name == "retry"), None
)
retry_conditions = (
retry_deco.attributes["only_on"] if retry_deco is not None else []
)

# Translate RetryEvents to expressions for SFN
event_to_expr = {
RetryEvents.STEP: {"action": "RETRY", "onExitCode": "1"},
RetryEvents.PREEMPT: {
"action": "RETRY",
"onExitCode": str(PLATFORM_EVICTED_EXITCODE),
},
}
retry_expr = None
# NOTE: AWS only allows 5 distinct EvaluateOnExit conditions, so any more than this will require combining them.
if retry_conditions:
retry_expr = [
expr
for event, expr in event_to_expr.items()
if event.value in retry_conditions
]
# We need to append a catch-all exit condition, because when no condition
# matches, AWS Batch's default behavior is to retry the job.
# Retry conditions are evaluated only for non-zero exit codes, so the
# wildcard is safe here.
retry_expr.append({"action": "EXIT", "onExitCode": "*"})

task_spec = {
"flow_name": attrs["metaflow.flow_name"],
"step_name": attrs["metaflow.step_name"],
Expand Down Expand Up @@ -875,7 +905,7 @@ def _batch(self, node):
log_driver=resources["log_driver"],
log_options=resources["log_options"],
)
.attempts(total_retries + 1)
.attempts(attempts=total_retries + 1, evaluate_on_exit=retry_expr)
)

def _get_retries(self, node):
Expand Down
7 changes: 7 additions & 0 deletions metaflow/plugins/kubernetes/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ class KubernetesKilledException(MetaflowException):
headline = "Kubernetes Batch job killed"


class KubernetesSpotInstanceTerminated(MetaflowException):
    """Raised when a Kubernetes task exits with code 234, which this
    plugin treats as a spot-instance (node) termination so the CLI can
    decide retry eligibility separately from ordinary failures."""

    headline = "Kubernetes node spot instance has been terminated"


class Kubernetes(object):
def __init__(
self,
Expand Down Expand Up @@ -764,6 +768,9 @@ def _has_updates():
)
if int(exit_code) == 134:
raise KubernetesException("%s (exit code %s)" % (msg, exit_code))
if int(exit_code) == 234:
# NOTE: Kubernetes reports container exit codes modulo 256.
raise KubernetesSpotInstanceTerminated()
else:
msg = "%s (exit code %s)" % (msg, exit_code)
raise KubernetesException(
Expand Down
20 changes: 19 additions & 1 deletion metaflow/plugins/kubernetes/kubernetes_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
import metaflow.tracing as tracing
from metaflow import JSONTypeClass, util
from metaflow._vendor import click
from metaflow.exception import METAFLOW_EXIT_DISALLOW_RETRY, MetaflowException
from metaflow.exception import (
METAFLOW_EXIT_DISALLOW_RETRY,
METAFLOW_EXIT_ALLOW_RETRY,
MetaflowException,
)
from metaflow.metadata_provider.util import sync_local_metadata_from_datastore
from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
from metaflow.mflog import TASK_LOG_SOURCE
Expand All @@ -21,7 +25,9 @@
Kubernetes,
KubernetesException,
KubernetesKilledException,
KubernetesSpotInstanceTerminated,
)
from metaflow.plugins.retry_decorator import RetryEvents


@click.group()
Expand Down Expand Up @@ -221,6 +227,7 @@ def echo(msg, stream="stderr", job_id=None, **kwargs):
minutes_between_retries = int(
retry_deco[0].attributes.get("minutes_between_retries", 2)
)
retry_conditions = retry_deco[0].attributes["only_on"] if retry_deco else []
if retry_count:
ctx.obj.echo_always(
"Sleeping %d minutes before the next retry" % minutes_between_retries
Expand Down Expand Up @@ -330,6 +337,17 @@ def _sync_metadata():
# don't retry killed tasks
traceback.print_exc()
sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
except KubernetesSpotInstanceTerminated:
traceback.print_exc()
if not retry_conditions or RetryEvents.PREEMPT.value in retry_conditions:
sys.exit(METAFLOW_EXIT_ALLOW_RETRY)
else:
sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
except KubernetesException:
if not retry_conditions or RetryEvents.STEP.value in retry_conditions:
raise
sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)

finally:
_sync_metadata()

Expand Down
2 changes: 2 additions & 0 deletions metaflow/plugins/kubernetes/kubernetes_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,8 @@ def task_pre_step(
self._save_logs_sidecar.start()

# Start spot termination monitor sidecar.
# TODO: A nicer way to pass the main process id to a Sidecar, in order to allow sidecars to send signals back to the main process.
os.environ["MF_MAIN_PID"] = str(os.getpid())
current._update_env(
{"spot_termination_notice": "/tmp/spot_termination_notice"}
)
Expand Down
5 changes: 4 additions & 1 deletion metaflow/plugins/kubernetes/spot_monitor_sidecar.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ def __init__(self):
self._token = None
self._token_expiry = 0

# Due to nesting, os.getppid is not reliable for fetching the main task pid
self.main_pid = int(os.getenv("MF_MAIN_PID", os.getppid()))

if self._is_aws_spot_instance():
self._process = Process(target=self._monitor_loop)
self._process.start()
Expand Down Expand Up @@ -71,7 +74,7 @@ def _monitor_loop(self):
if response.status_code == 200:
termination_time = response.text
self._emit_termination_metadata(termination_time)
os.kill(os.getppid(), signal.SIGTERM)
os.kill(self.main_pid, signal.SIGUSR1)
break
except (requests.exceptions.RequestException, requests.exceptions.Timeout):
pass
Expand Down
Loading
Loading