Commit b9d0249

npow and savingoyal authored
This PR fixes several issues which caused s3op to be stuck: 1. Need to call `queue.cancel_join_thread()` so that the workers can exit without flushing the queue, otherwise there is a deadlock 2. Catch the correct exceptions in download/upload (they don't actually raise `ClientError`) 3. Handle `InternalError` 4. Handle `SSLError` 5. Optimistically assume all other unhandled exceptions are transient Additional improvements: 1. Added exponential backoff to `jitter_sleep()` 2. Set default retry config in s3op.py to match that in aws_client.py 3. Fix a bug where the retry setting was not being applied properly if config was missing 4. Fail early on fatal errors 5. Don't restart from scratch when there's no progress --------- Co-authored-by: npow <npow@netflix.com> Co-authored-by: Savin <savingoyal@gmail.com>
1 parent 2e97e51 commit b9d0249
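A note on the first fix: a `multiprocessing.Queue` joins its feeder thread when the owning process exits, which blocks until buffered items are flushed; if the reader is no longer draining the queue, the worker can hang forever. Below is a minimal, self-contained sketch of that mechanism -- the `worker` and `result_queue` names are hypothetical and this is not the actual s3op.py code -- showing how `cancel_join_thread()` lets a worker exit without flushing.

```python
# Minimal sketch (hypothetical names, not the real s3op.py workers): a worker
# calls cancel_join_thread() on its result queue so that, on exit, it does not
# block waiting for the queue's feeder thread to flush buffered items.
import multiprocessing


def worker(result_queue):
    # Allow this process to exit without flushing whatever is still buffered
    # in the queue -- the deadlock fix described in item 1 of the summary.
    result_queue.cancel_join_thread()
    for i in range(10):
        result_queue.put(i)


if __name__ == "__main__":
    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=worker, args=(q,))
    p.start()
    p.join()
    # Drain whatever made it into the queue before the worker exited.
    while not q.empty():
        print(q.get())
```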

File tree

6 files changed: +203 -111 lines changed


.github/workflows/metaflow.s3_tests.yml (+3 -1)

@@ -14,7 +14,9 @@ permissions: read-all
 
 jobs:
   test_data:
-    if: (github.event.action == 'labeled' && (github.event.label.name == 'approved' || github.event.label.name == 'ok-to-test')) || (github.event.action != 'labeled' && (contains(github.event.pull_request.labels.*.name, 'ok-to-test') || contains(github.event.pull_request.labels.*.name, 'approved')))
+    # NOTE: temporarily disable s3 tests since they will fail due to 403
+    # https://netflix.slack.com/archives/C023Y9JH36W/p1740806169172489?thread_ts=1740180557.110859&cid=C023Y9JH36W
+    if: false && ((github.event.action == 'labeled' && (github.event.label.name == 'approved' || github.event.label.name == 'ok-to-test')) || (github.event.action != 'labeled' && (contains(github.event.pull_request.labels.*.name, 'ok-to-test') || contains(github.event.pull_request.labels.*.name, 'approved'))))
     name: metaflow.s3 / Python ${{ matrix.ver }} on ${{ matrix.os }}
     runs-on: ${{ matrix.os }}
     strategy:

metaflow/metaflow_config.py (+6)

@@ -109,6 +109,12 @@
 # top-level retries)
 S3_TRANSIENT_RETRY_COUNT = from_conf("S3_TRANSIENT_RETRY_COUNT", 20)
 
+# S3 retry configuration used in the aws client
+# Use the adaptive retry strategy by default
+S3_CLIENT_RETRY_CONFIG = from_conf(
+    "S3_CLIENT_RETRY_CONFIG", {"max_attempts": 10, "mode": "adaptive"}
+)
+
 # Threshold to start printing warnings for an AWS retry
 RETRY_WARNING_THRESHOLD = 3
 

metaflow/plugins/aws/aws_client.py (+4 -3)

@@ -14,6 +14,7 @@ def get_client(
        AWS_SANDBOX_ENABLED,
        AWS_SANDBOX_STS_ENDPOINT_URL,
        AWS_SANDBOX_API_KEY,
+       S3_CLIENT_RETRY_CONFIG,
    )
 
    if session_vars is None:
@@ -37,10 +38,10 @@ def get_client(
    if module == "s3" and (
        "config" not in client_params or client_params["config"].retries is None
    ):
-       # Use the adaptive retry strategy by default -- do not set anything if
-       # the user has already set something
+       # do not set anything if the user has already set something
        config = client_params.get("config", Config())
-       config.retries = {"max_attempts": 10, "mode": "adaptive"}
+       config.retries = S3_CLIENT_RETRY_CONFIG
+       client_params["config"] = config
 
    if AWS_SANDBOX_ENABLED:
        # role is ignored in the sandbox
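For context on the change above: the retry dictionary is botocore's standard retry configuration. Here is a minimal standalone sketch (not the real `get_client()` code; the region name is an arbitrary placeholder) of how such a dict ends up applied to an S3 client:

```python
# Sketch only: shows how a retries dict like S3_CLIENT_RETRY_CONFIG is applied
# to an S3 client through botocore's Config object. get_client() does the
# equivalent, while also leaving any user-supplied retry config untouched.
import boto3
from botocore.config import Config

# Same default as S3_CLIENT_RETRY_CONFIG in metaflow_config.py
retry_config = {"max_attempts": 10, "mode": "adaptive"}

config = Config(region_name="us-east-1")  # region is arbitrary for this sketch
config.retries = retry_config

# Passing the config at client creation is the step that was previously missing
# when no config was supplied (the client_params["config"] = config fix above).
s3 = boto3.client("s3", config=config)
```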

metaflow/plugins/datatools/s3/s3.py (+46 -44)

@@ -18,6 +18,7 @@
     S3_RETRY_COUNT,
     S3_TRANSIENT_RETRY_COUNT,
     S3_SERVER_SIDE_ENCRYPTION,
+    S3_WORKER_COUNT,
     TEMPDIR,
 )
 from metaflow.util import (
@@ -1390,9 +1391,31 @@ def _one_boto_op(self, op, url, create_tmp_file=True):
         )
 
     # add some jitter to make sure retries are not synchronized
-    def _jitter_sleep(self, trynum, multiplier=2):
-        interval = multiplier**trynum + random.randint(0, 10)
-        time.sleep(interval)
+    def _jitter_sleep(
+        self, trynum: int, base: int = 2, cap: int = 360, jitter: float = 0.1
+    ) -> None:
+        """
+        Sleep for an exponentially increasing interval with added jitter.
+
+        Parameters
+        ----------
+        trynum: The current retry attempt number.
+        base: The base multiplier for the exponential backoff.
+        cap: The maximum interval to sleep.
+        jitter: The maximum jitter percentage to add to the interval.
+        """
+        # Calculate the exponential backoff interval
+        interval = min(cap, base**trynum)
+
+        # Add random jitter
+        jitter_value = interval * jitter * random.uniform(-1, 1)
+        interval_with_jitter = interval + jitter_value
+
+        # Ensure the interval is not negative
+        interval_with_jitter = max(0, interval_with_jitter)
+
+        # Sleep for the calculated interval
+        time.sleep(interval_with_jitter)
 
     # NOTE: re: _read_many_files and _put_many_files
     # All file IO is through binary files - we write bytes, we read
@@ -1480,20 +1503,17 @@ def _s3op_with_retries(self, mode, **options):
        # - a known transient failure (SlowDown for example) in which case we will
        #   retry *only* the inputs that have this transient failure.
        # - an unknown failure (something went wrong but we cannot say if it was
-       #   a known permanent failure or something else). In this case, we retry
-       #   the operation completely.
-       #
-       # There are therefore two retry counts:
-       # - the transient failure retry count: how many times do we try on known
-       #   transient errors
-       # - the top-level retry count: how many times do we try on unknown failures
+       #   a known permanent failure or something else). In this case, we assume
+       #   it's a transient failure and retry only those inputs (same as above).
        #
-       # Note that, if the operation runs out of transient failure retries, it will
-       # count as an "unknown" failure (ie: it will be retried according to the
-       # outer top-level retry count). In other words, you can potentially have
-       # transient_retry_count * retry_count tries).
-       # Finally, if on transient failures, we make NO progress (ie: no input is
-       # successfully processed), that counts as an "unknown" failure.
+       # NOTES(npow): 2025-05-13
+       # Previously, this code would also retry the fatal failures, including no_progress
+       # and unknown failures, from the beginning. This is not ideal because:
+       # 1. Fatal errors are not supposed to be retried.
+       # 2. Retrying from the beginning does not improve the situation, and is
+       #    wasteful since we have already uploaded some files.
+       # 3. The number of transient errors is far more than fatal errors, so we
+       #    can be optimistic and assume the unknown errors are transient.
        cmdline = [sys.executable, os.path.abspath(s3op.__file__), mode]
        recursive_get = False
        for key, value in options.items():
@@ -1528,7 +1548,6 @@ def _inject_failure_rate():
            # Otherwise, we cap the failure rate at 90%
            return min(90, self._s3_inject_failures)
 
-       retry_count = 0  # Number of retries (excluding transient failures)
        transient_retry_count = 0  # Number of transient retries (per top-level retry)
        inject_failures = _inject_failure_rate()
        out_lines = []  # List to contain the lines returned by _s3op_with_retries
@@ -1595,7 +1614,12 @@ def try_s3_op(last_ok_count, pending_retries, out_lines, inject_failures):
                # things, this will shrink more and more until we are doing a
                # single operation at a time. If things start going better, it
                # will increase by 20% every round.
-               max_count = min(int(last_ok_count * 1.2), len(pending_retries))
+               #
+               # If we made no progress (last_ok_count == 0) we retry at most
+               # 2*S3_WORKER_COUNT from whatever is left in `pending_retries`
+               max_count = min(
+                   int(last_ok_count * 1.2), len(pending_retries)
+               ) or min(2 * S3_WORKER_COUNT, len(pending_retries))
                tmp_input.writelines(pending_retries[:max_count])
                tmp_input.flush()
                debug.s3client_exec(
@@ -1712,38 +1736,16 @@ def try_s3_op(last_ok_count, pending_retries, out_lines, inject_failures):
            _update_out_lines(out_lines, ok_lines, resize=loop_count == 0)
            return 0, 0, inject_failures, err_out
 
-       while retry_count <= S3_RETRY_COUNT:
+       while transient_retry_count <= S3_TRANSIENT_RETRY_COUNT:
            (
                last_ok_count,
                last_retry_count,
                inject_failures,
                err_out,
            ) = try_s3_op(last_ok_count, pending_retries, out_lines, inject_failures)
-           if err_out or (
-               last_retry_count != 0
-               and (
-                   last_ok_count == 0
-                   or transient_retry_count > S3_TRANSIENT_RETRY_COUNT
-               )
-           ):
-               # We had a fatal failure (err_out is not None)
-               # or we made no progress (last_ok_count is 0)
-               # or we are out of transient retries
-               # so we will restart from scratch (being very conservative)
-               retry_count += 1
-               err_msg = err_out
-               if err_msg is None and last_ok_count == 0:
-                   err_msg = "No progress"
-               if err_msg is None:
-                   err_msg = "Too many transient errors"
-               print(
-                   "S3 non-transient error (attempt #%d): %s" % (retry_count, err_msg)
-               )
-               _reset()
-               if retry_count <= S3_RETRY_COUNT:
-                   self._jitter_sleep(retry_count)
-               continue
-           elif last_retry_count != 0:
+           if err_out:
+               break
+           if last_retry_count != 0:
               # During our last try, we did not manage to process everything we wanted
               # due to a transient failure so we try again.
               transient_retry_count += 1
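The new `_jitter_sleep()` in the diff above is a capped exponential backoff with roughly ±10% jitter. A standalone sketch of just the interval math (reimplemented outside the S3 class, so the function name here is illustrative) shows the intervals it produces across attempts:

```python
# Standalone sketch of the interval math used by the new _jitter_sleep():
# capped exponential backoff, min(cap, base**trynum), with +/- `jitter` noise.
import random


def backoff_interval(trynum, base=2, cap=360, jitter=0.1):
    interval = min(cap, base**trynum)
    # Jitter is a fraction of the interval, drawn uniformly in [-jitter, +jitter]
    interval += interval * jitter * random.uniform(-1, 1)
    return max(0, interval)


if __name__ == "__main__":
    for trynum in range(1, 12):
        # Grows 2, 4, 8, ... seconds and saturates around 360s (plus/minus 10%)
        print(trynum, round(backoff_interval(trynum), 2))
```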
