18 | 18 | S3_RETRY_COUNT,
19 | 19 | S3_TRANSIENT_RETRY_COUNT,
20 | 20 | S3_SERVER_SIDE_ENCRYPTION,
| 21 | + S3_WORKER_COUNT,
21 | 22 | TEMPDIR,
22 | 23 | )
23 | 24 | from metaflow.util import (
@@ -1390,9 +1391,31 @@ def _one_boto_op(self, op, url, create_tmp_file=True):
1390 | 1391 | )
1391 | 1392 |
1392 | 1393 | # add some jitter to make sure retries are not synchronized
1393 | | - def _jitter_sleep(self, trynum, multiplier=2):
1394 | | - interval = multiplier**trynum + random.randint(0, 10)
1395 | | - time.sleep(interval)
| 1394 | + def _jitter_sleep(
| 1395 | + self, trynum: int, base: int = 2, cap: int = 360, jitter: float = 0.1
| 1396 | + ) -> None:
| 1397 | + """
| 1398 | + Sleep for an exponentially increasing interval with added jitter.
| 1399 | +
| 1400 | + Parameters
| 1401 | + ----------
| 1402 | + trynum: The current retry attempt number.
| 1403 | + base: The base multiplier for the exponential backoff.
| 1404 | + cap: The maximum interval to sleep.
| 1405 | + jitter: The maximum jitter percentage to add to the interval.
| 1406 | + """
| 1407 | + # Calculate the exponential backoff interval
| 1408 | + interval = min(cap, base**trynum)
| 1409 | +
| 1410 | + # Add random jitter
| 1411 | + jitter_value = interval * jitter * random.uniform(-1, 1)
| 1412 | + interval_with_jitter = interval + jitter_value
| 1413 | +
| 1414 | + # Ensure the interval is not negative
| 1415 | + interval_with_jitter = max(0, interval_with_jitter)
| 1416 | +
| 1417 | + # Sleep for the calculated interval
| 1418 | + time.sleep(interval_with_jitter)
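For intuition about the new backoff behavior, here is a minimal standalone sketch (not part of the diff) that reproduces the same formula without sleeping; the name backoff_interval is illustrative, not a Metaflow helper.

    import random

    def backoff_interval(trynum, base=2, cap=360, jitter=0.1):
        # Same formula as the new _jitter_sleep: capped exponential growth
        # with +/- `jitter` (10%) of random noise, floored at zero.
        interval = min(cap, base**trynum)
        return max(0, interval + interval * jitter * random.uniform(-1, 1))

    for attempt in (1, 4, 9):
        # attempt 1 -> ~1.8-2.2s, attempt 4 -> ~14.4-17.6s,
        # attempt 9 and later -> capped near 360s (roughly 324-396s)
        print(attempt, round(backoff_interval(attempt), 1))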
1396 | 1419 |
1397 | 1420 | # NOTE: re: _read_many_files and _put_many_files
1398 | 1421 | # All file IO is through binary files - we write bytes, we read
@@ -1480,20 +1503,17 @@ def _s3op_with_retries(self, mode, **options):
1480 | 1503 | # - a known transient failure (SlowDown for example) in which case we will
1481 | 1504 | # retry *only* the inputs that have this transient failure.
1482 | 1505 | # - an unknown failure (something went wrong but we cannot say if it was
1483 | | - # a known permanent failure or something else). In this case, we retry
1484 | | - # the operation completely.
1485 | | - #
1486 | | - # There are therefore two retry counts:
1487 | | - # - the transient failure retry count: how many times do we try on known
1488 | | - # transient errors
1489 | | - # - the top-level retry count: how many times do we try on unknown failures
| 1506 | + # a known permanent failure or something else). In this case, we assume
| 1507 | + # it's a transient failure and retry only those inputs (same as above).
1490 | 1508 | #
1491 | | - # Note that, if the operation runs out of transient failure retries, it will
1492 | | - # count as an "unknown" failure (ie: it will be retried according to the
1493 | | - # outer top-level retry count). In other words, you can potentially have
1494 | | - # transient_retry_count * retry_count tries).
1495 | | - # Finally, if on transient failures, we make NO progress (ie: no input is
1496 | | - # successfully processed), that counts as an "unknown" failure.
| 1509 | + # NOTES(npow): 2025-05-13
| 1510 | + # Previously, this code would also retry the fatal failures, including no_progress
| 1511 | + # and unknown failures, from the beginning. This is not ideal because:
| 1512 | + # 1. Fatal errors are not supposed to be retried.
| 1513 | + # 2. Retrying from the beginning does not improve the situation, and is
| 1514 | + # wasteful since we have already uploaded some files.
| 1515 | + # 3. The number of transient errors is far more than fatal errors, so we
| 1516 | + # can be optimistic and assume the unknown errors are transient.
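The failure taxonomy these comments describe boils down to: fatal errors end the operation, while transient and unknown errors re-queue only the affected inputs. A hypothetical sketch of that decision follows; the names TRANSIENT/FATAL/UNKNOWN and next_round are illustrative stand-ins, not Metaflow's s3op protocol.

    # Hypothetical illustration of the policy above, not the actual s3op protocol.
    TRANSIENT = "transient"  # e.g. SlowDown: retry just these inputs
    FATAL = "fatal"          # e.g. AccessDenied: surface immediately, no retry
    UNKNOWN = "unknown"      # optimistically treated like a transient error

    def next_round(results):
        # results: list of (input, status) pairs from one round of the operation
        if any(status == FATAL for _, status in results):
            return "fail", []
        retry = [inp for inp, status in results if status in (TRANSIENT, UNKNOWN)]
        return ("retry", retry) if retry else ("done", [])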
1497 | 1517 | cmdline = [sys.executable, os.path.abspath(s3op.__file__), mode]
1498 | 1518 | recursive_get = False
1499 | 1519 | for key, value in options.items():
@@ -1528,7 +1548,6 @@ def _inject_failure_rate():
1528 | 1548 | # Otherwise, we cap the failure rate at 90%
1529 | 1549 | return min(90, self._s3_inject_failures)
1530 | 1550 |
1531 | | - retry_count = 0 # Number of retries (excluding transient failures)
1532 | 1551 | transient_retry_count = 0 # Number of transient retries (per top-level retry)
1533 | 1552 | inject_failures = _inject_failure_rate()
1534 | 1553 | out_lines = [] # List to contain the lines returned by _s3op_with_retries
@@ -1595,7 +1614,12 @@ def try_s3_op(last_ok_count, pending_retries, out_lines, inject_failures):
1595 | 1614 | # things, this will shrink more and more until we are doing a
1596 | 1615 | # single operation at a time. If things start going better, it
1597 | 1616 | # will increase by 20% every round.
1598 | | - max_count = min(int(last_ok_count * 1.2), len(pending_retries))
| 1617 | + #
| 1618 | + # If we made no progress (last_ok_count == 0) we retry at most
| 1619 | + # 2*S3_WORKER_COUNT from whatever is left in `pending_retries`
| 1620 | + max_count = min(
| 1621 | + int(last_ok_count * 1.2), len(pending_retries)
| 1622 | + ) or min(2 * S3_WORKER_COUNT, len(pending_retries))
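To see how this adaptive batch sizing behaves, here is a small sketch that evaluates the same expression in isolation; it assumes S3_WORKER_COUNT is 64 and 1000 inputs are still pending, purely for illustration.

    S3_WORKER_COUNT = 64  # assumed value for illustration

    def retry_batch_size(last_ok_count, pending):
        # Same expression as the new max_count above.
        return min(int(last_ok_count * 1.2), pending) or min(2 * S3_WORKER_COUNT, pending)

    print(retry_batch_size(500, 1000))  # 600: grow ~20% after good progress
    print(retry_batch_size(3, 1000))    # 3: track how much actually succeeded last round
    print(retry_batch_size(0, 1000))    # 128: no progress, fall back to 2 * S3_WORKER_COUNT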
1599 | 1623 | tmp_input.writelines(pending_retries[:max_count])
1600 | 1624 | tmp_input.flush()
1601 | 1625 | debug.s3client_exec(
@@ -1712,38 +1736,16 @@ def try_s3_op(last_ok_count, pending_retries, out_lines, inject_failures):
1712 | 1736 | _update_out_lines(out_lines, ok_lines, resize=loop_count == 0)
1713 | 1737 | return 0, 0, inject_failures, err_out
1714 | 1738 |
1715 | | - while retry_count <= S3_RETRY_COUNT:
| 1739 | + while transient_retry_count <= S3_TRANSIENT_RETRY_COUNT:
1716 | 1740 | (
1717 | 1741 | last_ok_count,
1718 | 1742 | last_retry_count,
1719 | 1743 | inject_failures,
1720 | 1744 | err_out,
1721 | 1745 | ) = try_s3_op(last_ok_count, pending_retries, out_lines, inject_failures)
1722 | | - if err_out or (
1723 | | - last_retry_count != 0
1724 | | - and (
1725 | | - last_ok_count == 0
1726 | | - or transient_retry_count > S3_TRANSIENT_RETRY_COUNT
1727 | | - )
1728 | | - ):
1729 | | - # We had a fatal failure (err_out is not None)
1730 | | - # or we made no progress (last_ok_count is 0)
1731 | | - # or we are out of transient retries
1732 | | - # so we will restart from scratch (being very conservative)
1733 | | - retry_count += 1
1734 | | - err_msg = err_out
1735 | | - if err_msg is None and last_ok_count == 0:
1736 | | - err_msg = "No progress"
1737 | | - if err_msg is None:
1738 | | - err_msg = "Too many transient errors"
1739 | | - print(
1740 | | - "S3 non-transient error (attempt #%d): %s" % (retry_count, err_msg)
1741 | | - )
1742 | | - _reset()
1743 | | - if retry_count <= S3_RETRY_COUNT:
1744 | | - self._jitter_sleep(retry_count)
1745 | | - continue
1746 | | - elif last_retry_count != 0:
| 1746 | + if err_out:
| 1747 | + break
| 1748 | + if last_retry_count != 0:
1747 | 1749 | # During our last try, we did not manage to process everything we wanted
1748 | 1750 | # due to a transient failure so we try again.
1749 | 1751 | transient_retry_count += 1
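Condensed, the new control flow looks roughly like the toy sketch below. Only the loop condition, the break on fatal errors, and the transient counter come from this hunk; try_once, the assumed S3_TRANSIENT_RETRY_COUNT value, and the early exit when nothing is left to retry are stand-ins for code not shown here (the real loop uses try_s3_op and sleeps via _jitter_sleep between rounds).

    # Self-contained toy version of the loop; all names here are stand-ins,
    # not Metaflow internals.
    S3_TRANSIENT_RETRY_COUNT = 20  # assumed value for illustration
    pending = ["s3://bucket/a", "s3://bucket/b"]

    def try_once(inputs):
        # Pretend everything succeeds on the first pass.
        return len(inputs), 0, None  # (ok_count, still_failing, fatal_error)

    transient_retry_count = 0
    while transient_retry_count <= S3_TRANSIENT_RETRY_COUNT:
        ok_count, still_failing, err_out = try_once(pending)
        if err_out:
            break                   # fatal: surface it, no full restart
        if still_failing == 0:
            break                   # everything processed
        transient_retry_count += 1  # transient: go around again with the leftovers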