@@ -1270,8 +1270,14 @@ def __init__(
1270
1270
self ._target_state : DeploymentTargetState = DeploymentTargetState .default ()
1271
1271
1272
1272
self ._prev_startup_warning : float = time .time ()
1273
- self ._replica_constructor_retry_counter : int = 0
1274
1273
self ._replica_constructor_error_msg : Optional [str ] = None
1274
+ # Counter for how many times replicas failed to start. This is reset to 0 when:
1275
+ # (1) The deployment is deployed / re-deployed.
1276
+ # (2) The deployment reaches the HEALTHY state.
1277
+ self ._replica_constructor_retry_counter : int = 0
1278
+ # Flag for whether any replica of the target version has successfully started.
1279
+ # This is reset to False when the deployment is re-deployed.
1280
+ self ._replica_has_started : bool = False
1275
1281
1276
1282
self ._replicas : ReplicaStateContainer = ReplicaStateContainer ()
1277
1283
self ._curr_status_info : DeploymentStatusInfo = DeploymentStatusInfo (
@@ -1393,10 +1399,24 @@ def _failed_to_start_threshold(self) -> int:
1393
1399
self ._target_state .target_num_replicas * MAX_PER_REPLICA_RETRY_COUNT ,
1394
1400
)
1395
1401
1396
- @property
1397
- def is_failed (self ) -> bool :
1398
- """Whether the deployment failed to deploy."""
1399
- return self ._curr_status_info .status == DeploymentStatus .DEPLOY_FAILED
1402
+ def _replica_startup_failing (self ) -> bool :
1403
+ """Check whether the deployment has a positive target replica count
1404
+ and the startup-failure counter has reached or exceeded the threshold.
1405
+ """
1406
+ return (
1407
+ self ._target_state .target_num_replicas > 0
1408
+ and self ._replica_constructor_retry_counter
1409
+ >= self ._failed_to_start_threshold
1410
+ )
1411
+
1412
+ def _terminally_failed (self ) -> bool :
1413
+ """Check whether the current version is terminally errored.
1414
+
1415
+ The version is considered terminally errored if the number of
1416
+ replica failures has reached the threshold, and no replica of the
1417
+ target version has successfully started.
1418
+ """
1419
+ return not self ._replica_has_started and self ._replica_startup_failing ()
1400
1420
1401
1421
def get_alive_replica_actor_ids (self ) -> Set [str ]:
1402
1422
return {replica .actor_id for replica in self ._replicas .get ()}
@@ -1454,7 +1474,7 @@ def broadcast_running_replicas_if_changed(self) -> None:
1454
1474
multiplexed model IDs.
1455
1475
"""
1456
1476
running_replica_infos = self .get_running_replica_infos ()
1457
- is_available = not self .is_failed
1477
+ is_available = not self ._terminally_failed ()
1458
1478
1459
1479
running_replicas_changed = (
1460
1480
set (self ._last_broadcasted_running_replica_infos )
@@ -1637,6 +1657,7 @@ def deploy(self, deployment_info: DeploymentInfo) -> bool:
1637
1657
f"(initial target replicas: { target_num_replicas } )."
1638
1658
)
1639
1659
self ._replica_constructor_retry_counter = 0
1660
+ self ._replica_has_started = False
1640
1661
return True
1641
1662
1642
1663
def autoscale (self ) -> int :
@@ -1860,11 +1881,7 @@ def scale_deployment_replicas(
1860
1881
stopping_replicas = self ._replicas .count (states = [ReplicaState .STOPPING ])
1861
1882
to_add = max (delta_replicas - stopping_replicas , 0 )
1862
1883
1863
- if (
1864
- to_add > 0
1865
- and self ._replica_constructor_retry_counter
1866
- < self ._failed_to_start_threshold
1867
- ):
1884
+ if to_add > 0 and not self ._terminally_failed ():
1868
1885
logger .info (f"Adding { to_add } replica{ 's' * (to_add > 1 )} to { self ._id } ." )
1869
1886
for _ in range (to_add ):
1870
1887
replica_id = ReplicaID (get_random_string (), deployment_id = self ._id )
@@ -1913,34 +1930,29 @@ def check_curr_status(self) -> Tuple[bool, bool]:
1913
1930
states = [ReplicaState .RUNNING ], version = target_version
1914
1931
)
1915
1932
1916
- failed_to_start_count = self ._replica_constructor_retry_counter
1917
-
1918
1933
# Got to make a call to complete current deploy() goal after
1919
1934
# start failure threshold reached, while we might still have
1920
1935
# pending replicas in current goal.
1921
- if (
1922
- failed_to_start_count >= self ._failed_to_start_threshold
1923
- and self ._failed_to_start_threshold != 0
1924
- ):
1925
- if running_at_target_version_replica_cnt > 0 :
1926
- # At least one RUNNING replica at target state, partial
1927
- # success; We can stop tracking constructor failures and
1928
- # leave it to the controller to fully scale to target
1929
- # number of replicas and only return as completed once
1930
- # reached target replica count
1931
- self ._replica_constructor_retry_counter = - 1
1932
- else :
1933
- self ._curr_status_info = self ._curr_status_info .handle_transition (
1934
- trigger = DeploymentStatusInternalTrigger .REPLICA_STARTUP_FAILED ,
1935
- message = (
1936
- f"The deployment failed to start { failed_to_start_count } times "
1937
- "in a row. This may be due to a problem with its "
1938
- "constructor or initial health check failing. See "
1939
- "controller logs for details. Error:\n "
1940
- f"{ self ._replica_constructor_error_msg } "
1941
- ),
1942
- )
1943
- return False , any_replicas_recovering
1936
+ if running_at_target_version_replica_cnt > 0 :
1937
+ # At least one RUNNING replica at the target version: partial
1938
+ # success. Record that this version has started, so later
1939
+ # startup failures are not treated as terminal, and leave it
1940
+ # to the controller to scale up; the goal only completes once
1941
+ # the target replica count is reached.
1942
+ self ._replica_has_started = True
1943
+ elif self ._replica_startup_failing ():
1944
+ self ._curr_status_info = self ._curr_status_info .handle_transition (
1945
+ trigger = DeploymentStatusInternalTrigger .REPLICA_STARTUP_FAILED ,
1946
+ message = (
1947
+ "The deployment failed to start "
1948
+ f"{ self ._replica_constructor_retry_counter } times "
1949
+ "in a row. This may be due to a problem with its "
1950
+ "constructor or initial health check failing. See "
1951
+ "controller logs for details. Error:\n "
1952
+ f"{ self ._replica_constructor_error_msg } "
1953
+ ),
1954
+ )
1955
+ return False , any_replicas_recovering
1944
1956
1945
1957
# If we have pending ops, the current goal is *not* ready.
1946
1958
if (
@@ -1966,6 +1978,7 @@ def check_curr_status(self) -> Tuple[bool, bool]:
1966
1978
self ._curr_status_info = self ._curr_status_info .handle_transition (
1967
1979
trigger = DeploymentStatusInternalTrigger .HEALTHY
1968
1980
)
1981
+ self ._replica_constructor_retry_counter = 0
1969
1982
return False , any_replicas_recovering
1970
1983
1971
1984
return False , any_replicas_recovering
@@ -2033,25 +2046,31 @@ def _check_startup_replicas(
2033
2046
def record_replica_startup_failure (self , error_msg : str ):
2034
2047
"""Record that a replica failed to start."""
2035
2048
2036
- if self ._replica_constructor_retry_counter >= 0 :
2037
- # Increase startup failure counter if we're tracking it
2038
- self ._replica_constructor_retry_counter += 1
2039
- self ._replica_constructor_error_msg = error_msg
2040
-
2041
- retrying_msg = "Retrying"
2042
- if self ._failed_to_start_threshold != 0 :
2043
- remaining_retries = max (
2044
- self ._failed_to_start_threshold
2045
- - self ._replica_constructor_retry_counter ,
2046
- 0 ,
2047
- )
2048
- retrying_msg += f" { remaining_retries } more time(s)"
2049
+ # There is no need to record replica failures if the target is 0.
2050
+ if self ._target_state .target_num_replicas == 0 :
2051
+ return
2049
2052
2050
- message = (
2051
- f"A replica failed to start with exception. { retrying_msg } . Error:\n "
2052
- f"{ error_msg } "
2053
+ # Increase startup failure counter
2054
+ self ._replica_constructor_retry_counter += 1
2055
+ self ._replica_constructor_error_msg = error_msg
2056
+
2057
+ # The deployment message is refreshed on every recorded failure, but
2058
+ # the remaining-retry count is included only while no replica of
2059
+ # this version has started successfully yet.
2060
+ retrying_msg = ""
2061
+ if not self ._replica_has_started :
2062
+ remaining_retries = max (
2063
+ self ._failed_to_start_threshold
2064
+ - self ._replica_constructor_retry_counter ,
2065
+ 0 ,
2053
2066
)
2054
- self ._curr_status_info = self ._curr_status_info .update_message (message )
2067
+ retrying_msg = f" { remaining_retries } more time(s)"
2068
+
2069
+ message = (
2070
+ f"A replica failed to start with exception. Retrying{ retrying_msg } . "
2071
+ f"Error:\n { error_msg } "
2072
+ )
2073
+ self ._curr_status_info = self ._curr_status_info .update_message (message )
2055
2074
2056
2075
def stop_replicas (self , replicas_to_stop ) -> None :
2057
2076
for replica in self ._replicas .pop ():
0 commit comments