Skip to content

Commit 873e693

Browse files
authored
[core][autoscaler][v1] prune IPs from the LoadMetrics after terminating nodes (#52409)
Signed-off-by: Rueian <rueiancsie@gmail.com>
1 parent c872a52 commit 873e693

File tree

2 files changed

+14
-12
lines changed

2 files changed

+14
-12
lines changed

python/ray/autoscaler/_private/autoscaler.py

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -403,18 +403,6 @@ def _update(self):
403403
# This will accumulate the nodes we need to terminate.
404404
self.nodes_to_terminate = []
405405

406-
# Update running nodes gauge
407-
num_workers = len(self.non_terminated_nodes.worker_ids)
408-
self.prom_metrics.running_workers.set(num_workers)
409-
410-
# Remove from LoadMetrics the ips unknown to the NodeProvider.
411-
self.load_metrics.prune_active_ips(
412-
active_ips=[
413-
self.provider.internal_ip(node_id)
414-
for node_id in self.non_terminated_nodes.all_node_ids
415-
]
416-
)
417-
418406
# Update status strings
419407
if AUTOSCALER_STATUS_LOG:
420408
logger.info(self.info_string())
@@ -437,6 +425,18 @@ def _update(self):
437425
self.attempt_to_recover_unhealthy_nodes(now)
438426
self.set_prometheus_updater_data()
439427

428+
# Update running nodes gauge
429+
num_workers = len(self.non_terminated_nodes.worker_ids)
430+
self.prom_metrics.running_workers.set(num_workers)
431+
432+
# Remove IPs from LoadMetrics that are not known to the NodeProvider.
433+
self.load_metrics.prune_active_ips(
434+
active_ips=[
435+
self.provider.internal_ip(node_id)
436+
for node_id in self.non_terminated_nodes.all_node_ids
437+
]
438+
)
439+
440440
# Dict[NodeType, int], List[ResourceDict]
441441
to_launch, unfulfilled = self.resource_demand_scheduler.get_nodes_to_launch(
442442
self.non_terminated_nodes.all_node_ids,

python/ray/tests/test_autoscaler.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3620,7 +3620,9 @@ def testScaleDownIdleTimeOut(self):
36203620
worker_ip = self.provider.non_terminated_node_ips(WORKER_FILTER)[0]
36213621
# Mark the node as idle
36223622
lm.update(worker_ip, mock_raylet_id(), {"CPU": 1}, {"CPU": 1}, 20)
3623+
assert lm.is_active(worker_ip)
36233624
autoscaler.update()
3625+
assert not lm.is_active(worker_ip)
36243626
assert self.provider.internal_ip("1") == worker_ip
36253627
events = autoscaler.event_summarizer.summary()
36263628
assert "Removing 1 nodes of type worker (idle)." in events, events

0 commit comments

Comments
 (0)