[core][autoscaler][v1] prune IPs from the LoadMetrics after terminating nodes (#52409)

rueian · web-flow · commit 873e69317b8f · 2025-04-23T11:06:05.000-07:00
Signed-off-by: Rueian <rueiancsie@gmail.com>
diff --git a/python/ray/autoscaler/_private/autoscaler.py b/python/ray/autoscaler/_private/autoscaler.py
@@ -403,18 +403,6 @@ def _update(self):
         # This will accumulate the nodes we need to terminate.
         self.nodes_to_terminate = []
 
-        # Update running nodes gauge
-        num_workers = len(self.non_terminated_nodes.worker_ids)
-        self.prom_metrics.running_workers.set(num_workers)
-
-        # Remove from LoadMetrics the ips unknown to the NodeProvider.
-        self.load_metrics.prune_active_ips(
-            active_ips=[
-                self.provider.internal_ip(node_id)
-                for node_id in self.non_terminated_nodes.all_node_ids
-            ]
-        )
-
         # Update status strings
         if AUTOSCALER_STATUS_LOG:
             logger.info(self.info_string())
@@ -437,6 +425,18 @@ def _update(self):
                     self.attempt_to_recover_unhealthy_nodes(now)
                 self.set_prometheus_updater_data()
 
+        # Update running nodes gauge
+        num_workers = len(self.non_terminated_nodes.worker_ids)
+        self.prom_metrics.running_workers.set(num_workers)
+
+        # Remove IPs from LoadMetrics that are not known to the NodeProvider.
+        self.load_metrics.prune_active_ips(
+            active_ips=[
+                self.provider.internal_ip(node_id)
+                for node_id in self.non_terminated_nodes.all_node_ids
+            ]
+        )
+
         # Dict[NodeType, int], List[ResourceDict]
         to_launch, unfulfilled = self.resource_demand_scheduler.get_nodes_to_launch(
             self.non_terminated_nodes.all_node_ids,
diff --git a/python/ray/tests/test_autoscaler.py b/python/ray/tests/test_autoscaler.py
@@ -3620,7 +3620,9 @@ def testScaleDownIdleTimeOut(self):
         worker_ip = self.provider.non_terminated_node_ips(WORKER_FILTER)[0]
         # Mark the node as idle
         lm.update(worker_ip, mock_raylet_id(), {"CPU": 1}, {"CPU": 1}, 20)
+        assert lm.is_active(worker_ip)
         autoscaler.update()
+        assert not lm.is_active(worker_ip)
         assert self.provider.internal_ip("1") == worker_ip
         events = autoscaler.event_summarizer.summary()
         assert "Removing 1 nodes of type worker (idle)." in events, events