Skip to content

Commit ffedb2c

Browse files
authored
[core] Further deflake test_gcs_fault_tolerance.py (#52764)
Recent flake: https://buildkite.com/ray-project/postmerge/builds/9908#01969376-c292-484d-b9d4-abca4fbbd35d/180-1552

Attempting to fix by using `wait_for_condition` instead of an immediate assertion. If this doesn't work, it might reveal an actual bug in the autoscaler (there's no reason the nodes should be marked as `RUNNING` in this case).

---------

Signed-off-by: Edward Oakes <ed.nmi.oakes@gmail.com>
1 parent 64de3e4 commit ffedb2c

File tree

1 file changed

+5
-6
lines changed

1 file changed

+5
-6
lines changed

python/ray/tests/test_gcs_fault_tolerance.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import pytest
1313

1414
import ray
15+
from ray.autoscaler.v2.sdk import get_cluster_status
1516
from ray.util.placement_group import placement_group
1617
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
1718
import ray._private.gcs_utils as gcs_utils
@@ -150,15 +151,13 @@ def test_autoscaler_init(
150151
assert len(nodes) == 2
151152
assert nodes[0]["alive"] and nodes[1]["alive"]
152153

153-
cluster_kill_gcs_wait(cluster)
154-
155154
# Restart gcs server process.
155+
cluster_kill_gcs_wait(cluster)
156156
cluster.head_node.start_gcs_server()
157157

158-
from ray.autoscaler.v2.sdk import get_cluster_status
159-
160-
status = get_cluster_status(ray.get_runtime_context().gcs_address)
161-
assert len(status.idle_nodes) == 2
158+
# Fetch the cluster status from the autoscaler and check that it works.
159+
status = get_cluster_status(cluster.address)
160+
wait_for_condition(lambda: len(status.idle_nodes) == 2)
162161

163162

164163
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)