[serve] Remove RAY_SERVE_ENABLE_QUEUE_LENGTH_CACHE flag (#51649)

akyang-anyscale · web-flow · commit 925b25c885ec · 2025-03-24T16:55:18.000-07:00
## Why are these changes needed?

This PR removes the `RAY_SERVE_ENABLE_QUEUE_LENGTH_CACHE` feature flag, which controls whether the proxy should use cached replica queue length values for routing. The feature flag was [introduced](#42943) over a year ago as a way for users to quickly switch back to the previous implementation. It has been enabled by default for [over a year](#43169) now and works as expected, so let's remove it.

Consequently, this PR also removes `RAY_SERVE_ENABLE_STRICT_MAX_ONGOING_REQUESTS`, because that flag was always enabled whenever `RAY_SERVE_ENABLE_QUEUE_LENGTH_CACHE` was enabled.

Signed-off-by: akyang-anyscale <alexyang@anyscale.com>
diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
@@ -328,19 +328,6 @@ def parse_latency_buckets(bucket_str: str, default_buckets: list) -> list:
     os.environ.get("RAY_SERVE_MAX_QUEUE_LENGTH_RESPONSE_DEADLINE_S", 1.0)
 )
 
-# Feature flag for caching queue lengths for faster routing in each handle.
-RAY_SERVE_ENABLE_QUEUE_LENGTH_CACHE = (
-    os.environ.get("RAY_SERVE_ENABLE_QUEUE_LENGTH_CACHE", "1") == "1"
-)
-
-# Feature flag for strictly enforcing max_ongoing_requests (replicas will reject
-# requests).
-RAY_SERVE_ENABLE_STRICT_MAX_ONGOING_REQUESTS = (
-    os.environ.get("RAY_SERVE_ENABLE_STRICT_MAX_ONGOING_REQUESTS", "0") == "1"
-    # Strict enforcement path must be enabled for the queue length cache.
-    or RAY_SERVE_ENABLE_QUEUE_LENGTH_CACHE
-)
-
 # Length of time to respect entries in the queue length cache when scheduling requests.
 RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S = float(
     os.environ.get("RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S", 10.0)
diff --git a/python/ray/serve/_private/default_impl.py b/python/ray/serve/_private/default_impl.py
@@ -15,8 +15,6 @@
     RequestProtocol,
 )
 from ray.serve._private.constants import (
-    RAY_SERVE_ENABLE_QUEUE_LENGTH_CACHE,
-    RAY_SERVE_ENABLE_STRICT_MAX_ONGOING_REQUESTS,
     RAY_SERVE_PROXY_PREFER_LOCAL_AZ_ROUTING,
     RAY_SERVE_PROXY_PREFER_LOCAL_NODE_ROUTING,
 )
@@ -163,9 +161,7 @@ def create_router(
         else None,
         availability_zone,
         # Streaming ObjectRefGenerators are not supported in Ray Client
-        use_replica_queue_len_cache=(
-            not is_inside_ray_client_context and RAY_SERVE_ENABLE_QUEUE_LENGTH_CACHE
-        ),
+        use_replica_queue_len_cache=not is_inside_ray_client_context,
         create_replica_wrapper_func=lambda r: RunningReplica(r),
     )
 
@@ -177,10 +173,7 @@ def create_router(
         handle_source=handle_options._source,
         replica_scheduler=replica_scheduler,
         # Streaming ObjectRefGenerators are not supported in Ray Client
-        enable_strict_max_ongoing_requests=(
-            not is_inside_ray_client_context
-            and RAY_SERVE_ENABLE_STRICT_MAX_ONGOING_REQUESTS
-        ),
+        enable_strict_max_ongoing_requests=not is_inside_ray_client_context,
         resolve_request_arg_func=resolve_deployment_response,
     )
 
diff --git a/python/ray/serve/tests/BUILD b/python/ray/serve/tests/BUILD
@@ -301,54 +301,6 @@ py_test(
     ],
 )
 
-# Tests disabling queue len caching feature flag.
-# TODO(edoakes): remove this after the FF is removed.
-py_test_module_list(
-    size = "medium",
-    data = glob(["test_config_files/**/*"]),
-    env = {"RAY_SERVE_ENABLE_QUEUE_LENGTH_CACHE": "0"},
-    files = [
-        "test_handle_1.py",
-        "test_handle_2.py",
-        "test_handle_cancellation.py",
-        "test_handle_streaming.py",
-        "test_http_cancellation.py",
-        "test_multiplex.py",
-        "test_request_timeout.py",
-    ],
-    name_suffix = "_with_queue_len_cache_disabled",
-    tags = [
-        "exclusive",
-        "no_windows",
-        "team:serve",
-    ],
-    deps = [
-        ":common",
-        ":conftest",
-        "//python/ray/serve:serve_lib",
-    ],
-)
-
-# Tests autoscaling when queue len caching is disabled.
-py_test_module_list(
-    size = "large",
-    env = {"RAY_SERVE_ENABLE_QUEUE_LENGTH_CACHE": "0"},
-    files = [
-        "test_autoscaling_policy.py",
-    ],
-    name_suffix = "_with_queue_len_cache_disabled",
-    tags = [
-        "exclusive",
-        "no_windows",
-        "team:serve",
-    ],
-    deps = [
-        ":common",
-        ":conftest",
-        "//python/ray/serve:serve_lib",
-    ],
-)
-
 # Test old stop-fully-then-start behavior.
 # TODO(zcin): remove this after the old behavior is completely removed
 py_test_module_list(
diff --git a/python/ray/serve/tests/test_handle_2.py b/python/ray/serve/tests/test_handle_2.py
@@ -9,7 +9,6 @@
 from ray._private.test_utils import SignalActor, async_wait_for_condition
 from ray._common.utils import get_or_create_event_loop
 from ray.serve._private.constants import (
-    RAY_SERVE_ENABLE_STRICT_MAX_ONGOING_REQUESTS,
     RAY_SERVE_FORCE_LOCAL_TESTING_MODE,
 )
 from ray.serve.exceptions import RayServeException
@@ -412,10 +411,6 @@ async def __call__(self):
     RAY_SERVE_FORCE_LOCAL_TESTING_MODE,
     reason="local_testing_mode doesn't respect max_ongoing_requests",
 )
-@pytest.mark.skipif(
-    not RAY_SERVE_ENABLE_STRICT_MAX_ONGOING_REQUESTS,
-    reason="Strict enforcement must be enabled",
-)
 @pytest.mark.asyncio
 async def test_max_ongoing_requests_enforced(serve_instance):
     """Handles should respect max_ongoing_requests enforcement."""