
Commit c2d4944

[core] Fix race condition when canceling task that hasn't started yet (#52703)
See the linked issue for the original symptom and reproduction.

When a `CancelTask` RPC message is received, we need to handle 4 possible cases:

1. The `PushTask` RPC hasn't been received yet.
2. The `PushTask` RPC has been received but the task isn't executing yet.
3. The `PushTask` RPC has been received and the task is now executing.
4. The task finished executing and the `PushTask` RPC reply has been sent.

The code currently handles (1) and (4) by relying on client-side retries: we return `success=False` and expect the client to retry the cancellation (unless the task has already finished, as in case (4), which the client knows). However, there is a race condition between cases (2) and (3): the task is no longer considered queued in the `OutOfOrderActorSchedulingQueue`, but it hasn't actually started executing yet, so there is no future to cancel. This can happen because:

- We [erase the task ID](https://github.com/ray-project/ray/blob/master/src/ray/core_worker/transport/out_of_order_actor_scheduling_queue.cc#L240) from the pending map before actually executing the task. After this, `CancelTaskIfFound` will return false.
- We then post the work to start running the request [to the io_service_](https://github.com/ray-project/ray/blob/master/src/ray/core_worker/transport/out_of_order_actor_scheduling_queue.cc#L245).
- We post the `RunRequest` callback that eventually runs the task [to the fiber thread](https://github.com/ray-project/ray/blob/master/src/ray/core_worker/transport/out_of_order_actor_scheduling_queue.cc#L156).
- The logic to cancel the task runs on the [task_execution_service_](https://github.com/ray-project/ray/blob/master/src/ray/core_worker/core_worker.cc#L4485).

This means there is no guarantee that the task has actually started executing when we call [cancel_async_task_](https://github.com/ray-project/ray/blob/master/src/ray/core_worker/core_worker.cc#L4462).

This PR fixes the problem by extending the reliance on client retries: `cancel_async_task_` now returns a boolean indicating whether the task was cancelled. If not, it's up to the client to retry. The proper long-term fix would be to serialize executions and cancellations inside the scheduling queue / task executor, but that requires significant refactoring to simplify the concurrency model in these classes.

Closes #52628

---------

Signed-off-by: Edward Oakes <ed.nmi.oakes@gmail.com>
1 parent 0b65b4a commit c2d4944
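The retry-based protocol described in the commit message can be sketched as follows. This is a minimal illustration only: `cancel_with_retries`, `send_cancel_task_rpc`, and `task_is_finished` are hypothetical stand-ins for the `CancelTask` RPC (whose reply now reflects the boolean returned by `cancel_async_task_`) and the caller's knowledge of case (4); the real retry logic lives in the C++ core worker, not in Python.

```python
import time


def cancel_with_retries(task_id, send_cancel_task_rpc, task_is_finished,
                        max_attempts=10, backoff_s=0.1):
    """Sketch of the caller-side retry loop implied by the boolean return value.

    `send_cancel_task_rpc` and `task_is_finished` are hypothetical callables used
    only for illustration; they stand in for the CancelTask RPC and for detecting
    that the PushTask reply has already been sent (case 4).
    """
    for _ in range(max_attempts):
        if task_is_finished(task_id):
            # Case (4): the task already finished, so there is nothing to cancel.
            return True
        # Cases (1)-(3): ask the executor to cancel. It reports False when the task
        # isn't found running yet (PushTask not received, or received but the task
        # hasn't actually started executing).
        if send_cancel_task_rpc(task_id):
            return True
        time.sleep(backoff_s)  # back off and retry until the task is actually running
    return False
```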

File tree

8 files changed: +139 -191 lines changed


python/ray/_raylet.pyx

+28 -28
@@ -36,9 +36,7 @@ from typing import (
 )

 import contextvars
-import concurrent
-from concurrent.futures import ThreadPoolExecutor
-from concurrent.futures import Future as ConcurrentFuture
+import concurrent.futures

 from libc.stdint cimport (
     int32_t,
@@ -253,7 +251,7 @@ GRPC_STATUS_CODE_UNIMPLEMENTED = CGrpcStatusCode.UNIMPLEMENTED

 logger = logging.getLogger(__name__)

-# The currently executing task, if any. These are used to synchronize task
+# The currently running task, if any. These are used to synchronize task
 # interruption for ray.cancel.
 current_task_id = None
 current_task_id_lock = threading.Lock()
@@ -1108,7 +1106,7 @@ cdef store_task_errors(


 cdef class StreamingGeneratorExecutionContext:
-    """The context to execute streaming generator function.
+    """The context to run a streaming generator function.

     Make sure you always call `initialize` API before
     accessing any fields.
@@ -2550,26 +2548,25 @@ cdef void delete_spilled_objects_handler(
             job_id=None)


-cdef void cancel_async_task(
-        const CTaskID &c_task_id,
-        const CRayFunction &ray_function,
-        const c_string c_name_of_concurrency_group_to_execute) nogil:
+cdef c_bool cancel_async_actor_task(const CTaskID &c_task_id) nogil:
+    """Attempt to cancel a task running in this asyncio actor.
+
+    Returns True if the task was currently running and was cancelled, else False.
+
+    Note that the underlying asyncio task may not actually have been cancelled: it
+    could already have completed or else might not gracefully handle cancellation.
+    The return value only indicates that the task was found and cancelled.
+    """
     with gil:
-        function_descriptor = CFunctionDescriptorToPython(
-            ray_function.GetFunctionDescriptor())
-        name_of_concurrency_group_to_execute = \
-            c_name_of_concurrency_group_to_execute.decode("ascii")
         task_id = TaskID(c_task_id.Binary())
-
         worker = ray._private.worker.global_worker
-        eventloop, _ = worker.core_worker.get_event_loop(
-            function_descriptor, name_of_concurrency_group_to_execute)
-        future = worker.core_worker.get_queued_future(task_id)
-        if future is not None:
-            future.cancel()
-        # else, the task is already finished. If the task
-        # wasn't finished (task is queued on a client or server side),
-        # this method shouldn't have been called.
+        fut = worker.core_worker.get_future_for_running_task(task_id)
+        if fut is None:
+            # Either the task hasn't started executing yet or already finished.
+            return False
+
+        fut.cancel()
+        return True


 cdef void unhandled_exception_handler(const CRayObject& error) nogil:
@@ -2970,7 +2967,7 @@ cdef class CoreWorker:
         options.restore_spilled_objects = restore_spilled_objects_handler
         options.delete_spilled_objects = delete_spilled_objects_handler
         options.unhandled_exception_handler = unhandled_exception_handler
-        options.cancel_async_task = cancel_async_task
+        options.cancel_async_actor_task = cancel_async_actor_task
         options.get_lang_stack = get_py_stack
         options.is_local_mode = local_mode
         options.kill_main = kill_main_task
@@ -4461,15 +4458,15 @@ cdef class CoreWorker:
             for fd in function_descriptors:
                 self.fd_to_cgname_dict[fd] = cg_name

-    def get_event_loop_executor(self) -> ThreadPoolExecutor:
+    def get_event_loop_executor(self) -> concurrent.futures.ThreadPoolExecutor:
         if self.event_loop_executor is None:
             # NOTE: We're deliberately allocating thread-pool executor with
             #       a single thread, provided that many of its use-cases are
             #       not thread-safe yet (for ex, reporting streaming generator output)
-            self.event_loop_executor = ThreadPoolExecutor(max_workers=1)
+            self.event_loop_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
         return self.event_loop_executor

-    def reset_event_loop_executor(self, executor: ThreadPoolExecutor):
+    def reset_event_loop_executor(self, executor: concurrent.futures.ThreadPoolExecutor):
         self.event_loop_executor = executor

     def get_event_loop(self, function_descriptor, specified_cgname):
@@ -4622,8 +4619,11 @@ cdef class CoreWorker:
         return ActorID(CCoreWorkerProcess.GetCoreWorker().GetWorkerContext()
                        .GetRootDetachedActorID().Binary())

-    def get_queued_future(self, task_id: Optional[TaskID]) -> ConcurrentFuture:
-        """Get a asyncio.Future that's queued in the event loop."""
+    def get_future_for_running_task(self, task_id: Optional[TaskID]) -> Optional[concurrent.futures.Future]:
+        """Get the future corresponding to a running task (or None).
+
+        The underyling asyncio task might be queued, running, or completed.
+        """
         with self._task_id_to_future_lock:
             return self._task_id_to_future.get(task_id)

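For background on the future-cancellation semantics referenced in the new `cancel_async_actor_task` docstring above, here is a standalone sketch (not Ray code) of how cancelling the `concurrent.futures.Future` returned by `asyncio.run_coroutine_threadsafe` propagates a cancellation request to a coroutine running on a background event loop. The assumption that the futures stored in `_task_id_to_future` are of this kind is an inference made for illustration, not something stated on this page.

```python
import asyncio
import concurrent.futures
import threading
import time

# Standalone illustration: a background event loop stands in for an async actor's
# event loop.
loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()


async def long_running_task():
    await asyncio.sleep(30)


# run_coroutine_threadsafe returns a concurrent.futures.Future chained to the
# scheduled asyncio task.
fut = asyncio.run_coroutine_threadsafe(long_running_task(), loop)
time.sleep(0.1)  # give the coroutine a chance to start

# Cancelling the concurrent future requests cancellation of the asyncio task. As the
# new docstring notes, this is only a request: a coroutine that swallows
# CancelledError could still run to completion.
fut.cancel()

try:
    fut.result(timeout=5)
except concurrent.futures.CancelledError:
    print("task was cancelled before completing")

loop.call_soon_threadsafe(loop.stop)
```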
python/ray/includes/libcoreworker.pxd

+1 -5
@@ -420,11 +420,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
             const c_string&,
             const c_vector[c_string]&) nogil) run_on_util_worker_handler
         (void(const CRayObject&) nogil) unhandled_exception_handler
-        (void(
-            const CTaskID &c_task_id,
-            const CRayFunction &ray_function,
-            const c_string c_name_of_concurrency_group_to_execute
-        ) nogil) cancel_async_task
+        (c_bool(const CTaskID &c_task_id) nogil) cancel_async_actor_task
         (void(c_string *stack_out) nogil) get_lang_stack
         c_bool is_local_mode
         int num_workers

python/ray/tests/test_actor_cancel.py

+35 -92
@@ -2,6 +2,7 @@
 import os
 import sys
 import time
+import concurrent.futures
 from collections import defaultdict

 import pytest
@@ -134,70 +135,6 @@ def f():
         ray.get(ref_dep_not_resolved)


-@pytest.mark.skip(
-    reason=("The guarantee in this case is too weak now. Need more work.")
-)
-def test_in_flight_queued_requests_canceled(shutdown_only, monkeypatch):
-    """
-    When there are large input size in-flight actor tasks
-    tasks are queued inside a RPC layer (core_worker_client.h)
-    In this case, we don't cancel a request from a client side
-    but wait until it is sent to the server side and cancel it.
-    See SendRequests() inside core_worker_client.h
-    """
-    # Currently the max bytes is
-    # const int64_t kMaxBytesInFlight = 16 * 1024 * 1024.
-    # See core_worker_client.h.
-    input_arg = b"1" * 15 * 1024  # 15KB.
-    # Tasks are queued when there are more than 1024 tasks.
-    sig = SignalActor.remote()
-
-    @ray.remote
-    class Actor:
-        def __init__(self, signal_actor):
-            self.signal_actor = signal_actor
-
-        def f(self, input_arg):
-            ray.get(self.signal_actor.wait.remote())
-            return True
-
-    a = Actor.remote(sig)
-    refs = [a.f.remote(input_arg) for _ in range(5000)]
-
-    # Wait until the first task runs.
-    wait_for_condition(
-        lambda: len(list_tasks(filters=[("STATE", "=", "RUNNING")])) == 1
-    )
-
-    # Cancel all tasks.
-    for ref in refs:
-        ray.cancel(ref)
-
-    # The first ref is in progress, so we pop it out
-    first_ref = refs.pop(0)
-    ray.get(sig.send.remote())
-
-    # Make sure all tasks that are queued (including queued
-    # due to in-flight bytes) are canceled.
-    canceled = 0
-    for ref in refs:
-        try:
-            ray.get(ref)
-        except TaskCancelledError:
-            canceled += 1
-
-    # Verify at least half of tasks are canceled.
-    # Currently, the guarantee is weak because we cannot
-    # detect queued tasks due to inflight bytes limit.
-    # TODO(sang): Move the in flight bytes logic into
-    # actor submission queue instead of doing it inside
-    # core worker client.
-    assert canceled > 2500
-
-    # first ref shouldn't have been canceled.
-    assert ray.get(first_ref)
-
-
 def test_async_actor_server_side_cancel(shutdown_only):
     """
     Test Cancelation when a task is queued on a server side.
@@ -324,34 +261,6 @@ def f(refs):
     ray.get(sleep_ref)


-@pytest.mark.skip(reason=("Currently not passing. There's one edge case to fix."))
-def test_cancel_stress(shutdown_only):
-    ray.init()
-
-    @ray.remote
-    class Actor:
-        async def sleep(self):
-            await asyncio.sleep(1000)
-
-    actors = [Actor.remote() for _ in range(30)]
-
-    refs = []
-    for _ in range(20):
-        for actor in actors:
-            for i in range(100):
-                ref = actor.sleep.remote()
-                refs.append(ref)
-                if i % 2 == 0:
-                    ray.cancel(ref)
-
-    for ref in refs:
-        ray.cancel(ref)
-
-    for ref in refs:
-        with pytest.raises((ray.exceptions.TaskCancelledError, TaskCancelledError)):
-            ray.get(ref)
-
-
 def test_cancel_recursive_tree(shutdown_only):
     """Verify recursive cancel works for tree-nested tasks.

@@ -529,6 +438,40 @@ def get_child_ref(self):
         ray.get(ref)


+def test_concurrent_submission_and_cancellation(shutdown_only):
+    """Test submitting and then cancelling many tasks concurrently.
+
+    This is a regression test for race conditions such as:
+    https://github.com/ray-project/ray/issues/52628.
+    """
+    NUM_TASKS = 2500
+
+    @ray.remote(num_cpus=0)
+    class Worker:
+        async def sleep(self, i: int):
+            # NOTE: all tasks should be cancelled, so this won't actually sleep for the
+            # full duration if the test is passing.
+            await asyncio.sleep(30)
+
+    worker = Worker.remote()
+
+    # Submit many tasks in parallel to cause queueing on the caller and receiver.
+    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_TASKS) as executor:
+        futures = [executor.submit(worker.sleep.remote, i) for i in range(NUM_TASKS)]
+        refs = [f.result() for f in concurrent.futures.as_completed(futures)]
+
+    # Cancel the tasks in reverse order of submission.
+    for ref in reversed(refs):
+        ray.cancel(ref)
+
+    # Check that all tasks were successfully cancelled (none ran to completion).
+    for ref in refs:
+        with pytest.raises(ray.exceptions.TaskCancelledError):
+            ray.get(ref)
+
+    print(f"All {NUM_TASKS} tasks were cancelled successfully.")
+
+
 if __name__ == "__main__":
     if os.environ.get("PARALLEL_CI"):
         sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__]))
