Commit 98d217e

[core][cgraph] Fix illegal memory access of cgraph when used in PP (#51734)
This fixes the illegal memory access bug reported in #51596. The culprit PR is #51305.

This PR is effectively a manual revert of #51305: a plain `git revert` of that commit is not straightforward because CI now enforces that there are no core->air dependencies (see #51699). It restores the original logic from before #51305, copying code where needed. The goal is to mitigate the bug first; a follow-up will simplify and tailor the logic of `utils.get_cuda_devices()`, which can be tricky.

## Related issue number

Closes #51596

---------

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
1 parent 24ad12d commit 98d217e
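For context, the change at each call site is mechanical: code that asked for a single default device via `get_default_torch_device(...)` now takes the first entry of `get_devices()`. A minimal sketch of the new usage pattern (ExampleWorker is a hypothetical actor, not part of this commit):

import ray
from ray.experimental.channel.utils import get_devices


@ray.remote(num_gpus=1)
class ExampleWorker:
    def __init__(self):
        # First device that Ray allocated to this worker; get_devices()
        # falls back to a single CPU device when no GPUs are assigned.
        self.device = get_devices()[0]

    def device_str(self) -> str:
        return str(self.device)


# e.g. ray.get(ExampleWorker.remote().device_str.remote()) -> "cuda:0"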

4 files changed: +67 -22 lines

python/ray/dag/tests/experimental/test_torch_tensor_dag.py

Lines changed: 5 additions & 5 deletions
@@ -18,7 +18,7 @@
     Communicator,
     TorchTensorAllocator,
 )
-from ray.experimental.channel.utils import get_default_torch_device
+from ray.experimental.channel.utils import get_devices
 from ray.experimental.channel.torch_tensor_type import TorchTensorType
 from ray.experimental.channel.nccl_group import _NcclGroup
 from ray._private.test_utils import (
@@ -40,7 +40,7 @@
 @ray.remote
 class TorchTensorWorker:
     def __init__(self):
-        self.device = get_default_torch_device(allow_cpu=True)
+        self.device = get_devices()[0]

     def init_distributed(self, world_size, rank):
         torch.distributed.init_process_group(
@@ -569,7 +569,7 @@ def initialize(self, rank: int) -> None:
             rank == expected_rank
         ), f"NCCL actor's rank {rank} does not match expected rank {expected_rank}"
         self._rank = rank
-        self._device = get_default_torch_device(allow_cpu=True)
+        self._device = get_devices()[0]

     def get_rank(self, actor: ray.actor.ActorHandle) -> int:
         actor_ids = [a._ray_actor_id for a in self._actor_handles]
@@ -703,7 +703,7 @@ def initialize(self, rank: int) -> None:
             rank == expected_rank
         ), f"NCCL actor's rank {rank} does not match expected rank {expected_rank}"
         self._rank = rank
-        self._device = get_default_torch_device(allow_cpu=True)
+        self._device = get_devices()[0]

     def get_rank(self, actor: ray.actor.ActorHandle) -> int:
         actor_ids = [a._ray_actor_id for a in self._actor_handles]
@@ -850,7 +850,7 @@ def initialize(self, rank: int) -> None:
             rank == expected_rank
         ), f"NCCL actor's rank {rank} does not match expected rank {expected_rank}"
         self._rank = rank
-        self._device = get_default_torch_device(allow_cpu=True)
+        self._device = get_devices()[0]

     def get_rank(self, actor: ray.actor.ActorHandle) -> int:
         actor_ids = [a._ray_actor_id for a in self._actor_handles]

python/ray/experimental/channel/common.py

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@
 import ray
 import ray.exceptions
 from ray.experimental.channel.communicator import Communicator
-from ray.experimental.channel.utils import get_default_torch_device
+from ray.experimental.channel.utils import get_devices
 from ray.experimental.channel.serialization_context import _SerializationContext
 from ray.util.annotations import DeveloperAPI, PublicAPI

@@ -163,7 +163,7 @@ def torch_available(self) -> bool:
     @property
     def torch_device(self) -> "torch.device":
         if self._torch_device is None:
-            self._torch_device = get_default_torch_device(allow_cpu=True)
+            self._torch_device = get_devices()[0]

         return self._torch_device

python/ray/experimental/channel/nccl_group.py

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@
 from ray.exceptions import RayChannelError
 from ray.experimental.channel.communicator import Communicator, TorchTensorAllocator
 from ray.experimental.util.types import ReduceOp
-from ray.experimental.channel.utils import get_default_torch_device
+from ray.experimental.channel.utils import get_devices

 if TYPE_CHECKING:
     import cupy as cp
@@ -102,7 +102,7 @@ def __init__(
         import cupy as cp

         # TODO(swang): Allow default device to be overridden.
-        device = get_default_torch_device(allow_cpu=False)
+        device = get_devices()[0]
         self._cuda_stream = cp.cuda.ExternalStream(
             cuda_stream, device_id=device.index
         )
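The wrapped stream must live on the device Ray allocated to the worker, which is why `device_id=device.index` is passed to the ExternalStream above. A minimal sketch of the same wrapping, assuming a GPU is assigned to the current actor; here the raw stream pointer is taken from torch's current stream purely for illustration, whereas the real `_NcclGroup` receives `cuda_stream` from its caller:

import cupy as cp
import torch

from ray.experimental.channel.utils import get_devices

device = get_devices()[0]  # e.g. torch.device("cuda:0") inside a GPU actor
raw_stream_ptr = torch.cuda.current_stream(device).cuda_stream  # raw cudaStream_t as an int
ext_stream = cp.cuda.ExternalStream(raw_stream_ptr, device_id=device.index)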

python/ray/experimental/channel/utils.py

Lines changed: 58 additions & 13 deletions
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING, List, Optional, Tuple

 import ray
+import os

 if TYPE_CHECKING:
     import torch
@@ -95,22 +96,66 @@ def get_actor_node(actor: Optional["ray.actor.ActorHandle"]) -> str:
     )


-def get_default_torch_device(*, allow_cpu: bool) -> "torch.device":
-    """Get the default torch device inside this actor or driver.
+def get_cuda_devices() -> List["torch.device"]:
+    """Gets the correct torch cuda device list configured for this process.

-    If any GPUs are available, the default device will be cuda:0 and we will rely on
-    torch to handle mapping CUDA_VISIBLE_DEVICES to a physical device.
-
-    If no GPUs are available, a CPU device will be returned if allow_cpu is true, else
-    the function will raise a RuntimeError.
+    Assumes that `CUDA_VISIBLE_DEVICES` is set and is a
+    superset of the `ray.get_gpu_ids()`.
     """
+    # Note: currently this method replicates the logic from
+    # `CUDATorchDeviceManager.get_devices()`.
+    # TODO(rui): tailor and clean up the logic for proper use in
+    # Compiled Graphs.
     import torch

-    accelerator_ids = ray.get_runtime_context().get_accelerator_ids()
-    if not accelerator_ids.get("GPU", []):
-        if allow_cpu:
-            return torch.device("cpu")
+    # GPU IDs are assigned by Ray after you specify "use_gpu"
+    # GPU `ray.get_gpu_ids()` may return ints or may return strings.
+    # We should always convert to strings.
+    gpu_ids = [str(id) for id in ray.get_gpu_ids()]
+
+    device_ids = []
+
+    if len(gpu_ids) > 0:
+        cuda_visible_str = os.environ.get("CUDA_VISIBLE_DEVICES", "")
+        if cuda_visible_str and cuda_visible_str != "NoDevFiles":
+            cuda_visible_list = cuda_visible_str.split(",")
         else:
-            raise RuntimeError("No CUDA device available.")
+            cuda_visible_list = []
+
+        # By default, there should only be one GPU ID if `use_gpu=True`.
+        # If there are multiple GPUs, return a list of devices.
+        # If using fractional GPUs, these IDs are not guaranteed
+        # to be unique across different processes.
+        for gpu_id in gpu_ids:
+            try:
+                device_ids.append(cuda_visible_list.index(gpu_id))
+            except IndexError:
+                raise RuntimeError(
+                    "CUDA_VISIBLE_DEVICES set incorrectly. "
+                    f"Got {cuda_visible_str}, expected to include {gpu_id}. "
+                    "Did you override the `CUDA_VISIBLE_DEVICES` environment"
+                    " variable? If not, please help file an issue on Github."
+                )
+
+    else:
+        # If called on the driver or outside of Ray Train, return the
+        # 0th device.
+        device_ids.append(0)
+
+    return [torch.device(f"cuda:{device_id}") for device_id in device_ids]
+

-    return torch.device("cuda:0")
+def get_devices() -> List["torch.device"]:
+    """Gets the correct torch device list configured for this process.
+
+    Returns a list of torch devices allocated for the current worker.
+    If no devices are assigned, then it returns a list with a single CPU device.
+    """
+
+    import torch
+
+    gpu_ids = [str(id) for id in ray.get_gpu_ids()]
+    if len(gpu_ids) > 0:
+        return get_cuda_devices()
+    else:
+        return [torch.device("cpu")]
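The part of `get_cuda_devices()` that is easy to get wrong is the index lookup: the GPU IDs returned by `ray.get_gpu_ids()` are looked up as values inside `CUDA_VISIBLE_DEVICES`, and their positions become the local torch device indices. A standalone illustration of that mapping (plain Python, hypothetical helper name, no Ray calls):

def local_cuda_indices(gpu_ids, cuda_visible_devices):
    # Mirror the lookup in get_cuda_devices(): each assigned GPU ID must
    # appear in CUDA_VISIBLE_DEVICES, and its position is the local index.
    visible = cuda_visible_devices.split(",") if cuda_visible_devices else []
    return [visible.index(str(gpu_id)) for gpu_id in gpu_ids]


# A worker assigned physical GPU 3 on a node where CUDA_VISIBLE_DEVICES="2,3"
# should address it as cuda:1 locally ...
assert local_cuda_indices([3], "2,3") == [1]
# ... while with CUDA_VISIBLE_DEVICES="0,1,2,3" the same GPU is cuda:3.
assert local_cuda_indices([3], "0,1,2,3") == [3]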
