Skip to content

Commit 19aa2d1

Browse files
authored
[BUG] Fix the long-standing ValueError during GPU/CUDA initialization (#3337)
* Use the xorbits PR to fix the GPU init error; see xorbitsai/xorbits#242
* Use `ray.get_runtime_context().get_task_id()` instead of the `task_id` attribute
1 parent ae8917f commit 19aa2d1

File tree

2 files changed: +7 -7 lines changed

2 files changed: +7 -7 lines changed

mars/lib/nvutils.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -386,21 +386,23 @@ def get_device_info(dev_index: int) -> _cu_device_info:
386386
except KeyError:
387387
pass
388388

389+
_init_nvml()
390+
if _nvml_lib is None:
391+
return None
389392
_init()
390393
if _init_pid is None:
391394
return None
392395

393396
device = c_int()
394397
name_buf = create_string_buffer(100)
395-
uuid_t = _CUuuid_t()
396398
cc_major = c_int()
397399
cc_minor = c_int()
398400
cores = c_int()
399401
threads_per_core = c_int()
400402

403+
uuid_b = get_index_and_uuid(dev_index).uuid
401404
_cu_check_error(_cuda_lib.cuDeviceGet(byref(device), c_int(dev_index)))
402405
_cu_check_error(_cuda_lib.cuDeviceGetName(name_buf, len(name_buf), device))
403-
_cu_check_error(_cuda_lib.cuDeviceGetUuid(byref(uuid_t), device))
404406
_cu_check_error(
405407
_cuda_lib.cuDeviceComputeCapability(byref(cc_major), byref(cc_minor), device)
406408
)
@@ -426,7 +428,7 @@ def get_device_info(dev_index: int) -> _cu_device_info:
426428

427429
info = _device_infos[dev_index] = _cu_device_info(
428430
index=real_dev_index,
429-
uuid=uuid.UUID(bytes=uuid_t.bytes),
431+
uuid=uuid_b,
430432
name=name_buf.value.decode(),
431433
multiprocessors=cores.value,
432434
cuda_cores=cores.value
@@ -448,11 +450,9 @@ def get_device_status(dev_index: int) -> _nvml_device_status:
448450

449451
dev_uuid = get_device_info(dev_index).uuid
450452

451-
uuid_str = ("GPU-" + str(dev_uuid)).encode()
452-
453453
if not _is_wsl:
454454
_nvml_check_error(
455-
_nvml_lib.nvmlDeviceGetHandleByUUID(uuid_str, byref(c_device))
455+
_nvml_lib.nvmlDeviceGetHandleByUUID(dev_uuid, byref(c_device))
456456
)
457457

458458
_nvml_check_error(

mars/services/task/execution/ray/executor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -178,7 +178,7 @@ def execute_subtask(
178178
"""
179179
init_metrics("ray")
180180
started_subtask_number.record(1)
181-
ray_task_id = ray.get_runtime_context().task_id
181+
ray_task_id = ray.get_runtime_context().get_task_id()
182182
subtask_chunk_graph = deserialize(*subtask_chunk_graph)
183183
logger.info("Start subtask: %s, ray task id: %s.", subtask_id, ray_task_id)
184184
# Optimize chunk graph.

0 commit comments

Comments
 (0)