Skip to content

Commit d257a01

Browse files
authored
Merge pull request #12596 from bosilca/topic/cuda_ipc_restore
Topic/cuda ipc restore
2 parents 71c2882 + 17d3873 commit d257a01

File tree

5 files changed

+182
-46
lines changed

5 files changed

+182
-46
lines changed

opal/mca/accelerator/cuda/accelerator_cuda.c

Lines changed: 149 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* All rights reserved.
77
* Copyright (c) Amazon.com, Inc. or its affiliates.
88
* All Rights reserved.
9+
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
910
* $COPYRIGHT$
1011
*
1112
* Additional copyrights may follow
@@ -106,6 +107,14 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
106107
accelerator_cuda_get_buffer_id
107108
};
108109

110+
static inline opal_accelerator_cuda_delayed_init_check(void)
111+
{
112+
if (OPAL_UNLIKELY(true != mca_accelerator_cuda_init_complete)) {
113+
return opal_accelerator_cuda_delayed_init();
114+
}
115+
return OPAL_SUCCESS;
116+
}
117+
109118
static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags)
110119
{
111120
CUresult result;
@@ -236,15 +245,15 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
236245
}
237246
}
238247
/* First access on a device pointer finalizes CUDA support initialization. */
239-
opal_accelerator_cuda_delayed_init();
248+
(void)opal_accelerator_cuda_delayed_init_check();
240249
return 1;
241250
}
242251

243252
static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream)
244253
{
245254
CUresult result;
246-
int delayed_init = opal_accelerator_cuda_delayed_init();
247-
if (OPAL_UNLIKELY(0 != delayed_init)) {
255+
int delayed_init = opal_accelerator_cuda_delayed_init_check();
256+
if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) {
248257
return delayed_init;
249258
}
250259
*stream = (opal_accelerator_stream_t*)OBJ_NEW(opal_accelerator_cuda_stream_t);
@@ -293,8 +302,8 @@ OBJ_CLASS_INSTANCE(
293302
static int accelerator_cuda_create_event(int dev_id, opal_accelerator_event_t **event, bool enable_ipc)
294303
{
295304
CUresult result;
296-
int delayed_init = opal_accelerator_cuda_delayed_init();
297-
if (OPAL_UNLIKELY(0 != delayed_init)) {
305+
int delayed_init = opal_accelerator_cuda_delayed_init_check();
306+
if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) {
298307
return delayed_init;
299308
}
300309

@@ -396,8 +405,8 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void *
396405
{
397406
CUresult result;
398407

399-
int delayed_init = opal_accelerator_cuda_delayed_init();
400-
if (OPAL_UNLIKELY(0 != delayed_init)) {
408+
int delayed_init = opal_accelerator_cuda_delayed_init_check();
409+
if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) {
401410
return delayed_init;
402411
}
403412

@@ -423,8 +432,8 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
423432
{
424433
CUresult result;
425434

426-
int delayed_init = opal_accelerator_cuda_delayed_init();
427-
if (OPAL_UNLIKELY(0 != delayed_init)) {
435+
int delayed_init = opal_accelerator_cuda_delayed_init_check();
436+
if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) {
428437
return delayed_init;
429438
}
430439

@@ -464,8 +473,8 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest,
464473
CUdeviceptr tmp;
465474
CUresult result;
466475

467-
int delayed_init = opal_accelerator_cuda_delayed_init();
468-
if (OPAL_UNLIKELY(0 != delayed_init)) {
476+
int delayed_init = opal_accelerator_cuda_delayed_init_check();
477+
if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) {
469478
return delayed_init;
470479
}
471480

@@ -503,8 +512,8 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size)
503512
{
504513
CUresult result;
505514

506-
int delayed_init = opal_accelerator_cuda_delayed_init();
507-
if (OPAL_UNLIKELY(0 != delayed_init)) {
515+
int delayed_init = opal_accelerator_cuda_delayed_init_check();
516+
if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) {
508517
return delayed_init;
509518
}
510519

@@ -542,8 +551,8 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
542551
{
543552
CUresult result;
544553

545-
int delayed_init = opal_accelerator_cuda_delayed_init();
546-
if (OPAL_UNLIKELY(0 != delayed_init)) {
554+
int delayed_init = opal_accelerator_cuda_delayed_init_check();
555+
if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) {
547556
return delayed_init;
548557
}
549558

@@ -566,25 +575,80 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
566575

567576
/**
 * Report whether this component offers IPC support.
 *
 * @return always true.
 */
static bool accelerator_cuda_is_ipc_enabled(void)
{
    return true;
}
580+
581+
/**
 * Destructor for opal_accelerator_cuda_ipc_handle_t.
 *
 * If the handle records a device pointer in base.dev_ptr, that pointer is
 * passed to cuIpcCloseMemHandle() and the field is reset to NULL. The result
 * of cuIpcCloseMemHandle() is not inspected.
 *
 * @param handle  handle being destroyed; NULL is tolerated.
 */
static void mca_accelerator_cuda_ipc_handle_destruct(opal_accelerator_cuda_ipc_handle_t *handle)
{
    /* Nothing to close without a handle or without a recorded pointer. */
    if (NULL == handle || NULL == handle->base.dev_ptr) {
        return;
    }

    cuIpcCloseMemHandle((CUdeviceptr) handle->base.dev_ptr);
    handle->base.dev_ptr = NULL;
}
571588

589+
/* Class instance for the CUDA IPC memory handle: parent class is
 * opal_accelerator_ipc_handle_t, no constructor, and
 * mca_accelerator_cuda_ipc_handle_destruct as destructor. */
OBJ_CLASS_INSTANCE(opal_accelerator_cuda_ipc_handle_t,
                   opal_accelerator_ipc_handle_t,
                   NULL,
                   mca_accelerator_cuda_ipc_handle_destruct);
594+
572595
static int accelerator_cuda_get_ipc_handle(int dev_id, void *dev_ptr,
573596
opal_accelerator_ipc_handle_t *handle)
574597
{
575-
return OPAL_ERR_NOT_IMPLEMENTED;
598+
if (NULL == dev_ptr || NULL == handle) {
599+
return OPAL_ERR_BAD_PARAM;
600+
}
601+
602+
CUipcMemHandle cuda_ipc_handle;
603+
opal_accelerator_cuda_ipc_handle_t *cuda_handle = (opal_accelerator_cuda_ipc_handle_t *) handle;
604+
605+
OBJ_CONSTRUCT(cuda_handle, opal_accelerator_cuda_ipc_handle_t);
606+
cuda_handle->base.dev_ptr = NULL;
607+
608+
CUresult err = cuIpcGetMemHandle(&cuda_ipc_handle,
609+
(CUdeviceptr)dev_ptr);
610+
if (OPAL_UNLIKELY(CUDA_SUCCESS != err)) {
611+
opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
612+
"Error in cuIpcGetMemHandle dev_ptr %p", dev_ptr);
613+
OBJ_DESTRUCT(cuda_handle);
614+
return OPAL_ERROR;
615+
}
616+
memcpy(cuda_handle->base.handle, &cuda_ipc_handle, IPC_MAX_HANDLE_SIZE);
617+
618+
return OPAL_SUCCESS;
576619
}
577620

578621
static int accelerator_cuda_import_ipc_handle(int dev_id, uint8_t ipc_handle[IPC_MAX_HANDLE_SIZE],
579622
opal_accelerator_ipc_handle_t *handle)
580623
{
581-
return OPAL_ERR_NOT_IMPLEMENTED;
624+
opal_accelerator_cuda_ipc_handle_t *cuda_handle = (opal_accelerator_cuda_ipc_handle_t *) handle;
625+
OBJ_CONSTRUCT(cuda_handle, opal_accelerator_cuda_ipc_handle_t);
626+
memcpy(cuda_handle->base.handle, ipc_handle, IPC_MAX_HANDLE_SIZE);
627+
628+
return OPAL_SUCCESS;
582629
}
583630

584631
/**
 * Open (map) an imported IPC memory handle and return the device pointer.
 *
 * The raw bytes in handle->handle are copied into a CUipcMemHandle and passed
 * to cuIpcOpenMemHandle() with CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS. The driver
 * writes into a local CUdeviceptr; handle->dev_ptr is only updated on
 * success, so a failed open leaves the handle untouched.
 *
 * @param dev_id   not used by this implementation.
 * @param handle   previously imported handle; must not be NULL.
 * @param dev_ptr  receives the mapped device pointer; must not be NULL.
 *
 * @return OPAL_SUCCESS on success, OPAL_ERR_BAD_PARAM on NULL arguments,
 *         OPAL_ERR_WOULD_BLOCK if the driver reports
 *         CUDA_ERROR_ALREADY_MAPPED, OPAL_ERROR on any other driver failure.
 */
static int accelerator_cuda_open_ipc_handle(int dev_id, opal_accelerator_ipc_handle_t *handle,
                                            void **dev_ptr)
{
    if (NULL == dev_ptr || NULL == handle) {
        return OPAL_ERR_BAD_PARAM;
    }

    /* Copy instead of casting the byte array to CUipcMemHandle, and let the
     * driver write into a real CUdeviceptr instead of aliasing the void *
     * field; both avoid type-punning through incompatible pointers. */
    CUipcMemHandle mem_handle;
    CUdeviceptr mapped = 0;
    memcpy(&mem_handle, handle->handle, sizeof(mem_handle));

    CUresult err = cuIpcOpenMemHandle(&mapped, mem_handle,
                                      CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
    if (CUDA_ERROR_ALREADY_MAPPED == err) {
        return OPAL_ERR_WOULD_BLOCK;
    }
    else if (CUDA_SUCCESS != err) {
        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                            "error in cuIpcOpenMemHandle");
        return OPAL_ERROR;
    }

    /* Record the mapping so the handle destructor can close it later. */
    handle->dev_ptr = (void *) (uintptr_t) mapped;
    *dev_ptr = handle->dev_ptr;

    return OPAL_SUCCESS;
}
589653

590654
static int accelerator_cuda_compare_ipc_handles(uint8_t handle_1[IPC_MAX_HANDLE_SIZE],
@@ -593,29 +657,84 @@ static int accelerator_cuda_compare_ipc_handles(uint8_t handle_1[IPC_MAX_HANDLE_
593657
return memcmp(handle_1, handle_2, IPC_MAX_HANDLE_SIZE);
594658
}
595659

660+
static void mca_accelerator_cuda_ipc_event_handle_destruct(opal_accelerator_cuda_ipc_handle_t *handle)
661+
{
662+
// Just a place holder, there is no cuIpcCloseEventHandle.
663+
}
664+
665+
/* Class instance for the CUDA IPC event handle: parent class is
 * opal_accelerator_ipc_event_handle_t, no constructor, and
 * mca_accelerator_cuda_ipc_event_handle_destruct as destructor. */
OBJ_CLASS_INSTANCE(opal_accelerator_cuda_ipc_event_handle_t,
                   opal_accelerator_ipc_event_handle_t,
                   NULL,
                   mca_accelerator_cuda_ipc_event_handle_destruct);
670+
596671
static int accelerator_cuda_get_ipc_event_handle(opal_accelerator_event_t *event,
597672
opal_accelerator_ipc_event_handle_t *handle)
598673
{
599-
return OPAL_ERR_NOT_IMPLEMENTED;
674+
if (NULL == event || NULL == handle) {
675+
return OPAL_ERR_BAD_PARAM;
676+
}
677+
678+
CUipcEventHandle cuda_ipc_handle;
679+
opal_accelerator_cuda_ipc_event_handle_t *cuda_handle = (opal_accelerator_cuda_ipc_event_handle_t *) handle;
680+
OBJ_CONSTRUCT(cuda_handle, opal_accelerator_cuda_ipc_event_handle_t);
681+
682+
memset(cuda_ipc_handle.reserved, 0, CU_IPC_HANDLE_SIZE);
683+
CUresult err = cuIpcGetEventHandle(&cuda_ipc_handle,
684+
*((CUevent *)event->event));
685+
if (OPAL_UNLIKELY(CUDA_SUCCESS != err)) {
686+
opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
687+
"error in cuIpcGetEventHandle");
688+
OBJ_DESTRUCT(cuda_handle);
689+
return OPAL_ERROR;
690+
}
691+
memcpy(cuda_handle->base.handle, &cuda_ipc_handle, IPC_MAX_HANDLE_SIZE);
692+
693+
return OPAL_SUCCESS;
600694
}
601695

602696
static int accelerator_cuda_import_ipc_event_handle(uint8_t ipc_handle[IPC_MAX_HANDLE_SIZE],
603697
opal_accelerator_ipc_event_handle_t *handle)
604698
{
605-
return OPAL_ERR_NOT_IMPLEMENTED;
699+
opal_accelerator_cuda_ipc_handle_t *cuda_handle = (opal_accelerator_cuda_ipc_handle_t *) handle;
700+
701+
OBJ_CONSTRUCT(cuda_handle, opal_accelerator_cuda_ipc_handle_t);
702+
memcpy(cuda_handle->base.handle, ipc_handle, IPC_MAX_HANDLE_SIZE);
703+
704+
return OPAL_SUCCESS;
606705
}
607706

608707
static int accelerator_cuda_open_ipc_event_handle(opal_accelerator_ipc_event_handle_t *handle,
609708
opal_accelerator_event_t *event)
610709
{
611-
return OPAL_ERR_NOT_IMPLEMENTED;
710+
if (NULL == event || NULL == handle) {
711+
return OPAL_ERR_BAD_PARAM;
712+
}
713+
714+
opal_accelerator_cuda_ipc_event_handle_t *cuda_handle = (opal_accelerator_cuda_ipc_event_handle_t *) handle;
715+
opal_accelerator_cuda_event_t *cuda_event = (opal_accelerator_cuda_event_t *) event;
716+
OBJ_CONSTRUCT(cuda_event, opal_accelerator_cuda_event_t);
717+
cuda_event->base.event = malloc(sizeof(CUevent));
718+
if (NULL == cuda_event->base.event) {
719+
return OPAL_ERR_OUT_OF_RESOURCE;
720+
}
721+
722+
CUresult err = cuIpcOpenEventHandle( (CUevent *)cuda_event->base.event,
723+
*((CUipcEventHandle*)cuda_handle->base.handle));
724+
if (OPAL_UNLIKELY(CUDA_SUCCESS != err)) {
725+
opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
726+
"error in cuIpcOpenEventHandle");
727+
return OPAL_ERROR;
728+
}
729+
730+
return OPAL_SUCCESS;
612731
}
613732

614733
static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size)
615734
{
616735
CUresult result;
617-
int delayed_init = opal_accelerator_cuda_delayed_init();
618-
if (OPAL_UNLIKELY(0 != delayed_init)) {
736+
int delayed_init = opal_accelerator_cuda_delayed_init_check();
737+
if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) {
619738
return delayed_init;
620739
}
621740

@@ -652,8 +771,8 @@ static int accelerator_cuda_get_device(int *dev_id)
652771
CUdevice cuDev;
653772
CUresult result;
654773

655-
int delayed_init = opal_accelerator_cuda_delayed_init();
656-
if (OPAL_UNLIKELY(0 != delayed_init)) {
774+
int delayed_init = opal_accelerator_cuda_delayed_init_check();
775+
if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) {
657776
return delayed_init;
658777
}
659778

@@ -714,8 +833,8 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de
714833
{
715834
CUresult result;
716835

717-
int delayed_init = opal_accelerator_cuda_delayed_init();
718-
if (OPAL_UNLIKELY(0 != delayed_init)) {
836+
int delayed_init = opal_accelerator_cuda_delayed_init_check();
837+
if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) {
719838
return delayed_init;
720839
}
721840

@@ -744,8 +863,8 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc
744863
CUresult result;
745864
int enable = 1;
746865

747-
int delayed_init = opal_accelerator_cuda_delayed_init();
748-
if (OPAL_UNLIKELY(0 != delayed_init)) {
866+
int delayed_init = opal_accelerator_cuda_delayed_init_check();
867+
if (OPAL_UNLIKELY(OPAL_SUCCESS != delayed_init)) {
749868
return delayed_init;
750869
}
751870

opal/mca/accelerator/cuda/accelerator_cuda.h

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
* Copyright (c) 2014 Intel, Inc. All rights reserved.
33
* Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
44
* All Rights reserved.
5+
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -37,15 +38,28 @@ struct opal_accelerator_cuda_event_t {
3738
typedef struct opal_accelerator_cuda_event_t opal_accelerator_cuda_event_t;
3839
OBJ_CLASS_DECLARATION(opal_accelerator_cuda_event_t);
3940

41+
/* CUDA component's IPC memory handle type. It wraps the generic
 * opal_accelerator_ipc_handle_t as its only member, `base`, and is declared
 * as an OPAL class below. */
struct opal_accelerator_cuda_ipc_handle_t {
42+
opal_accelerator_ipc_handle_t base;
43+
};
44+
typedef struct opal_accelerator_cuda_ipc_handle_t opal_accelerator_cuda_ipc_handle_t;
45+
OBJ_CLASS_DECLARATION(opal_accelerator_cuda_ipc_handle_t);
46+
47+
/* CUDA component's IPC event handle type. It wraps the generic
 * opal_accelerator_ipc_event_handle_t as its only member, `base`, and is
 * declared as an OPAL class below. */
struct opal_accelerator_cuda_ipc_event_handle_t {
48+
opal_accelerator_ipc_event_handle_t base;
49+
};
50+
typedef struct opal_accelerator_cuda_ipc_event_handle_t opal_accelerator_cuda_ipc_event_handle_t;
51+
OBJ_CLASS_DECLARATION(opal_accelerator_cuda_ipc_event_handle_t);
52+
4053
/* Declare extern variables, defined in accelerator_cuda_component.c */
41-
OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_memcpy_stream;
42-
OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_cuda_stream_lock;
54+
extern CUstream opal_accelerator_cuda_memcpy_stream;
55+
extern opal_mutex_t opal_accelerator_cuda_stream_lock;
56+
extern bool mca_accelerator_cuda_init_complete;
4357

4458
OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_component;
4559

46-
OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module;
60+
extern opal_accelerator_base_module_t opal_accelerator_cuda_module;
4761

48-
OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void);
62+
extern int opal_accelerator_cuda_delayed_init(void);
4963

5064
END_C_DECLS
5165

0 commit comments

Comments
 (0)