6
6
* All rights reserved.
7
7
* Copyright (c) Amazon.com, Inc. or its affiliates.
8
8
* All Rights reserved.
9
+ * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
9
10
* $COPYRIGHT$
10
11
*
11
12
* Additional copyrights may follow
@@ -106,6 +107,14 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
106
107
accelerator_cuda_get_buffer_id
107
108
};
108
109
110
+ static inline opal_accelerator_cuda_delayed_init_check (void )
111
+ {
112
+ if (OPAL_UNLIKELY (true != mca_accelerator_cuda_init_complete )) {
113
+ return opal_accelerator_cuda_delayed_init ();
114
+ }
115
+ return OPAL_SUCCESS ;
116
+ }
117
+
109
118
static int accelerator_cuda_check_addr (const void * addr , int * dev_id , uint64_t * flags )
110
119
{
111
120
CUresult result ;
@@ -236,15 +245,15 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
236
245
}
237
246
}
238
247
/* First access on a device pointer finalizes CUDA support initialization. */
239
- opal_accelerator_cuda_delayed_init ();
248
+ ( void ) opal_accelerator_cuda_delayed_init_check ();
240
249
return 1 ;
241
250
}
242
251
243
252
static int accelerator_cuda_create_stream (int dev_id , opal_accelerator_stream_t * * stream )
244
253
{
245
254
CUresult result ;
246
- int delayed_init = opal_accelerator_cuda_delayed_init ();
247
- if (OPAL_UNLIKELY (0 != delayed_init )) {
255
+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
256
+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
248
257
return delayed_init ;
249
258
}
250
259
* stream = (opal_accelerator_stream_t * )OBJ_NEW (opal_accelerator_cuda_stream_t );
@@ -293,8 +302,8 @@ OBJ_CLASS_INSTANCE(
293
302
static int accelerator_cuda_create_event (int dev_id , opal_accelerator_event_t * * event , bool enable_ipc )
294
303
{
295
304
CUresult result ;
296
- int delayed_init = opal_accelerator_cuda_delayed_init ();
297
- if (OPAL_UNLIKELY (0 != delayed_init )) {
305
+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
306
+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
298
307
return delayed_init ;
299
308
}
300
309
@@ -396,8 +405,8 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void *
396
405
{
397
406
CUresult result ;
398
407
399
- int delayed_init = opal_accelerator_cuda_delayed_init ();
400
- if (OPAL_UNLIKELY (0 != delayed_init )) {
408
+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
409
+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
401
410
return delayed_init ;
402
411
}
403
412
@@ -423,8 +432,8 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
423
432
{
424
433
CUresult result ;
425
434
426
- int delayed_init = opal_accelerator_cuda_delayed_init ();
427
- if (OPAL_UNLIKELY (0 != delayed_init )) {
435
+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
436
+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
428
437
return delayed_init ;
429
438
}
430
439
@@ -464,8 +473,8 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest,
464
473
CUdeviceptr tmp ;
465
474
CUresult result ;
466
475
467
- int delayed_init = opal_accelerator_cuda_delayed_init ();
468
- if (OPAL_UNLIKELY (0 != delayed_init )) {
476
+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
477
+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
469
478
return delayed_init ;
470
479
}
471
480
@@ -503,8 +512,8 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size)
503
512
{
504
513
CUresult result ;
505
514
506
- int delayed_init = opal_accelerator_cuda_delayed_init ();
507
- if (OPAL_UNLIKELY (0 != delayed_init )) {
515
+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
516
+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
508
517
return delayed_init ;
509
518
}
510
519
@@ -542,8 +551,8 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
542
551
{
543
552
CUresult result ;
544
553
545
- int delayed_init = opal_accelerator_cuda_delayed_init ();
546
- if (OPAL_UNLIKELY (0 != delayed_init )) {
554
+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
555
+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
547
556
return delayed_init ;
548
557
}
549
558
@@ -566,25 +575,80 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
566
575
567
576
/* Report whether this component offers IPC support; the answer is always true. */
static bool accelerator_cuda_is_ipc_enabled(void)
{
    return true;
}
580
+
581
+ static void mca_accelerator_cuda_ipc_handle_destruct (opal_accelerator_cuda_ipc_handle_t * handle )
582
+ {
583
+ if (NULL != handle && NULL != handle -> base .dev_ptr ) {
584
+ cuIpcCloseMemHandle ((CUdeviceptr ) handle -> base .dev_ptr );
585
+ handle -> base .dev_ptr = NULL ;
586
+ }
570
587
}
571
588
589
+ OBJ_CLASS_INSTANCE (
590
+ opal_accelerator_cuda_ipc_handle_t ,
591
+ opal_accelerator_ipc_handle_t ,
592
+ NULL ,
593
+ mca_accelerator_cuda_ipc_handle_destruct );
594
+
572
595
static int accelerator_cuda_get_ipc_handle (int dev_id , void * dev_ptr ,
573
596
opal_accelerator_ipc_handle_t * handle )
574
597
{
575
- return OPAL_ERR_NOT_IMPLEMENTED ;
598
+ if (NULL == dev_ptr || NULL == handle ) {
599
+ return OPAL_ERR_BAD_PARAM ;
600
+ }
601
+
602
+ CUipcMemHandle cuda_ipc_handle ;
603
+ opal_accelerator_cuda_ipc_handle_t * cuda_handle = (opal_accelerator_cuda_ipc_handle_t * ) handle ;
604
+
605
+ OBJ_CONSTRUCT (cuda_handle , opal_accelerator_cuda_ipc_handle_t );
606
+ cuda_handle -> base .dev_ptr = NULL ;
607
+
608
+ CUresult err = cuIpcGetMemHandle (& cuda_ipc_handle ,
609
+ (CUdeviceptr )dev_ptr );
610
+ if (OPAL_UNLIKELY (CUDA_SUCCESS != err )) {
611
+ opal_output_verbose (10 , opal_accelerator_base_framework .framework_output ,
612
+ "Error in cuIpcGetMemHandle dev_ptr %p" , dev_ptr );
613
+ OBJ_DESTRUCT (cuda_handle );
614
+ return OPAL_ERROR ;
615
+ }
616
+ memcpy (cuda_handle -> base .handle , & cuda_ipc_handle , IPC_MAX_HANDLE_SIZE );
617
+
618
+ return OPAL_SUCCESS ;
576
619
}
577
620
578
621
static int accelerator_cuda_import_ipc_handle (int dev_id , uint8_t ipc_handle [IPC_MAX_HANDLE_SIZE ],
579
622
opal_accelerator_ipc_handle_t * handle )
580
623
{
581
- return OPAL_ERR_NOT_IMPLEMENTED ;
624
+ opal_accelerator_cuda_ipc_handle_t * cuda_handle = (opal_accelerator_cuda_ipc_handle_t * ) handle ;
625
+ OBJ_CONSTRUCT (cuda_handle , opal_accelerator_cuda_ipc_handle_t );
626
+ memcpy (cuda_handle -> base .handle , ipc_handle , IPC_MAX_HANDLE_SIZE );
627
+
628
+ return OPAL_SUCCESS ;
582
629
}
583
630
584
631
/*
 * Open an IPC memory handle and hand back the resulting device pointer.
 *
 * cuIpcOpenMemHandle() is called with CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS and
 * writes its result into handle->dev_ptr; on success that value is also
 * stored in *dev_ptr.
 *
 * dev_id is not used by this function.
 *
 * Returns OPAL_ERR_BAD_PARAM when dev_ptr or handle is NULL,
 * OPAL_ERR_WOULD_BLOCK when CUDA reports CUDA_ERROR_ALREADY_MAPPED,
 * OPAL_ERROR on any other CUDA failure, and OPAL_SUCCESS otherwise.
 */
static int accelerator_cuda_open_ipc_handle(int dev_id, opal_accelerator_ipc_handle_t *handle,
                                            void **dev_ptr)
{
    CUresult rc;

    if (NULL == handle || NULL == dev_ptr) {
        return OPAL_ERR_BAD_PARAM;
    }

    rc = cuIpcOpenMemHandle((CUdeviceptr *) &handle->dev_ptr,
                            *(CUipcMemHandle *) handle->handle,
                            CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
    switch (rc) {
    case CUDA_SUCCESS:
        *dev_ptr = handle->dev_ptr;
        return OPAL_SUCCESS;

    case CUDA_ERROR_ALREADY_MAPPED:
        return OPAL_ERR_WOULD_BLOCK;

    default:
        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                            "error in cuIpcOpenMemHandle");
        return OPAL_ERROR;
    }
}
589
653
590
654
static int accelerator_cuda_compare_ipc_handles (uint8_t handle_1 [IPC_MAX_HANDLE_SIZE ],
@@ -593,29 +657,84 @@ static int accelerator_cuda_compare_ipc_handles(uint8_t handle_1[IPC_MAX_HANDLE_
593
657
return memcmp (handle_1 , handle_2 , IPC_MAX_HANDLE_SIZE );
594
658
}
595
659
660
+ static void mca_accelerator_cuda_ipc_event_handle_destruct (opal_accelerator_cuda_ipc_handle_t * handle )
661
+ {
662
+ // Just a place holder, there is no cuIpcCloseEventHandle.
663
+ }
664
+
665
+ OBJ_CLASS_INSTANCE (
666
+ opal_accelerator_cuda_ipc_event_handle_t ,
667
+ opal_accelerator_ipc_event_handle_t ,
668
+ NULL ,
669
+ mca_accelerator_cuda_ipc_event_handle_destruct );
670
+
596
671
static int accelerator_cuda_get_ipc_event_handle (opal_accelerator_event_t * event ,
597
672
opal_accelerator_ipc_event_handle_t * handle )
598
673
{
599
- return OPAL_ERR_NOT_IMPLEMENTED ;
674
+ if (NULL == event || NULL == handle ) {
675
+ return OPAL_ERR_BAD_PARAM ;
676
+ }
677
+
678
+ CUipcEventHandle cuda_ipc_handle ;
679
+ opal_accelerator_cuda_ipc_event_handle_t * cuda_handle = (opal_accelerator_cuda_ipc_event_handle_t * ) handle ;
680
+ OBJ_CONSTRUCT (cuda_handle , opal_accelerator_cuda_ipc_event_handle_t );
681
+
682
+ memset (cuda_ipc_handle .reserved , 0 , CU_IPC_HANDLE_SIZE );
683
+ CUresult err = cuIpcGetEventHandle (& cuda_ipc_handle ,
684
+ * ((CUevent * )event -> event ));
685
+ if (OPAL_UNLIKELY (CUDA_SUCCESS != err )) {
686
+ opal_output_verbose (10 , opal_accelerator_base_framework .framework_output ,
687
+ "error in cuIpcGetEventHandle" );
688
+ OBJ_DESTRUCT (cuda_handle );
689
+ return OPAL_ERROR ;
690
+ }
691
+ memcpy (cuda_handle -> base .handle , & cuda_ipc_handle , IPC_MAX_HANDLE_SIZE );
692
+
693
+ return OPAL_SUCCESS ;
600
694
}
601
695
602
696
static int accelerator_cuda_import_ipc_event_handle (uint8_t ipc_handle [IPC_MAX_HANDLE_SIZE ],
603
697
opal_accelerator_ipc_event_handle_t * handle )
604
698
{
605
- return OPAL_ERR_NOT_IMPLEMENTED ;
699
+ opal_accelerator_cuda_ipc_handle_t * cuda_handle = (opal_accelerator_cuda_ipc_handle_t * ) handle ;
700
+
701
+ OBJ_CONSTRUCT (cuda_handle , opal_accelerator_cuda_ipc_handle_t );
702
+ memcpy (cuda_handle -> base .handle , ipc_handle , IPC_MAX_HANDLE_SIZE );
703
+
704
+ return OPAL_SUCCESS ;
606
705
}
607
706
608
707
static int accelerator_cuda_open_ipc_event_handle (opal_accelerator_ipc_event_handle_t * handle ,
609
708
opal_accelerator_event_t * event )
610
709
{
611
- return OPAL_ERR_NOT_IMPLEMENTED ;
710
+ if (NULL == event || NULL == handle ) {
711
+ return OPAL_ERR_BAD_PARAM ;
712
+ }
713
+
714
+ opal_accelerator_cuda_ipc_event_handle_t * cuda_handle = (opal_accelerator_cuda_ipc_event_handle_t * ) handle ;
715
+ opal_accelerator_cuda_event_t * cuda_event = (opal_accelerator_cuda_event_t * ) event ;
716
+ OBJ_CONSTRUCT (cuda_event , opal_accelerator_cuda_event_t );
717
+ cuda_event -> base .event = malloc (sizeof (CUevent ));
718
+ if (NULL == cuda_event -> base .event ) {
719
+ return OPAL_ERR_OUT_OF_RESOURCE ;
720
+ }
721
+
722
+ CUresult err = cuIpcOpenEventHandle ( (CUevent * )cuda_event -> base .event ,
723
+ * ((CUipcEventHandle * )cuda_handle -> base .handle ));
724
+ if (OPAL_UNLIKELY (CUDA_SUCCESS != err )) {
725
+ opal_output_verbose (10 , opal_accelerator_base_framework .framework_output ,
726
+ "error in cuIpcOpenEventHandle" );
727
+ return OPAL_ERROR ;
728
+ }
729
+
730
+ return OPAL_SUCCESS ;
612
731
}
613
732
614
733
static int accelerator_cuda_host_register (int dev_id , void * ptr , size_t size )
615
734
{
616
735
CUresult result ;
617
- int delayed_init = opal_accelerator_cuda_delayed_init ();
618
- if (OPAL_UNLIKELY (0 != delayed_init )) {
736
+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
737
+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
619
738
return delayed_init ;
620
739
}
621
740
@@ -652,8 +771,8 @@ static int accelerator_cuda_get_device(int *dev_id)
652
771
CUdevice cuDev ;
653
772
CUresult result ;
654
773
655
- int delayed_init = opal_accelerator_cuda_delayed_init ();
656
- if (OPAL_UNLIKELY (0 != delayed_init )) {
774
+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
775
+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
657
776
return delayed_init ;
658
777
}
659
778
@@ -714,8 +833,8 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de
714
833
{
715
834
CUresult result ;
716
835
717
- int delayed_init = opal_accelerator_cuda_delayed_init ();
718
- if (OPAL_UNLIKELY (0 != delayed_init )) {
836
+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
837
+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
719
838
return delayed_init ;
720
839
}
721
840
@@ -744,8 +863,8 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc
744
863
CUresult result ;
745
864
int enable = 1 ;
746
865
747
- int delayed_init = opal_accelerator_cuda_delayed_init ();
748
- if (OPAL_UNLIKELY (0 != delayed_init )) {
866
+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
867
+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
749
868
return delayed_init ;
750
869
}
751
870
0 commit comments