Skip to content

Commit 7bcbc7a

Browse files
authored
Merge pull request #10069 from wckzhang/acceleratorframework
opal/accelerator: Initial accelerator framework implementation
2 parents 6755fe7 + c7a5af3 commit 7bcbc7a

File tree

112 files changed

+4957
-1609
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

112 files changed

+4957
-1609
lines changed

config/opal_config_files.m4

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ AC_DEFUN([OPAL_CONFIG_FILES],[
1818
AC_CONFIG_FILES([
1919
opal/Makefile
2020
opal/cuda/Makefile
21-
opal/rocm/Makefile
2221
opal/etc/Makefile
2322
opal/include/Makefile
2423
opal/datatype/Makefile

configure.ac

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -988,19 +988,6 @@ AC_CACHE_SAVE
988988
opal_show_title "System-specific tests"
989989

990990
OPAL_CHECK_CUDA
991-
##################################
992-
# ROCm support
993-
##################################
994-
OPAL_CHECK_ROCM([opal_rocm],
995-
[opal_rocm_happy="yes"],
996-
[opal_rocm_happy="no"])
997-
OPAL_SUMMARY_ADD([Miscellaneous], [ROCm support], [], [$opal_rocm_happy])
998-
999-
AS_IF([test "$OPAL_CUDA_SUPPORT" = "1" && test "$OPAL_ROCM_SUPPORT" = "1"],
1000-
[AC_MSG_WARN([Cannot support both CUDA and ROCm.])
1001-
AC_MSG_WARN([You must reconfigure Open MPI choosing either CUDA or ROCm .])
1002-
AC_MSG_ERROR([Cannot continue.])])
1003-
1004991
##################################
1005992
OPAL_CHECK_OS_FLAVORS
1006993

ompi/mca/coll/cuda/coll_cuda.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
#include "ompi/mca/coll/coll.h"
2525
#include "ompi/mca/coll/base/base.h"
2626
#include "ompi/communicator/communicator.h"
27+
#include "opal/mca/accelerator/accelerator.h"
28+
#include "opal/mca/accelerator/base/base.h"
2729

2830
BEGIN_C_DECLS
2931

@@ -71,6 +73,41 @@ mca_coll_cuda_reduce_scatter_block(const void *sbuf, void *rbuf, int rcount,
7173
struct ompi_communicator_t *comm,
7274
mca_coll_base_module_t *module);
7375

76+
77+
/* Checks the type of pointer
78+
*
79+
* @param addr One pointer to check
80+
* @retval <0 An error has occurred.
81+
* @retval 0 The buffer does not belong to a managed buffer
82+
* in device memory.
83+
* @retval >0 The buffer belongs to a managed buffer in
84+
* device memory.
85+
*/
86+
static inline int mca_coll_cuda_check_buf(void *addr)
87+
{
88+
uint64_t flags;
89+
int dev_id;
90+
if (OPAL_LIKELY(NULL != addr)) {
91+
return opal_accelerator.check_addr(addr, &dev_id, &flags);
92+
} else {
93+
return OPAL_ERROR;
94+
}
95+
}
96+
97+
static inline void *mca_coll_cuda_memcpy(void *dest, const void *src, size_t size)
98+
{
99+
int res;
100+
res = opal_accelerator.memcpy(MCA_ACCELERATOR_NO_DEVICE_ID, MCA_ACCELERATOR_NO_DEVICE_ID,
101+
dest, src, size, MCA_ACCELERATOR_TRANSFER_UNSPEC);
102+
if (res != 0) {
103+
opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src,
104+
(int) size);
105+
abort();
106+
} else {
107+
return dest;
108+
}
109+
}
110+
74111
/* Types */
75112
/* Module */
76113

ompi/mca/coll/cuda/coll_cuda_allreduce.c

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
#include "ompi/op/op.h"
2020
#include "opal/datatype/opal_convertor.h"
21-
#include "opal/cuda/common_cuda.h"
2221

2322
/*
2423
* allreduce_intra
@@ -41,23 +40,29 @@ mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count,
4140
int rc;
4241

4342
bufsize = opal_datatype_span(&dtype->super, count, &gap);
44-
45-
if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
43+
rc = mca_coll_cuda_check_buf((void *)sbuf);
44+
if (rc < 0) {
45+
return rc;
46+
}
47+
if ((MPI_IN_PLACE != sbuf) && (rc > 0)) {
4648
sbuf1 = (char*)malloc(bufsize);
4749
if (NULL == sbuf1) {
4850
return OMPI_ERR_OUT_OF_RESOURCE;
4951
}
50-
opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize);
52+
mca_coll_cuda_memcpy(sbuf1, sbuf, bufsize);
5153
sbuf = sbuf1 - gap;
5254
}
53-
54-
if (opal_cuda_check_bufs(rbuf, NULL)) {
55+
rc = mca_coll_cuda_check_buf(rbuf);
56+
if (rc < 0) {
57+
return rc;
58+
}
59+
if (rc > 0) {
5560
rbuf1 = (char*)malloc(bufsize);
5661
if (NULL == rbuf1) {
5762
if (NULL != sbuf1) free(sbuf1);
5863
return OMPI_ERR_OUT_OF_RESOURCE;
5964
}
60-
opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize);
65+
mca_coll_cuda_memcpy(rbuf1, rbuf, bufsize);
6166
rbuf2 = rbuf; /* save away original buffer */
6267
rbuf = rbuf1 - gap;
6368
}
@@ -67,7 +72,7 @@ mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count,
6772
}
6873
if (NULL != rbuf1) {
6974
rbuf = rbuf2;
70-
opal_cuda_memcpy_sync(rbuf, rbuf1, bufsize);
75+
mca_coll_cuda_memcpy(rbuf, rbuf1, bufsize);
7176
free(rbuf1);
7277
}
7378
return rc;

ompi/mca/coll/cuda/coll_cuda_exscan.c

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
#include "ompi/op/op.h"
2020
#include "opal/datatype/opal_convertor.h"
21-
#include "opal/cuda/common_cuda.h"
2221

2322
int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count,
2423
struct ompi_datatype_t *dtype,
@@ -33,23 +32,30 @@ int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count,
3332
int rc;
3433

3534
bufsize = opal_datatype_span(&dtype->super, count, &gap);
35+
rc = mca_coll_cuda_check_buf((void *)sbuf);
36+
if (rc < 0) {
37+
return rc;
38+
}
3639

37-
if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
40+
if ((MPI_IN_PLACE != sbuf) && rc > 0) {
3841
sbuf1 = (char*)malloc(bufsize);
3942
if (NULL == sbuf1) {
4043
return OMPI_ERR_OUT_OF_RESOURCE;
4144
}
42-
opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize);
45+
mca_coll_cuda_memcpy(sbuf1, sbuf, bufsize);
4346
sbuf = sbuf1 - gap;
4447
}
45-
46-
if (opal_cuda_check_bufs(rbuf, NULL)) {
48+
rc = mca_coll_cuda_check_buf(rbuf);
49+
if (rc < 0) {
50+
return rc;
51+
}
52+
if (rc > 0) {
4753
rbuf1 = (char*)malloc(bufsize);
4854
if (NULL == rbuf1) {
4955
if (NULL != sbuf1) free(sbuf1);
5056
return OMPI_ERR_OUT_OF_RESOURCE;
5157
}
52-
opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize);
58+
mca_coll_cuda_memcpy(rbuf1, rbuf, bufsize);
5359
rbuf2 = rbuf; /* save away original buffer */
5460
rbuf = rbuf1 - gap;
5561
}
@@ -61,7 +67,7 @@ int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count,
6167
}
6268
if (NULL != rbuf1) {
6369
rbuf = rbuf2;
64-
opal_cuda_memcpy_sync(rbuf, rbuf1, bufsize);
70+
mca_coll_cuda_memcpy(rbuf, rbuf1, bufsize);
6571
free(rbuf1);
6672
}
6773
return rc;

ompi/mca/coll/cuda/coll_cuda_reduce.c

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
#include "ompi/op/op.h"
2020
#include "opal/datatype/opal_convertor.h"
21-
#include "opal/cuda/common_cuda.h"
2221

2322
/*
2423
* reduce_log_inter
@@ -44,24 +43,31 @@ mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
4443

4544
bufsize = opal_datatype_span(&dtype->super, count, &gap);
4645

47-
48-
if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
46+
rc = mca_coll_cuda_check_buf((void *)sbuf);
47+
if (rc < 0) {
48+
return rc;
49+
}
50+
if ((MPI_IN_PLACE != sbuf) && (rc > 0)) {
4951
sbuf1 = (char*)malloc(bufsize);
5052
if (NULL == sbuf1) {
5153
return OMPI_ERR_OUT_OF_RESOURCE;
5254
}
53-
opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize);
55+
mca_coll_cuda_memcpy(sbuf1, sbuf, bufsize);
5456
sbuf2 = sbuf; /* save away original buffer */
5557
sbuf = sbuf1 - gap;
5658
}
5759

58-
if ((rank == root) && (opal_cuda_check_bufs((char *)rbuf, NULL))) {
60+
rc = mca_coll_cuda_check_buf(rbuf);
61+
if (rc < 0) {
62+
return rc;
63+
}
64+
if ((rank == root) && (rc > 0)) {
5965
rbuf1 = (char*)malloc(bufsize);
6066
if (NULL == rbuf1) {
6167
if (NULL != sbuf1) free(sbuf1);
6268
return OMPI_ERR_OUT_OF_RESOURCE;
6369
}
64-
opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize);
70+
mca_coll_cuda_memcpy(rbuf1, rbuf, bufsize);
6571
rbuf2 = rbuf; /* save away original buffer */
6672
rbuf = rbuf1 - gap;
6773
}
@@ -74,7 +80,7 @@ mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
7480
}
7581
if (NULL != rbuf1) {
7682
rbuf = rbuf2;
77-
opal_cuda_memcpy_sync(rbuf, rbuf1, bufsize);
83+
mca_coll_cuda_memcpy(rbuf, rbuf1, bufsize);
7884
free(rbuf1);
7985
}
8086
return rc;

ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
#include "ompi/op/op.h"
2020
#include "opal/datatype/opal_convertor.h"
21-
#include "opal/cuda/common_cuda.h"
2221

2322
/*
2423
* reduce_scatter_block
@@ -47,23 +46,29 @@ mca_coll_cuda_reduce_scatter_block(const void *sbuf, void *rbuf, int rcount,
4746
rbufsize = opal_datatype_span(&dtype->super, rcount, &gap);
4847

4948
sbufsize = rbufsize * ompi_comm_size(comm);
50-
51-
if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
49+
rc = mca_coll_cuda_check_buf((void *)sbuf);
50+
if (rc < 0) {
51+
return rc;
52+
}
53+
if ((MPI_IN_PLACE != sbuf) && (rc > 0)) {
5254
sbuf1 = (char*)malloc(sbufsize);
5355
if (NULL == sbuf1) {
5456
return OMPI_ERR_OUT_OF_RESOURCE;
5557
}
56-
opal_cuda_memcpy_sync(sbuf1, sbuf, sbufsize);
58+
mca_coll_cuda_memcpy(sbuf1, sbuf, sbufsize);
5759
sbuf = sbuf1 - gap;
5860
}
59-
60-
if (opal_cuda_check_bufs(rbuf, NULL)) {
61+
rc = mca_coll_cuda_check_buf(rbuf);
62+
if (rc < 0) {
63+
return rc;
64+
}
65+
if (rc > 0) {
6166
rbuf1 = (char*)malloc(rbufsize);
6267
if (NULL == rbuf1) {
6368
if (NULL != sbuf1) free(sbuf1);
6469
return OMPI_ERR_OUT_OF_RESOURCE;
6570
}
66-
opal_cuda_memcpy_sync(rbuf1, rbuf, rbufsize);
71+
mca_coll_cuda_memcpy(rbuf1, rbuf, rbufsize);
6772
rbuf2 = rbuf; /* save away original buffer */
6873
rbuf = rbuf1 - gap;
6974
}
@@ -74,7 +79,7 @@ mca_coll_cuda_reduce_scatter_block(const void *sbuf, void *rbuf, int rcount,
7479
}
7580
if (NULL != rbuf1) {
7681
rbuf = rbuf2;
77-
opal_cuda_memcpy_sync(rbuf, rbuf1, rbufsize);
82+
mca_coll_cuda_memcpy(rbuf, rbuf1, rbufsize);
7883
free(rbuf1);
7984
}
8085
return rc;

ompi/mca/coll/cuda/coll_cuda_scan.c

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
#include "ompi/op/op.h"
2020
#include "opal/datatype/opal_convertor.h"
21-
#include "opal/cuda/common_cuda.h"
2221

2322
/*
2423
* scan
@@ -40,23 +39,29 @@ int mca_coll_cuda_scan(const void *sbuf, void *rbuf, int count,
4039
int rc;
4140

4241
bufsize = opal_datatype_span(&dtype->super, count, &gap);
43-
44-
if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
42+
rc = mca_coll_cuda_check_buf((void *)sbuf);
43+
if (rc < 0) {
44+
return rc;
45+
}
46+
if ((MPI_IN_PLACE != sbuf) && (rc > 0)) {
4547
sbuf1 = (char*)malloc(bufsize);
4648
if (NULL == sbuf1) {
4749
return OMPI_ERR_OUT_OF_RESOURCE;
4850
}
49-
opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize);
51+
mca_coll_cuda_memcpy(sbuf1, sbuf, bufsize);
5052
sbuf = sbuf1 - gap;
5153
}
52-
53-
if (opal_cuda_check_bufs(rbuf, NULL)) {
54+
rc = mca_coll_cuda_check_buf(rbuf);
55+
if (rc < 0) {
56+
return rc;
57+
}
58+
if (rc > 0) {
5459
rbuf1 = (char*)malloc(bufsize);
5560
if (NULL == rbuf1) {
5661
if (NULL != sbuf1) free(sbuf1);
5762
return OMPI_ERR_OUT_OF_RESOURCE;
5863
}
59-
opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize);
64+
mca_coll_cuda_memcpy(rbuf1, rbuf, bufsize);
6065
rbuf2 = rbuf; /* save away original buffer */
6166
rbuf = rbuf1 - gap;
6267
}
@@ -67,7 +72,7 @@ int mca_coll_cuda_scan(const void *sbuf, void *rbuf, int count,
6772
}
6873
if (NULL != rbuf1) {
6974
rbuf = rbuf2;
70-
opal_cuda_memcpy_sync(rbuf, rbuf1, bufsize);
75+
mca_coll_cuda_memcpy(rbuf, rbuf1, bufsize);
7176
free(rbuf1);
7277
}
7378
return rc;

ompi/mca/coll/libnbc/nbc_ialltoall.c

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ static int nbc_alltoall_init(const void* sendbuf, int sendcount, MPI_Datatype se
7171
void *tmpbuf = NULL;
7272
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
7373
ptrdiff_t span, gap = 0;
74+
uint64_t flags;
75+
int is_accel_buf1;
76+
int is_accel_buf2;
77+
int dev_id;
7478

7579
NBC_IN_PLACE(sendbuf, recvbuf, inplace);
7680

@@ -145,11 +149,15 @@ static int nbc_alltoall_init(const void* sendbuf, int sendcount, MPI_Datatype se
145149
}
146150

147151
/* phase 1 - rotate n data blocks upwards into the tmpbuffer */
148-
#if OPAL_CUDA_SUPPORT
149-
if (NBC_Type_intrinsic(sendtype) && !(opal_cuda_check_bufs((char *)sendbuf, (char *)recvbuf))) {
150-
#else
151-
if (NBC_Type_intrinsic(sendtype)) {
152-
#endif /* OPAL_CUDA_SUPPORT */
152+
is_accel_buf1 = opal_accelerator.check_addr(sendbuf, &dev_id, &flags);
153+
is_accel_buf2 = opal_accelerator.check_addr(recvbuf, &dev_id, &flags);
154+
if (is_accel_buf1 < 0) {
155+
return is_accel_buf1;
156+
} else if (is_accel_buf2 < 0) {
157+
return is_accel_buf2;
158+
}
159+
if (NBC_Type_intrinsic(sendtype) &&
160+
is_accel_buf1 == 0 && is_accel_buf2 == 0) {
153161
/* contiguous - just copy (1st copy) */
154162
memcpy (tmpbuf, (char *) sendbuf + datasize * rank, datasize * (p - rank));
155163
if (rank != 0) {

0 commit comments

Comments
 (0)