Skip to content

Commit 1139d9e

Browse files
authored
Merge pull request #7931 from bosilca/fix/7928
Fix the BTL API conversion for the SMCUDA BTL
2 parents 7702dfc + 8bc1f3d commit 1139d9e

File tree

6 files changed

+66
-58
lines changed

6 files changed

+66
-58
lines changed

ompi/mca/pml/ob1/pml_ob1_recvfrag.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -624,10 +624,10 @@ void mca_pml_ob1_recv_frag_callback_frag (mca_btl_base_module_t *btl,
624624
assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV);
625625

626626
/* This will trigger the opal_convertor_pack to start asynchronous copy. */
627-
mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,descriptor->des_segment_count,des);
627+
mca_pml_ob1_recv_request_frag_copy_start(recvreq, btl, segments, descriptor->des_segment_count, NULL);
628628

629629
/* Let BTL know that it CANNOT free the frag */
630-
descriptor->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
630+
//TODO: GB: descriptor->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
631631

632632
return;
633633
}

ompi/mca/pml/ob1/pml_ob1_recvreq.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -577,7 +577,7 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq
577577
*/
578578
void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvreq,
579579
mca_btl_base_module_t* btl,
580-
mca_btl_base_segment_t* segments,
580+
const mca_btl_base_segment_t* segments,
581581
size_t num_segments,
582582
mca_btl_base_descriptor_t* des)
583583
{

ompi/mca/pml/ob1/pml_ob1_recvreq.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ void mca_pml_ob1_recv_request_progress_frag(
332332
void mca_pml_ob1_recv_request_frag_copy_start(
333333
mca_pml_ob1_recv_request_t* req,
334334
struct mca_btl_base_module_t* btl,
335-
mca_btl_base_segment_t* segments,
335+
const mca_btl_base_segment_t* segments,
336336
size_t num_segments,
337337
mca_btl_base_descriptor_t* des);
338338

opal/mca/btl/smcuda/btl_smcuda.c

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1197,17 +1197,16 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b
11971197
if (endpoint->ipcstate != IPC_INIT) {
11981198
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
11991199
return;
1200-
} else {
1201-
endpoint->ipctries++;
1202-
if (endpoint->ipctries > MAXTRIES) {
1203-
endpoint->ipcstate = IPC_BAD;
1204-
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
1205-
return;
1206-
}
1207-
/* All is good. Set up state and continue. */
1208-
endpoint->ipcstate = IPC_SENT;
1200+
}
1201+
endpoint->ipctries++;
1202+
if (endpoint->ipctries > MAXTRIES) {
1203+
endpoint->ipcstate = IPC_BAD;
12091204
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
1205+
return;
12101206
}
1207+
/* All is good. Set up state and continue. */
1208+
endpoint->ipcstate = IPC_SENT;
1209+
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
12111210

12121211
if ( mca_btl_smcuda_component.num_outstanding_frags * 2 > (int) mca_btl_smcuda_component.fifo_size ) {
12131212
mca_btl_smcuda_component_progress();

opal/mca/btl/smcuda/btl_smcuda_component.c

Lines changed: 50 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2011 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2009 The University of Tennessee and The University
6+
* Copyright (c) 2004-2020 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -42,6 +42,7 @@
4242
#include <sys/stat.h> /* for mkfifo */
4343
#endif /* HAVE_SYS_STAT_H */
4444

45+
#include "opal/mca/hwloc/base/base.h"
4546
#include "opal/mca/shmem/base/base.h"
4647
#include "opal/mca/shmem/shmem.h"
4748
#include "opal/util/bit_ops.h"
@@ -135,8 +136,15 @@ static inline unsigned int mca_btl_smcuda_param_register_uint(
135136
return *storage;
136137
}
137138

138-
static int mca_btl_smcuda_component_verify(void) {
139-
139+
static int mca_btl_smcuda_component_verify(void)
140+
{
141+
/* We cannot support async memcpy right now */
142+
if( (mca_btl_smcuda.super.btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV) ||
143+
(mca_btl_smcuda.super.btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND) ) {
144+
opal_output_verbose(10, opal_btl_base_framework.framework_output,
145+
"btl: smcuda: disable all asynchronous memcpy support");
146+
}
147+
mca_btl_smcuda.super.btl_flags &= ~(MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV | MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND);
140148
return mca_btl_base_param_verify(&mca_btl_smcuda.super);
141149
}
142150

@@ -679,20 +687,15 @@ static void mca_btl_smcuda_send_cuda_ipc_ack(struct mca_btl_base_module_t* btl,
679687
* BTL. It handles smcuda specific control messages that are triggered
680688
* when GPU memory transfers are initiated. */
681689
static void btl_smcuda_control(mca_btl_base_module_t* btl,
682-
mca_btl_base_tag_t tag,
683-
mca_btl_base_descriptor_t* des, void* cbdata)
690+
const mca_btl_base_receive_descriptor_t *descriptor)
684691
{
685692
int mydevnum, ipcaccess, res;
686693
ctrlhdr_t ctrlhdr;
687694
opal_proc_t *ep_proc;
688-
struct mca_btl_base_endpoint_t *endpoint;
689695
mca_btl_smcuda_t *smcuda_btl = (mca_btl_smcuda_t *)btl;
690-
mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des;
691-
mca_btl_base_segment_t* segments = des->des_segments;
696+
const mca_btl_base_segment_t* segments = descriptor->des_segments;
697+
struct mca_btl_base_endpoint_t *endpoint = descriptor->endpoint;
692698

693-
/* Use the rank of the peer that sent the data to get to the endpoint
694-
* structure. This is needed for PML callback. */
695-
endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank];
696699
ep_proc = endpoint->proc_opal;
697700

698701
/* Copy out control message payload to examine it */
@@ -764,7 +767,6 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl,
764767
}
765768
}
766769

767-
assert(endpoint->peer_smp_rank == frag->hdr->my_smp_rank);
768770
opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
769771
"Analyzed CUDA IPC request: myrank=%d, mydev=%d, peerrank=%d, "
770772
"peerdev=%d --> ACCESS=%d",
@@ -872,6 +874,13 @@ mca_btl_smcuda_component_init(int *num_btls,
872874
* shared-memory segment. this routine sets component sm_max_procs. */
873875
calc_sm_max_procs(num_local_procs);
874876

877+
/* Before we can safely create the backend file we need to know minimal
878+
* information about the local node. We need at least a size of a cache line
879+
* as we align the data in the backing file to it. The simplest way for now is
880+
* to force the HWLOC initialization.
881+
*/
882+
opal_hwloc_base_get_topology();
883+
875884
/* This is where the modex will live some day. For now, just have local rank
876885
* 0 create a rendezvous file containing the backing store info, so the
877886
* other local procs can read from it during add_procs. The rest will just
@@ -999,7 +1008,6 @@ int mca_btl_smcuda_component_progress(void)
9991008
/* local variables */
10001009
mca_btl_base_segment_t seg;
10011010
mca_btl_smcuda_frag_t *frag;
1002-
mca_btl_smcuda_frag_t Frag;
10031011
sm_fifo_t *fifo = NULL;
10041012
mca_btl_smcuda_hdr_t *hdr;
10051013
int my_smp_rank = mca_btl_smcuda_component.my_smp_rank;
@@ -1046,7 +1054,6 @@ int mca_btl_smcuda_component_progress(void)
10461054
switch(((uintptr_t)hdr) & MCA_BTL_SMCUDA_FRAG_TYPE_MASK) {
10471055
case MCA_BTL_SMCUDA_FRAG_SEND:
10481056
{
1049-
mca_btl_active_message_callback_t* reg;
10501057
/* change the address from address relative to the shared
10511058
* memory address, to a true virtual address */
10521059
hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
@@ -1058,17 +1065,16 @@ int mca_btl_smcuda_component_progress(void)
10581065
my_smp_rank, peer_smp_rank, j, FIFO_MAP(peer_smp_rank));
10591066
}
10601067
#endif
1061-
/* recv upcall */
1062-
reg = mca_btl_base_active_message_trigger + hdr->tag;
10631068
seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_smcuda_hdr_t);
10641069
seg.seg_len = hdr->len;
1065-
Frag.base.des_segment_count = 1;
1066-
Frag.base.des_segments = &seg;
1067-
#if OPAL_CUDA_SUPPORT
1068-
Frag.hdr = hdr; /* needed for peer rank in control messages */
1069-
#endif /* OPAL_CUDA_SUPPORT */
1070-
reg->cbfunc(&mca_btl_smcuda.super, hdr->tag, &(Frag.base),
1071-
reg->cbdata);
1070+
1071+
mca_btl_active_message_callback_t *reg = mca_btl_base_active_message_trigger + hdr->tag;
1072+
mca_btl_base_receive_descriptor_t recv_desc = {.endpoint = mca_btl_smcuda_component.sm_peers[peer_smp_rank],
1073+
.des_segments = &seg,
1074+
.des_segment_count = 1,
1075+
.tag = hdr->tag,
1076+
.cbdata = reg->cbdata};
1077+
reg->cbfunc(&mca_btl_smcuda.super, &recv_desc);
10721078
/* return the fragment */
10731079
MCA_BTL_SMCUDA_FIFO_WRITE(
10741080
mca_btl_smcuda_component.sm_peers[peer_smp_rank],
@@ -1101,27 +1107,27 @@ int mca_btl_smcuda_component_progress(void)
11011107
}
11021108
goto recheck_peer;
11031109
}
1104-
default:
1105-
/* unknown */
1106-
/*
1107-
* This code path should presumably never be called.
1108-
* It's unclear if it should exist or, if so, how it should be written.
1109-
* If we want to return it to the sending process,
1110-
* we have to figure out who the sender is.
1111-
* It seems we need to subtract the mask bits.
1112-
* Then, hopefully this is an sm header that has an smp_rank field.
1113-
* Presumably that means the received header was relative.
1114-
* Or, maybe this code should just be removed.
1115-
*/
1116-
opal_output(0, "mca_btl_smcuda_component_progress read an unknown type of header");
1117-
hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
1118-
peer_smp_rank = hdr->my_smp_rank;
1119-
hdr = (mca_btl_smcuda_hdr_t*)((uintptr_t)hdr->frag |
1120-
MCA_BTL_SMCUDA_FRAG_STATUS_MASK);
1121-
MCA_BTL_SMCUDA_FIFO_WRITE(
1122-
mca_btl_smcuda_component.sm_peers[peer_smp_rank],
1123-
my_smp_rank, peer_smp_rank, hdr, false, true, rc);
1124-
break;
1110+
default:
1111+
/* unknown */
1112+
/*
1113+
* This code path should presumably never be called.
1114+
* It's unclear if it should exist or, if so, how it should be written.
1115+
* If we want to return it to the sending process,
1116+
* we have to figure out who the sender is.
1117+
* It seems we need to subtract the mask bits.
1118+
* Then, hopefully this is an sm header that has an smp_rank field.
1119+
* Presumably that means the received header was relative.
1120+
* Or, maybe this code should just be removed.
1121+
*/
1122+
opal_output(0, "mca_btl_smcuda_component_progress read an unknown type of header");
1123+
hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
1124+
peer_smp_rank = hdr->my_smp_rank;
1125+
hdr = (mca_btl_smcuda_hdr_t*)((uintptr_t)hdr->frag |
1126+
MCA_BTL_SMCUDA_FRAG_STATUS_MASK);
1127+
MCA_BTL_SMCUDA_FIFO_WRITE(
1128+
mca_btl_smcuda_component.sm_peers[peer_smp_rank],
1129+
my_smp_rank, peer_smp_rank, hdr, false, true, rc);
1130+
break;
11251131
}
11261132
}
11271133
(void)rc; /* this is safe to ignore as the message is requeued till success */

opal/mca/common/cuda/common_cuda.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ static int cuda_event_ipc_first_avail, cuda_event_dtoh_first_avail, cuda_event_h
182182
static int cuda_event_ipc_first_used, cuda_event_dtoh_first_used, cuda_event_htod_first_used;
183183

184184
/* Number of status items currently in use */
185-
static int cuda_event_ipc_num_used, cuda_event_dtoh_num_used, cuda_event_htod_num_used;
185+
static volatile int cuda_event_ipc_num_used, cuda_event_dtoh_num_used, cuda_event_htod_num_used;
186186

187187
/* Size of array holding events */
188188
int cuda_event_max = 400;
@@ -1507,6 +1507,9 @@ void *mca_common_cuda_get_htod_stream(void) {
15071507
int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t **frag) {
15081508
CUresult result;
15091509

1510+
if( OPAL_LIKELY(0 == cuda_event_ipc_num_used) )
1511+
return 0;
1512+
15101513
OPAL_THREAD_LOCK(&common_cuda_ipc_lock);
15111514
if (cuda_event_ipc_num_used > 0) {
15121515
opal_output_verbose(20, mca_common_cuda_output,

0 commit comments

Comments
 (0)