Skip to content

Commit 8bc1f3d

Browse files
committed
Don't allow any asynchronous CUDA operations.
There are two reasons for this:
- Pending CUDA events are not progressed by this BTL, so any operation that becomes asynchronous will never be completed.
- We use the packed data in the shared-memory backing file, and that buffer is returned to the peer process upon return (thus, if we copy asynchronously, we might not copy the right data).

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
1 parent 0e32b0a commit 8bc1f3d

File tree

2 files changed

+38
-32
lines changed

2 files changed

+38
-32
lines changed

opal/mca/btl/smcuda/btl_smcuda.c

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1197,17 +1197,16 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b
11971197
if (endpoint->ipcstate != IPC_INIT) {
11981198
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
11991199
return;
1200-
} else {
1201-
endpoint->ipctries++;
1202-
if (endpoint->ipctries > MAXTRIES) {
1203-
endpoint->ipcstate = IPC_BAD;
1204-
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
1205-
return;
1206-
}
1207-
/* All is good. Set up state and continue. */
1208-
endpoint->ipcstate = IPC_SENT;
1200+
}
1201+
endpoint->ipctries++;
1202+
if (endpoint->ipctries > MAXTRIES) {
1203+
endpoint->ipcstate = IPC_BAD;
12091204
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
1205+
return;
12101206
}
1207+
/* All is good. Set up state and continue. */
1208+
endpoint->ipcstate = IPC_SENT;
1209+
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
12111210

12121211
if ( mca_btl_smcuda_component.num_outstanding_frags * 2 > (int) mca_btl_smcuda_component.fifo_size ) {
12131212
mca_btl_smcuda_component_progress();

opal/mca/btl/smcuda/btl_smcuda_component.c

Lines changed: 30 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,15 @@ static inline unsigned int mca_btl_smcuda_param_register_uint(
136136
return *storage;
137137
}
138138

139-
static int mca_btl_smcuda_component_verify(void) {
140-
139+
static int mca_btl_smcuda_component_verify(void)
140+
{
141+
/* We cannot support async memcpy right now */
142+
if( (mca_btl_smcuda.super.btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV) ||
143+
(mca_btl_smcuda.super.btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND) ) {
144+
opal_output_verbose(10, opal_btl_base_framework.framework_output,
145+
"btl: smcuda: disable all asynchronous memcpy support");
146+
}
147+
mca_btl_smcuda.super.btl_flags &= ~(MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV | MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND);
141148
return mca_btl_base_param_verify(&mca_btl_smcuda.super);
142149
}
143150

@@ -1100,27 +1107,27 @@ int mca_btl_smcuda_component_progress(void)
11001107
}
11011108
goto recheck_peer;
11021109
}
1103-
default:
1104-
/* unknown */
1105-
/*
1106-
* This code path should presumably never be called.
1107-
* It's unclear if it should exist or, if so, how it should be written.
1108-
* If we want to return it to the sending process,
1109-
* we have to figure out who the sender is.
1110-
* It seems we need to subtract the mask bits.
1111-
* Then, hopefully this is an sm header that has an smp_rank field.
1112-
* Presumably that means the received header was relative.
1113-
* Or, maybe this code should just be removed.
1114-
*/
1115-
opal_output(0, "mca_btl_smcuda_component_progress read an unknown type of header");
1116-
hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
1117-
peer_smp_rank = hdr->my_smp_rank;
1118-
hdr = (mca_btl_smcuda_hdr_t*)((uintptr_t)hdr->frag |
1119-
MCA_BTL_SMCUDA_FRAG_STATUS_MASK);
1120-
MCA_BTL_SMCUDA_FIFO_WRITE(
1121-
mca_btl_smcuda_component.sm_peers[peer_smp_rank],
1122-
my_smp_rank, peer_smp_rank, hdr, false, true, rc);
1123-
break;
1110+
default:
1111+
/* unknown */
1112+
/*
1113+
* This code path should presumably never be called.
1114+
* It's unclear if it should exist or, if so, how it should be written.
1115+
* If we want to return it to the sending process,
1116+
* we have to figure out who the sender is.
1117+
* It seems we need to subtract the mask bits.
1118+
* Then, hopefully this is an sm header that has an smp_rank field.
1119+
* Presumably that means the received header was relative.
1120+
* Or, maybe this code should just be removed.
1121+
*/
1122+
opal_output(0, "mca_btl_smcuda_component_progress read an unknown type of header");
1123+
hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
1124+
peer_smp_rank = hdr->my_smp_rank;
1125+
hdr = (mca_btl_smcuda_hdr_t*)((uintptr_t)hdr->frag |
1126+
MCA_BTL_SMCUDA_FRAG_STATUS_MASK);
1127+
MCA_BTL_SMCUDA_FIFO_WRITE(
1128+
mca_btl_smcuda_component.sm_peers[peer_smp_rank],
1129+
my_smp_rank, peer_smp_rank, hdr, false, true, rc);
1130+
break;
11241131
}
11251132
}
11261133
(void)rc; /* this is safe to ignore as the message is requeued till success */

0 commit comments

Comments
 (0)