Skip to content

Commit 9fb2d4e

Browse files
authored
Merge pull request #13094 from hjelmn/more_btl_uct_updates_that_fix_various_issues_discovered_on_our_hardware
More btl uct updates that fix various issues discovered on our hardware
2 parents a477b22 + 2a128b1 commit 9fb2d4e

File tree

4 files changed

+51
-43
lines changed

4 files changed

+51
-43
lines changed

opal/mca/btl/uct/btl_uct_endpoint.c

Lines changed: 16 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ static void mca_btl_uct_endpoint_destruct(mca_btl_uct_endpoint_t *endpoint)
3838
}
3939

4040
OBJ_DESTRUCT(&endpoint->ep_lock);
41+
if (endpoint->conn_ep) {
42+
OBJ_RELEASE(endpoint->conn_ep);
43+
}
4144
}
4245

4346
OBJ_CLASS_INSTANCE(mca_btl_uct_endpoint_t, opal_object_t, mca_btl_uct_endpoint_construct,
@@ -206,7 +209,6 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl,
206209
mca_btl_uct_conn_req_t *request,
207210
size_t request_length)
208211
{
209-
mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep;
210212
mca_btl_uct_conn_completion_t completion
211213
= {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete}, .complete = false};
212214
ucs_status_t ucs_status;
@@ -215,15 +217,13 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl,
215217
("sending connection request to peer. context id: %d, type: %d, length: %" PRIsize_t,
216218
request->context_id, request->type, request_length));
217219

218-
OBJ_RETAIN(endpoint->conn_ep);
219-
220220
/* need to drop the lock to avoid hold-and-wait */
221221
opal_mutex_unlock(&endpoint->ep_lock);
222222

223223
do {
224224
MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, {
225-
ucs_status = uct_ep_am_short(conn_ep->uct_ep, MCA_BTL_UCT_CONNECT_RDMA, request->type,
226-
request, request_length);
225+
ucs_status = uct_ep_am_short(endpoint->conn_ep->uct_ep, MCA_BTL_UCT_CONNECT_RDMA,
226+
request->type, request, request_length);
227227
});
228228
if (OPAL_LIKELY(UCS_OK == ucs_status)) {
229229
break;
@@ -238,11 +238,11 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl,
238238
} while (1);
239239

240240
/* for now we just wait for the connection request to complete before continuing */
241-
ucs_status = uct_ep_flush(conn_ep->uct_ep, 0, &completion.super);
241+
ucs_status = uct_ep_flush(endpoint->conn_ep->uct_ep, 0, &completion.super);
242242
if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status) {
243243
/* NTH: I don't know if this path is needed. For some networks we must use a completion. */
244244
do {
245-
ucs_status = uct_ep_flush(conn_ep->uct_ep, 0, NULL);
245+
ucs_status = uct_ep_flush(endpoint->conn_ep->uct_ep, 0, NULL);
246246
mca_btl_uct_context_progress(conn_tl_context);
247247
} while (UCS_INPROGRESS == ucs_status);
248248
} else {
@@ -253,8 +253,6 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl,
253253

254254
opal_mutex_lock(&endpoint->ep_lock);
255255

256-
OBJ_RELEASE(endpoint->conn_ep);
257-
258256
return OPAL_SUCCESS;
259257
}
260258

@@ -265,7 +263,6 @@ static int mca_btl_uct_endpoint_send_connection_data(
265263
{
266264
mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl;
267265
mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0];
268-
mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep;
269266
uct_device_addr_t *device_addr = NULL;
270267
uct_iface_addr_t *iface_addr;
271268
ucs_status_t ucs_status;
@@ -274,31 +271,33 @@ static int mca_btl_uct_endpoint_send_connection_data(
274271

275272
BTL_VERBOSE(("connecting endpoint to remote endpoint"));
276273

277-
if (NULL == conn_ep) {
274+
if (NULL == endpoint->conn_ep) {
278275
BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p",
279276
opal_process_name_print(endpoint->ep_proc->proc_name)));
280277

281278
iface_addr = (uct_iface_addr_t *) conn_tl_data;
282279
device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data
283280
+ MCA_BTL_UCT_TL_ATTR(conn_tl, 0).iface_addr_len);
284281

285-
endpoint->conn_ep = conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t);
286-
if (OPAL_UNLIKELY(NULL == conn_ep)) {
282+
endpoint->conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t);
283+
if (OPAL_UNLIKELY(NULL == endpoint->conn_ep)) {
287284
return OPAL_ERR_OUT_OF_RESOURCE;
288285
}
289286

290287
/* create a temporary endpoint for setting up the rdma endpoint */
291288
MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, {
292289
ucs_status = mca_btl_uct_ep_create_connected_compat(conn_tl_context->uct_iface,
293290
device_addr, iface_addr,
294-
&conn_ep->uct_ep);
291+
&endpoint->conn_ep->uct_ep);
295292
});
296293
if (UCS_OK != ucs_status) {
297294
BTL_VERBOSE(
298295
("could not create an endpoint for forming connection to remote peer. code = %d",
299296
ucs_status));
300297
return OPAL_ERROR;
301298
}
299+
} else {
300+
OBJ_RETAIN(endpoint->conn_ep);
302301
}
303302

304303
size_t request_length = sizeof(mca_btl_uct_conn_req_t)
@@ -368,6 +367,9 @@ static int mca_btl_uct_endpoint_connect_endpoint(
368367
if (UCS_OK != ucs_status) {
369368
return OPAL_ERROR;
370369
}
370+
371+
mca_btl_uct_endpoint_set_flag(uct_btl, endpoint, tl_context->context_id, tl_endpoint,
372+
MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED);
371373
}
372374

373375
opal_timer_t now = opal_timer_base_get_usec();
@@ -391,7 +393,6 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp
391393
mca_btl_uct_device_context_t *tl_context
392394
= mca_btl_uct_module_get_tl_context_specific(uct_btl, tl, context_id);
393395
uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data;
394-
mca_btl_uct_connection_ep_t *conn_ep = NULL;
395396
mca_btl_uct_modex_t *modex;
396397
uint8_t *modex_data;
397398
size_t msg_size;
@@ -474,19 +475,8 @@ int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endp
474475

475476
} while (0);
476477

477-
/* to avoid a possible hold-and wait deadlock. destroy the endpoint after dropping the endpoint
478-
* lock. */
479-
if (endpoint->conn_ep && 1 == endpoint->conn_ep->super.obj_reference_count) {
480-
conn_ep = endpoint->conn_ep;
481-
endpoint->conn_ep = NULL;
482-
}
483-
484478
opal_mutex_unlock(&endpoint->ep_lock);
485479

486-
if (conn_ep) {
487-
OBJ_RELEASE(conn_ep);
488-
}
489-
490480
BTL_VERBOSE(("endpoint%s ready for use", (OPAL_ERR_OUT_OF_RESOURCE != rc) ? "" : " not yet"));
491481

492482
return rc;

opal/mca/btl/uct/btl_uct_endpoint.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* reserved.
1515
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
1616
* All Rights reserved.
17+
* Copyright (c) 2025 Google, LLC. All rights reserved.
1718
* $COPYRIGHT$
1819
*
1920
* Additional copyrights may follow
@@ -106,5 +107,29 @@ static inline int mca_btl_uct_endpoint_check_am(mca_btl_uct_module_t *module,
106107
module->am_tl->tl_index);
107108
}
108109

110+
// Requires that the endpoint lock is held.
111+
static inline void mca_btl_uct_endpoint_set_flag(mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint,
112+
int context_id, mca_btl_uct_tl_endpoint_t *tl_endpoint, int32_t flag) {
113+
opal_atomic_wmb();
114+
int32_t flag_value = opal_atomic_or_fetch_32(&tl_endpoint->flags, flag);
115+
if ((flag_value & (MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED | MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY)) ==
116+
(MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED | MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY)) {
117+
opal_atomic_fetch_or_32(&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY);
118+
119+
opal_atomic_wmb();
120+
121+
mca_btl_uct_base_frag_t *frag;
122+
OPAL_LIST_FOREACH (frag, &module->pending_frags, mca_btl_uct_base_frag_t) {
123+
if (frag->context->context_id == context_id && endpoint == frag->endpoint) {
124+
frag->ready = true;
125+
}
126+
}
127+
128+
if (endpoint->conn_ep) {
129+
OBJ_RELEASE(endpoint->conn_ep);
130+
}
131+
}
132+
}
133+
109134
END_C_DECLS
110135
#endif

opal/mca/btl/uct/btl_uct_tl.c

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* and Technology (RIST). All rights reserved.
77
* Copyright (c) 2018 Triad National Security, LLC. All rights
88
* reserved.
9-
* Copyright (c) 2019 Google, LLC. All rights reserved.
9+
* Copyright (c) 2019-2025 Google, LLC. All rights reserved.
1010
* $COPYRIGHT$
1111
*
1212
* Additional copyrights may follow
@@ -199,9 +199,6 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module,
199199
int32_t ep_flags;
200200
int rc;
201201

202-
BTL_VERBOSE(("got connection request for endpoint %p. type = %d. context id = %d",
203-
(void *) endpoint, req->type, req->context_id));
204-
205202
if (NULL == endpoint) {
206203
BTL_ERROR(("could not create endpoint for connection request"));
207204
return UCS_ERR_UNREACHABLE;
@@ -211,6 +208,9 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module,
211208

212209
ep_flags = opal_atomic_fetch_or_32(&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC);
213210

211+
BTL_VERBOSE(("got connection request for endpoint %p. type = %d. context id = %d. ep_flags = %x",
212+
(void *) endpoint, req->type, req->context_id, ep_flags));
213+
214214
if (!(ep_flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC)) {
215215
/* create any necessary resources */
216216
rc = mca_btl_uct_endpoint_connect(module, endpoint, req->context_id, req->ep_addr,
@@ -225,22 +225,13 @@ int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module,
225225
* message. this might be overkill but there is little documentation at the UCT level on when
226226
* an endpoint can be used. */
227227
if (req->type == 1) {
228-
/* remote side is ready */
229-
mca_btl_uct_base_frag_t *frag;
230-
228+
/* remote side is connected */
231229
/* to avoid a race with send adding pending frags grab the lock here */
232230
OPAL_THREAD_SCOPED_LOCK(&endpoint->ep_lock, {
233231
BTL_VERBOSE(("connection ready. sending %" PRIsize_t " frags",
234232
opal_list_get_size(&module->pending_frags)));
235-
(void) opal_atomic_or_fetch_32(&tl_endpoint->flags,
236-
MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY);
237-
opal_atomic_wmb();
238-
239-
OPAL_LIST_FOREACH (frag, &module->pending_frags, mca_btl_uct_base_frag_t) {
240-
if (frag->context->context_id == req->context_id && endpoint == frag->endpoint) {
241-
frag->ready = true;
242-
}
243-
}
233+
mca_btl_uct_endpoint_set_flag(module, endpoint, req->context_id, tl_endpoint,
234+
MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY);
244235
});
245236
}
246237

opal/mca/btl/uct/btl_uct_types.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,10 @@ struct mca_btl_uct_base_frag_t;
2727
# define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC 0x1
2828
/** remote endpoint read */
2929
# define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY 0x2
30+
/** local UCT endpoint connected */
31+
# define MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED 0x4
3032
/** connection was established */
31-
# define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY 0x4
33+
# define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY 0x8
3234

3335
/* AM tags */
3436
/** BTL fragment */

0 commit comments

Comments
 (0)