Skip to content

Commit e07a64c

Browse files
committed
btl/uct: fix some issues when using UCX over ugni
Though it is not a recommended configuration, it is possible to use Open MPI over UCX over uGNI. This configuration had some issues related to connection management and transport layer (tl) selection. This commit fixes those issues. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
1 parent fccb3e7 commit e07a64c

File tree

3 files changed

+36
-7
lines changed

3 files changed

+36
-7
lines changed

opal/mca/btl/uct/btl_uct_component.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
* Copyright (c) 2018 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
18+
* Copyright (c) 2018 Triad National Security, LLC. All rights
19+
* reserved.
1820
* $COPYRIGHT$
1921
*
2022
* Additional copyrights may follow
@@ -53,7 +55,7 @@ static int mca_btl_uct_component_register(void)
5355
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
5456
&mca_btl_uct_component.memory_domains);
5557

56-
mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,ud,any";
58+
mca_btl_uct_component.allowed_transports = "dc_mlx5,rc_mlx5,ud,ugni_rdma,ugni_smsg,any";
5759
(void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version,
5860
"transports", "Comma-delimited list of transports to use sorted by increasing "
5961
"priority. The list of transports available can be queried using ucx_info. Special"

opal/mca/btl/uct/btl_uct_endpoint.c

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
/*
33
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
44
* reserved.
5+
* Copyright (c) 2018 Triad National Security, LLC. All rights
6+
* reserved.
57
* $COPYRIGHT$
68
*
79
* Additional copyrights may follow
@@ -137,11 +139,26 @@ static void mca_btl_uct_connection_ep_destruct (mca_btl_uct_connection_ep_t *ep)
137139
OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_connection_ep_construct,
138140
mca_btl_uct_connection_ep_destruct);
139141

142+
struct mca_btl_uct_conn_completion_t {
143+
uct_completion_t super;
144+
volatile bool complete;
145+
};
146+
typedef struct mca_btl_uct_conn_completion_t mca_btl_uct_conn_completion_t;
147+
148+
static void mca_btl_uct_endpoint_flush_complete (uct_completion_t *self, ucs_status_t status)
149+
{
150+
mca_btl_uct_conn_completion_t *completion = (mca_btl_uct_conn_completion_t *) self;
151+
BTL_VERBOSE(("connection flush complete"));
152+
completion->complete = true;
153+
}
154+
140155
static int mca_btl_uct_endpoint_send_conn_req (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint,
141156
mca_btl_uct_device_context_t *conn_tl_context,
142157
mca_btl_uct_conn_req_t *request, size_t request_length)
143158
{
144159
mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep;
160+
mca_btl_uct_conn_completion_t completion = {.super = {.count = 1, .func = mca_btl_uct_endpoint_flush_complete},
161+
.complete = false};
145162
ucs_status_t ucs_status;
146163

147164
BTL_VERBOSE(("sending connection request to peer. context id: %d, type: %d, length: %" PRIsize_t,
@@ -170,10 +187,18 @@ static int mca_btl_uct_endpoint_send_conn_req (mca_btl_uct_module_t *uct_btl, mc
170187
} while (1);
171188

172189
/* for now we just wait for the connection request to complete before continuing */
173-
do {
174-
ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, NULL);
175-
mca_btl_uct_context_progress (conn_tl_context);
176-
} while (UCS_INPROGRESS == ucs_status);
190+
ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, &completion.super);
191+
if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status) {
192+
/* NTH: I don't know if this path is needed. For some networks we must use a completion. */
193+
do {
194+
ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, NULL);
195+
mca_btl_uct_context_progress (conn_tl_context);
196+
} while (UCS_INPROGRESS == ucs_status);
197+
} else {
198+
do {
199+
mca_btl_uct_context_progress (conn_tl_context);
200+
} while (!completion.complete);
201+
}
177202

178203
opal_mutex_lock (&endpoint->ep_lock);
179204

@@ -284,8 +309,8 @@ int mca_btl_uct_endpoint_connect (mca_btl_uct_module_t *uct_btl, mca_btl_uct_end
284309
void *ep_addr, int tl_index)
285310
{
286311
mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[context_id] + tl_index;
287-
mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_rdma_context_specific (uct_btl, context_id);
288312
mca_btl_uct_tl_t *tl = (tl_index == uct_btl->rdma_tl->tl_index) ? uct_btl->rdma_tl : uct_btl->am_tl;
313+
mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_tl_context_specific (uct_btl, tl, context_id);
289314
uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data;
290315
mca_btl_uct_connection_ep_t *conn_ep = NULL;
291316
mca_btl_uct_modex_t *modex;

opal/mca/btl/uct/btl_uct_tl.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
* reserved.
55
* Copyright (c) 2018 Research Organization for Information Science
66
* and Technology (RIST). All rights reserved.
7+
* Copyright (c) 2018 Triad National Security, LLC. All rights
8+
* reserved.
79
* $COPYRIGHT$
810
*
911
* Additional copyrights may follow
@@ -26,7 +28,7 @@
2628
* @brief Convert UCT capabilities to BTL flags
2729
*/
2830
static uint64_t mca_btl_uct_cap_to_btl_flag[][2] = {
29-
{UCT_IFACE_FLAG_AM_ZCOPY, MCA_BTL_FLAGS_SEND},
31+
{UCT_IFACE_FLAG_AM_SHORT, MCA_BTL_FLAGS_SEND},
3032
{UCT_IFACE_FLAG_PUT_ZCOPY, MCA_BTL_FLAGS_PUT},
3133
{UCT_IFACE_FLAG_GET_ZCOPY, MCA_BTL_FLAGS_GET},
3234
{0,0},

0 commit comments

Comments
 (0)