Skip to content

Commit bcb52ec

Browse files
xinzhao3 authored and artpol84 committed
opal/common/ucx: add winfo ptr into req
Signed-off-by: Artem Polyakov <artpol84@gmail.com>
1 parent 3351742 commit bcb52ec

File tree

7 files changed

+104
-65
lines changed

7 files changed

+104
-65
lines changed

opal/mca/common/ucx/Makefile.am

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -16,15 +16,13 @@ headers = \
1616
common_ucx.h \
1717
common_ucx_int.h \
1818
common_ucx_wpool.h \
19-
common_ucx_wpool_int.h \
20-
common_ucx_request.h
19+
common_ucx_wpool_int.h
2120

2221
# Source files
2322

2423
sources = \
2524
common_ucx.c \
26-
common_ucx_wpool.c \
27-
common_ucx_request.c
25+
common_ucx_wpool.c
2826

2927
# Help file
3028

opal/mca/common/ucx/common_ucx.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,5 @@
1515

1616
#include "common_ucx_int.h"
1717
#include "common_ucx_wpool.h"
18-
#include "common_ucx_request.h"
1918

2019
#endif

opal/mca/common/ucx/common_ucx_int.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22
#define COMMON_UCX_INT_H
33

44
#include "opal_config.h"
5-
#include "common_ucx_request.h"
65

76
#include <stdint.h>
87

opal/mca/common/ucx/common_ucx_request.c

Lines changed: 0 additions & 17 deletions
This file was deleted.

opal/mca/common/ucx/common_ucx_request.h

Lines changed: 0 additions & 17 deletions
This file was deleted.

opal/mca/common/ucx/common_ucx_wpool.c

Lines changed: 32 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -80,8 +80,8 @@ static void
8080
_winfo_reset(opal_common_ucx_winfo_t *winfo)
8181
{
8282
if (winfo->inflight_req != UCS_OK) {
83-
opal_common_ucx_wait_request(winfo->inflight_req, winfo->worker,
84-
"opal_common_ucx_flush");
83+
opal_common_ucx_wait_request_mt(winfo->inflight_req,
84+
"opal_common_ucx_flush");
8585
winfo->inflight_req = UCS_OK;
8686
}
8787

@@ -1228,23 +1228,25 @@ opal_common_ucx_tlocal_fetch_spath(opal_common_ucx_wpmem_t *mem, int target)
12281228
}
12291229

12301230
OPAL_DECLSPEC int
1231-
opal_common_ucx_flush(ucp_ep_h ep, ucp_worker_h worker,
1232-
opal_common_ucx_flush_type_t type,
1233-
opal_common_ucx_flush_scope_t scope,
1234-
ucs_status_ptr_t *req_ptr)
1231+
opal_common_ucx_winfo_flush(opal_common_ucx_winfo_t *winfo, int target,
1232+
opal_common_ucx_flush_type_t type,
1233+
opal_common_ucx_flush_scope_t scope,
1234+
ucs_status_ptr_t *req_ptr)
12351235
{
12361236
ucs_status_ptr_t req;
12371237
ucs_status_t status = UCS_OK;
12381238
int rc = OPAL_SUCCESS;
12391239

12401240
#if HAVE_DECL_UCP_EP_FLUSH_NB
12411241
if (scope == OPAL_COMMON_UCX_SCOPE_EP) {
1242-
req = ucp_ep_flush_nb(ep, 0, opal_common_ucx_empty_complete_cb);
1242+
req = ucp_ep_flush_nb(winfo->endpoints[target], 0, opal_common_ucx_empty_complete_cb);
12431243
} else {
1244-
req = ucp_worker_flush_nb(worker, 0, opal_common_ucx_empty_complete_cb);
1244+
req = ucp_worker_flush_nb(winfo->worker, 0, opal_common_ucx_empty_complete_cb);
12451245
}
1246+
((opal_common_ucx_request_t *)req)->winfo = winfo;
1247+
12461248
if(OPAL_COMMON_UCX_FLUSH_B) {
1247-
rc = opal_common_ucx_wait_request(req, worker, "ucp_ep_flush_nb");
1249+
rc = opal_common_ucx_wait_request_mt(req, "ucp_ep_flush_nb");
12481250
} else {
12491251
*req_ptr = req;
12501252
}
@@ -1254,9 +1256,9 @@ opal_common_ucx_flush(ucp_ep_h ep, ucp_worker_h worker,
12541256
case OPAL_COMMON_UCX_FLUSH_NB_PREFERRED:
12551257
case OPAL_COMMON_UCX_FLUSH_B:
12561258
if (scope == OPAL_COMMON_UCX_SCOPE_EP) {
1257-
status = ucp_ep_flush(ep);
1259+
status = ucp_ep_flush(winfo->endpoints[target]);
12581260
} else {
1259-
status = ucp_worker_flush(worker);
1261+
status = ucp_worker_flush(winfo->worker);
12601262
}
12611263
rc = (status == UCS_OK) ? OPAL_SUCCESS : OPAL_ERROR;
12621264
case OPAL_COMMON_UCX_FLUSH_NB:
@@ -1287,9 +1289,8 @@ opal_common_ucx_wpmem_flush(opal_common_ucx_wpmem_t *mem,
12871289
continue;
12881290
}
12891291
opal_mutex_lock(&item->ptr->mutex);
1290-
rc = opal_common_ucx_flush(item->ptr->endpoints[target],
1291-
item->ptr->worker, OPAL_COMMON_UCX_FLUSH_B,
1292-
scope, NULL);
1292+
rc = opal_common_ucx_winfo_flush(item->ptr, target, OPAL_COMMON_UCX_FLUSH_B,
1293+
scope, NULL);
12931294
switch (scope) {
12941295
case OPAL_COMMON_UCX_SCOPE_WORKER:
12951296
item->ptr->global_inflight_ops = 0;
@@ -1323,3 +1324,20 @@ opal_common_ucx_wpmem_fence(opal_common_ucx_wpmem_t *mem) {
13231324
/* TODO */
13241325
return OPAL_SUCCESS;
13251326
}
1327+
1328+
OPAL_DECLSPEC void
1329+
opal_common_ucx_req_init(void *request) {
1330+
opal_common_ucx_request_t *req = (opal_common_ucx_request_t *)request;
1331+
req->ext_req = NULL;
1332+
req->ext_cb = NULL;
1333+
req->winfo = NULL;
1334+
}
1335+
1336+
OPAL_DECLSPEC void
1337+
opal_common_ucx_req_completion(void *request, ucs_status_t status) {
1338+
opal_common_ucx_request_t *req = (opal_common_ucx_request_t *)request;
1339+
if (req->ext_cb != NULL) {
1340+
(*req->ext_cb)(req->ext_req);
1341+
}
1342+
ucp_request_release(req);
1343+
}

opal/mca/common/ucx/common_ucx_wpool.h

Lines changed: 70 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
#include "opal_config.h"
66

77
#include "common_ucx_int.h"
8-
#include "common_ucx_request.h"
98
#include <stdint.h>
109
#include <string.h>
1110

@@ -79,7 +78,7 @@ typedef struct {
7978
pthread_key_t mem_tls_key;
8079
} opal_common_ucx_wpmem_t;
8180

82-
typedef struct {
81+
typedef struct opal_common_ucx_winfo {
8382
opal_recursive_mutex_t mutex;
8483
volatile int released;
8584
ucp_worker_h worker;
@@ -95,6 +94,14 @@ typedef struct {
9594
ucp_rkey_h *rkeys;
9695
} opal_common_ucx_tlocal_fast_ptrs_t;
9796

97+
typedef void (*opal_common_ucx_user_req_handler_t)(void *request);
98+
99+
typedef struct {
100+
void *ext_req;
101+
opal_common_ucx_user_req_handler_t ext_cb;
102+
opal_common_ucx_winfo_t *winfo;
103+
} opal_common_ucx_request_t;
104+
98105
typedef enum {
99106
OPAL_COMMON_UCX_PUT,
100107
OPAL_COMMON_UCX_GET
@@ -198,6 +205,10 @@ OPAL_DECLSPEC int opal_common_ucx_wpctx_create(opal_common_ucx_wpool_t *wpool, i
198205
opal_common_ucx_ctx_t **ctx_ptr);
199206
OPAL_DECLSPEC void opal_common_ucx_wpctx_release(opal_common_ucx_ctx_t *ctx);
200207

208+
/* request init / completion */
209+
OPAL_DECLSPEC void opal_common_ucx_req_init(void *request);
210+
OPAL_DECLSPEC void opal_common_ucx_req_completion(void *request, ucs_status_t status);
211+
201212
/* Managing thread local storage */
202213
OPAL_DECLSPEC int opal_common_ucx_tlocal_fetch_spath(opal_common_ucx_wpmem_t *mem, int target);
203214
static inline int
@@ -246,10 +257,57 @@ OPAL_DECLSPEC int opal_common_ucx_wpmem_flush(opal_common_ucx_wpmem_t *mem,
246257
int target);
247258
OPAL_DECLSPEC int opal_common_ucx_wpmem_fence(opal_common_ucx_wpmem_t *mem);
248259

249-
OPAL_DECLSPEC int opal_common_ucx_flush(ucp_ep_h ep, ucp_worker_h worker,
250-
opal_common_ucx_flush_type_t type,
251-
opal_common_ucx_flush_scope_t scope,
252-
ucs_status_ptr_t *req_ptr);
260+
OPAL_DECLSPEC int opal_common_ucx_winfo_flush(opal_common_ucx_winfo_t *winfo, int target,
261+
opal_common_ucx_flush_type_t type,
262+
opal_common_ucx_flush_scope_t scope,
263+
ucs_status_ptr_t *req_ptr);
264+
265+
static inline
266+
int opal_common_ucx_wait_request_mt(ucs_status_ptr_t request, const char *msg)
267+
{
268+
ucs_status_t status;
269+
int ctr = 0, ret = 0;
270+
opal_common_ucx_winfo_t *winfo;
271+
272+
/* check for request completed or failed */
273+
if (OPAL_LIKELY(UCS_OK == request)) {
274+
return OPAL_SUCCESS;
275+
} else if (OPAL_UNLIKELY(UCS_PTR_IS_ERR(request))) {
276+
MCA_COMMON_UCX_VERBOSE(1, "%s failed: %d, %s", msg ? msg : __func__,
277+
UCS_PTR_STATUS(request),
278+
ucs_status_string(UCS_PTR_STATUS(request)));
279+
return OPAL_ERROR;
280+
}
281+
282+
winfo = ((opal_common_ucx_request_t *)request)->winfo;
283+
assert(winfo != NULL);
284+
285+
do {
286+
ctr = opal_common_ucx.progress_iterations;
287+
opal_mutex_lock(&winfo->mutex);
288+
do {
289+
ret = ucp_worker_progress(winfo->worker);
290+
status = opal_common_ucx_request_status(request);
291+
if (status != UCS_INPROGRESS) {
292+
ucp_request_free(request);
293+
if (OPAL_UNLIKELY(UCS_OK != status)) {
294+
MCA_COMMON_UCX_VERBOSE(1, "%s failed: %d, %s",
295+
msg ? msg : __func__,
296+
UCS_PTR_STATUS(request),
297+
ucs_status_string(UCS_PTR_STATUS(request)));
298+
opal_mutex_unlock(&winfo->mutex);
299+
return OPAL_ERROR;
300+
}
301+
break;
302+
}
303+
ctr--;
304+
} while (ctr > 0 && ret > 0 && status == UCS_INPROGRESS);
305+
opal_mutex_unlock(&winfo->mutex);
306+
opal_progress();
307+
} while (status == UCS_INPROGRESS);
308+
309+
return OPAL_SUCCESS;
310+
}
253311

254312
static inline int _periodical_flush_nb(opal_common_ucx_wpmem_t *mem,
255313
opal_common_ucx_winfo_t *winfo,
@@ -264,8 +322,8 @@ static inline int _periodical_flush_nb(opal_common_ucx_wpmem_t *mem,
264322
opal_common_ucx_flush_scope_t scope;
265323

266324
if (winfo->inflight_req != UCS_OK) {
267-
rc = opal_common_ucx_wait_request(winfo->inflight_req, winfo->worker,
268-
"opal_common_ucx_flush_nb");
325+
rc = opal_common_ucx_wait_request_mt(winfo->inflight_req,
326+
"opal_common_ucx_flush_nb");
269327
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)){
270328
MCA_COMMON_UCX_VERBOSE(1, "opal_common_ucx_wait_request failed: %d", rc);
271329
return rc;
@@ -283,13 +341,13 @@ static inline int _periodical_flush_nb(opal_common_ucx_wpmem_t *mem,
283341
winfo->inflight_ops[target] = 0;
284342
}
285343

286-
rc = opal_common_ucx_flush(winfo->endpoints[target], winfo->worker,
287-
OPAL_COMMON_UCX_FLUSH_NB_PREFERRED, scope,
288-
&winfo->inflight_req);
344+
rc = opal_common_ucx_winfo_flush(winfo, target, OPAL_COMMON_UCX_FLUSH_NB_PREFERRED,
345+
scope, &winfo->inflight_req);
289346
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)){
290347
MCA_COMMON_UCX_VERBOSE(1, "opal_common_ucx_flush failed: %d", rc);
291348
return rc;
292349
}
350+
((opal_common_ucx_request_t *)winfo->inflight_req)->winfo = winfo;
293351
} else if (OPAL_UNLIKELY(winfo->inflight_req != UCS_OK)) {
294352
int ret;
295353
do {
@@ -510,6 +568,7 @@ opal_common_ucx_wpmem_fetch_nb(opal_common_ucx_wpmem_t *mem,
510568
if (UCS_PTR_IS_PTR(req)) {
511569
req->ext_req = user_req_ptr;
512570
req->ext_cb = user_req_cb;
571+
req->winfo = winfo;
513572
} else {
514573
if (user_req_cb != NULL) {
515574
(*user_req_cb)(user_req_ptr);

0 commit comments

Comments
 (0)