Commit d50bc22

Author: Sergey Oblomov (committed)
SPML/UCX: added strong sync for fence
- added infra for strong sync for fence calls
- added [non]blocking get and flush implementations to strong fence
- refactored the strong sync parameters: use the global config instead of the per-context config

Signed-off-by: Sergey Oblomov <sergeyo@nvidia.com>
(cherry picked from commit 78185f9)
1 parent edccf46 commit d50bc22
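
Overview (added for illustration, not part of the commit): the sketch below is a small, self-contained C model of the control flow this commit introduces; all names and helpers in it are simplified stand-ins rather than the real SPML/UCX API. Peers that received puts since the last synchronization are tracked, and on fence/quiet each tracked peer is synchronized according to the globally configured mode: non-blocking get, blocking get, or endpoint flush. It mirrors the new mca_spml_ucx_strong_sync() in the spml_ucx.c diff below, which both mca_spml_ucx_fence() and mca_spml_ucx_quiet() now call.

/* Illustrative sketch only: models the dispatch this commit adds, using
 * simplified stand-in types and helpers instead of the real SPML/UCX API. */
#include <stdio.h>

enum {
    STRONG_ORDERING_NONE  = 0, /* no strong ordering */
    STRONG_ORDERING_GETNB = 1, /* non-blocking read from the peer */
    STRONG_ORDERING_GET   = 2, /* blocking read from the peer */
    STRONG_ORDERING_FLUSH = 3  /* flush the endpoint to the peer */
};

/* Global configuration, analogous to mca_spml_ucx.strong_sync. */
static int strong_sync_mode = STRONG_ORDERING_FLUSH;

/* Stand-ins for the real operations (mca_spml_ucx_get_nb/get, ucp_ep_flush_*). */
static int peer_get_nb(int peer)   { printf("non-blocking get from PE %d\n", peer); return 0; }
static int peer_get(int peer)      { printf("blocking get from PE %d\n", peer);     return 0; }
static int peer_ep_flush(int peer) { printf("endpoint flush to PE %d\n", peer);     return 0; }

/* Synchronize one tracked peer according to the configured mode. */
static int strong_sync_peer(int peer)
{
    switch (strong_sync_mode) {
    case STRONG_ORDERING_NONE:   /* quiet path: keeps the old get_nb behaviour */
    case STRONG_ORDERING_GETNB:  return peer_get_nb(peer);
    case STRONG_ORDERING_GET:    return peer_get(peer);
    case STRONG_ORDERING_FLUSH:  return peer_ep_flush(peer);
    default:                     return 0; /* unknown mode: treated as success */
    }
}

int main(void)
{
    /* Peers that received puts since the last sync; the real code tracks
     * them with put_proc_indexes[] plus a bitmap. */
    int tracked_peers[] = { 1, 3 };
    unsigned i;

    for (i = 0; i < sizeof(tracked_peers) / sizeof(tracked_peers[0]); i++) {
        if (strong_sync_peer(tracked_peers[i]) != 0) {
            return 1; /* the real code aborts the job on failure */
        }
    }
    return 0;
}

Per the mode comments added in spml_ucx.h, the intent is that a read from, or a flush of, the peer's endpoint ensures previously issued puts to that peer have completed remotely, which provides the stronger ordering.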

4 files changed (+108, -25 lines)

config/ompi_check_ucx.m4

Lines changed: 2 additions & 1 deletion

@@ -123,7 +123,8 @@ AC_DEFUN([OMPI_CHECK_UCX],[
                        [#include <ucp/api/ucp.h>])
        AC_CHECK_DECLS([ucp_ep_flush_nb, ucp_worker_flush_nb,
                        ucp_request_check_status, ucp_put_nb, ucp_get_nb,
-                       ucp_put_nbx, ucp_get_nbx, ucp_atomic_op_nbx],
+                       ucp_put_nbx, ucp_get_nbx, ucp_atomic_op_nbx,
+                       ucp_ep_flush_nbx],
                       [], [],
                       [#include <ucp/api/ucp.h>])
        AC_CHECK_DECLS([ucm_test_events,

oshmem/mca/spml/ucx/spml_ucx.c

Lines changed: 78 additions & 21 deletions

@@ -78,14 +78,15 @@ mca_spml_ucx_t mca_spml_ucx = {
     .num_disconnect = 1,
     .heap_reg_nb = 0,
     .enabled = 0,
-    .get_mkey_slow = NULL
+    .get_mkey_slow = NULL,
+    .synchronized_quiet = false,
+    .strong_sync = SPML_UCX_STRONG_ORDERING_NONE
 };
 
 mca_spml_ucx_ctx_t mca_spml_ucx_ctx_default = {
     .ucp_worker = NULL,
     .ucp_peers = NULL,
-    .options = 0,
-    .synchronized_quiet = false
+    .options = 0
 };
 
 #if HAVE_DECL_UCP_ATOMIC_OP_NBX

@@ -404,7 +405,7 @@ int mca_spml_ucx_init_put_op_mask(mca_spml_ucx_ctx_t *ctx, size_t nprocs)
 {
     int res;
 
-    if (mca_spml_ucx.synchronized_quiet) {
+    if (mca_spml_ucx_is_strong_ordering()) {
         ctx->put_proc_indexes = malloc(nprocs * sizeof(*ctx->put_proc_indexes));
         if (NULL == ctx->put_proc_indexes) {
             return OSHMEM_ERR_OUT_OF_RESOURCE;

@@ -426,7 +427,7 @@ int mca_spml_ucx_init_put_op_mask(mca_spml_ucx_ctx_t *ctx, size_t nprocs)
 
 int mca_spml_ucx_clear_put_op_mask(mca_spml_ucx_ctx_t *ctx)
 {
-    if (mca_spml_ucx.synchronized_quiet && ctx->put_proc_indexes) {
+    if (mca_spml_ucx_is_strong_ordering() && ctx->put_proc_indexes) {
         OBJ_DESTRUCT(&ctx->put_op_bitmap);
         free(ctx->put_proc_indexes);
     }

@@ -843,7 +844,6 @@ static int mca_spml_ucx_ctx_create_common(long options, mca_spml_ucx_ctx_t **ucx
     ucx_ctx->options = options;
     ucx_ctx->ucp_worker = calloc(1, sizeof(ucp_worker_h));
     ucx_ctx->ucp_workers = 1;
-    ucx_ctx->synchronized_quiet = mca_spml_ucx_ctx_default.synchronized_quiet;
 
     params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE;
     if (oshmem_mpi_thread_provided == SHMEM_THREAD_SINGLE || options & SHMEM_CTX_PRIVATE || options & SHMEM_CTX_SERIALIZED) {

@@ -1178,14 +1178,81 @@ int mca_spml_ucx_put_nb_wprogress(shmem_ctx_t ctx, void* dst_addr, size_t size,
     return ucx_status_to_oshmem_nb(status);
 }
 
+static int mca_spml_ucx_strong_sync(shmem_ctx_t ctx)
+{
+    mca_spml_ucx_ctx_t *ucx_ctx = (mca_spml_ucx_ctx_t *)ctx;
+    ucs_status_ptr_t request;
+    static int flush_get_data;
+    unsigned i;
+    int ret;
+    int idx;
+#if !(HAVE_DECL_UCP_EP_FLUSH_NBX || HAVE_DECL_UCP_EP_FLUSH_NB)
+    ucs_status_t status;
+#endif
+
+    for (i = 0; i < ucx_ctx->put_proc_count; i++) {
+        idx = ucx_ctx->put_proc_indexes[i];
+
+        switch (mca_spml_ucx.strong_sync) {
+        case SPML_UCX_STRONG_ORDERING_NONE:
+        case SPML_UCX_STRONG_ORDERING_GETNB:
+            ret = mca_spml_ucx_get_nb(ctx,
+                                      ucx_ctx->ucp_peers[idx].mkeys[SPML_UCX_SERVICE_SEG]->super.super.va_base,
+                                      sizeof(flush_get_data), &flush_get_data, idx, NULL);
+            break;
+        case SPML_UCX_STRONG_ORDERING_GET:
+            ret = mca_spml_ucx_get(ctx,
+                                   ucx_ctx->ucp_peers[idx].mkeys[SPML_UCX_SERVICE_SEG]->super.super.va_base,
+                                   sizeof(flush_get_data), &flush_get_data, idx);
+            break;
+#if HAVE_DECL_UCP_EP_FLUSH_NBX
+        case SPML_UCX_STRONG_ORDERING_FLUSH:
+            request = ucp_ep_flush_nbx(ucx_ctx->ucp_peers[idx].ucp_conn,
+                                       &mca_spml_ucx_request_param_b);
+            ret = opal_common_ucx_wait_request(request, ucx_ctx->ucp_worker[0], "ucp_flush_nbx");
+#elif HAVE_DECL_UCP_EP_FLUSH_NB
+            request = ucp_ep_flush_nb(ucx_ctx->ucp_peers[idx].ucp_conn, 0, opal_common_ucx_empty_complete_cb);
+            ret = opal_common_ucx_wait_request(request, ucx_ctx->ucp_worker[0], "ucp_flush_nb");
+#else
+            status = ucp_ep_flush(ucx_ctx->ucp_peers[idx].ucp_conn);
+            ret = (status == UCS_OK) ? OPAL_SUCCESS : OPAL_ERROR;
+#endif
+            break;
+        default:
+            /* unknown mode */
+            ret = OMPI_SUCCESS;
+            break;
+        }
+
+        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+            oshmem_shmem_abort(-1);
+            return ret;
+        }
+
+        opal_bitmap_clear_bit(&ucx_ctx->put_op_bitmap, idx);
+    }
+
+    ucx_ctx->put_proc_count = 0;
+    return OSHMEM_SUCCESS;
+}
+
 int mca_spml_ucx_fence(shmem_ctx_t ctx)
 {
+    mca_spml_ucx_ctx_t *ucx_ctx = (mca_spml_ucx_ctx_t *)ctx;
     ucs_status_t err;
+    int ret;
     unsigned int i = 0;
-    mca_spml_ucx_ctx_t *ucx_ctx = (mca_spml_ucx_ctx_t *)ctx;
 
     opal_atomic_wmb();
 
+    if (mca_spml_ucx.strong_sync != SPML_UCX_STRONG_ORDERING_NONE) {
+        ret = mca_spml_ucx_strong_sync(ctx);
+        if (ret != OSHMEM_SUCCESS) {
+            oshmem_shmem_abort(-1);
+            return ret;
+        }
+    }
+
     for (i=0; i < ucx_ctx->ucp_workers; i++) {
         if (ucx_ctx->ucp_worker[i] != NULL) {
             err = ucp_worker_fence(ucx_ctx->ucp_worker[i]);

@@ -1201,26 +1268,16 @@ int mca_spml_ucx_fence(shmem_ctx_t ctx)
 
 int mca_spml_ucx_quiet(shmem_ctx_t ctx)
 {
-    int flush_get_data;
     int ret;
     unsigned i;
-    int idx;
     mca_spml_ucx_ctx_t *ucx_ctx = (mca_spml_ucx_ctx_t *)ctx;
 
     if (mca_spml_ucx.synchronized_quiet) {
-        for (i = 0; i < ucx_ctx->put_proc_count; i++) {
-            idx = ucx_ctx->put_proc_indexes[i];
-            ret = mca_spml_ucx_get_nb(ctx,
-                                      ucx_ctx->ucp_peers[idx].mkeys[SPML_UCX_SERVICE_SEG]->super.super.va_base,
-                                      sizeof(flush_get_data), &flush_get_data, idx, NULL);
-            if (OMPI_SUCCESS != ret) {
-                oshmem_shmem_abort(-1);
-                return ret;
-            }
-
-            opal_bitmap_clear_bit(&ucx_ctx->put_op_bitmap, idx);
+        ret = mca_spml_ucx_strong_sync(ctx);
+        if (ret != OSHMEM_SUCCESS) {
+            oshmem_shmem_abort(-1);
+            return ret;
         }
-        ucx_ctx->put_proc_count = 0;
     }
 
     opal_atomic_wmb();

oshmem/mca/spml/ucx/spml_ucx.h

Lines changed: 15 additions & 2 deletions

@@ -48,6 +48,13 @@ BEGIN_C_DECLS
 #define SPML_UCX_TRANSP_CNT 1
 #define SPML_UCX_SERVICE_SEG 0
 
+enum {
+    SPML_UCX_STRONG_ORDERING_NONE = 0,  /* don't use strong ordering */
+    SPML_UCX_STRONG_ORDERING_GETNB = 1, /* use non-blocking read to provide ordering */
+    SPML_UCX_STRONG_ORDERING_GET = 2,   /* use blocking read to provide ordering */
+    SPML_UCX_STRONG_ORDERING_FLUSH = 3  /* flush EP to provide ordering */
+};
+
 /**
  * UCX SPML module
  */

@@ -79,7 +86,6 @@ struct mca_spml_ucx_ctx {
     unsigned int ucp_workers;
     int *put_proc_indexes;
     unsigned put_proc_count;
-    bool synchronized_quiet;
 };
 typedef struct mca_spml_ucx_ctx mca_spml_ucx_ctx_t;
 

@@ -115,6 +121,7 @@ struct mca_spml_ucx {
     pthread_spinlock_t async_lock;
     int aux_refcnt;
     bool synchronized_quiet;
+    int strong_sync;
     unsigned long nb_progress_thresh_global;
     unsigned long nb_put_progress_thresh;
     unsigned long nb_get_progress_thresh;

@@ -294,9 +301,15 @@ static inline int ucx_status_to_oshmem_nb(ucs_status_t status)
 #endif
 }
 
+static inline int mca_spml_ucx_is_strong_ordering(void)
+{
+    return (mca_spml_ucx.strong_sync != SPML_UCX_STRONG_ORDERING_NONE) ||
+           mca_spml_ucx.synchronized_quiet;
+}
+
 static inline void mca_spml_ucx_remote_op_posted(mca_spml_ucx_ctx_t *ctx, int dst)
 {
-    if (OPAL_UNLIKELY(ctx->synchronized_quiet)) {
+    if (OPAL_UNLIKELY(mca_spml_ucx_is_strong_ordering())) {
         if (!opal_bitmap_is_set_bit(&ctx->put_op_bitmap, dst)) {
             ctx->put_proc_indexes[ctx->put_proc_count++] = dst;
             opal_bitmap_set_bit(&ctx->put_op_bitmap, dst);

oshmem/mca/spml/ucx/spml_ucx_component.c

Lines changed: 13 additions & 1 deletion

@@ -159,7 +159,12 @@ static int mca_spml_ucx_component_register(void)
 
     mca_spml_ucx_param_register_bool("synchronized_quiet", 0,
                                      "Use synchronized quiet on shmem_quiet or shmem_barrier_all operations",
-                                     &mca_spml_ucx_ctx_default.synchronized_quiet);
+                                     &mca_spml_ucx.synchronized_quiet);
+
+    mca_spml_ucx_param_register_int("strong_sync", 0,
+                                    "Use strong synchronization on shmem_quiet, shmem_fence or shmem_barrier_all operations: "
+                                    "0 - don't do strong synchronization, 1 - use non blocking get, 2 - use blocking get, 3 - use flush operation",
+                                    &mca_spml_ucx.strong_sync);
 
     mca_spml_ucx_param_register_ulong("nb_progress_thresh_global", 0,
                                       "Number of nb_put or nb_get operations before ucx progress is triggered. Disabled by default (0). Setting this value will override nb_put/get_progress_thresh.",

@@ -384,6 +389,13 @@ mca_spml_ucx_component_init(int* priority,
         return NULL ;
 
     SPML_UCX_VERBOSE(50, "*** ucx initialized ****");
+
+    if ((mca_spml_ucx.strong_sync < SPML_UCX_STRONG_ORDERING_NONE) ||
+        (mca_spml_ucx.strong_sync > SPML_UCX_STRONG_ORDERING_FLUSH)) {
+        SPML_UCX_ERROR("incorrect value of strong_sync parameter: %d",
+                       mca_spml_ucx.strong_sync);
+    }
+
     return &mca_spml_ucx.super;
 }
 
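Usage note (an assumption, not part of the diff): with Open MPI's usual framework_component_param naming for MCA variables, the knob registered above would presumably surface as spml_ucx_strong_sync and be selectable at launch time, e.g. "mpirun --mca spml_ucx_strong_sync 3 ..." or the equivalent oshrun invocation, where 0 keeps the previous behaviour, 1 and 2 issue a non-blocking or blocking get per synced peer, and 3 flushes the endpoint, as described in the parameter help string above.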
